LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
// Select a generic COPY. The interesting case is a copy whose destination is
// a VCC (lane-mask) value: an SCC source just constrains the destination, a
// non-VCC boolean source must be materialized as a lane mask (mask bit 0 and
// compare against zero per lane). Plain copies only constrain their virtual
// register operands.
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      // Copy from SCC: keep the COPY, just constrain the destination class.
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        // Constant bool source: emit an all-ones or all-zeros wave mask
        // directly.
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          // True16 path: 16-bit AND with 1, then 16-bit compare-not-equal
          // against zero (the extra immediates are VOP3 modifier operands).
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
            .addImm(1)
            .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          // Lane mask = (masked value != 0) per lane.
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    // VCC-to-VCC copy: only constrain the destination if possible.
    const TargetRegisterClass *RC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  // Ordinary copy: constrain every virtual operand that has a usable
  // constrained register class; physical operands are left alone.
  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}
226
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
246
247 Register DstReg = I.getOperand(0).getReg();
248 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
249
250 I.eraseFromParent();
251 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
252}
253
254bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
255 const DebugLoc &DL = I.getDebugLoc();
256 MachineBasicBlock *BB = I.getParent();
257
258 Register DstReg = I.getOperand(0).getReg();
259 Register SrcReg = I.getOperand(1).getReg();
260 std::optional<ValueAndVReg> Arg =
261 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
262
263 if (Arg) {
264 const int64_t Value = Arg->Value.getZExtValue();
265 if (Value == 0) {
266 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
267 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
268 } else {
269 assert(Value == 1);
270 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
271 }
272 I.eraseFromParent();
273 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
274 }
275
276 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
277 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
278
279 unsigned SelectOpcode =
280 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
281 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
282 .addReg(TRI.getExec())
283 .addImm(0);
284
285 I.eraseFromParent();
287 return true;
288}
289
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302 return true;
303}
304
305bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
306 const Register DefReg = I.getOperand(0).getReg();
307 const LLT DefTy = MRI->getType(DefReg);
308
309 // S1 G_PHIs should not be selected in instruction-select, instead:
310 // - divergent S1 G_PHI should go through lane mask merging algorithm
311 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
312 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
313 if (DefTy == LLT::scalar(1))
314 return false;
315
316 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
317
318 const RegClassOrRegBank &RegClassOrBank =
319 MRI->getRegClassOrRegBank(DefReg);
320
321 const TargetRegisterClass *DefRC =
323 if (!DefRC) {
324 if (!DefTy.isValid()) {
325 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
326 return false;
327 }
328
329 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
330 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
331 if (!DefRC) {
332 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
333 return false;
334 }
335 }
336
337 // If inputs have register bank, assign corresponding reg class.
338 // Note: registers don't need to have the same reg bank.
339 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
340 const Register SrcReg = I.getOperand(i).getReg();
341
342 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
343 if (RB) {
344 const LLT SrcTy = MRI->getType(SrcReg);
345 const TargetRegisterClass *SrcRC =
346 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
347 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
348 return false;
349 }
350 }
351
352 I.setDesc(TII.get(TargetOpcode::PHI));
353 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
354}
355
357AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
358 const TargetRegisterClass &SubRC,
359 unsigned SubIdx) const {
360
361 MachineInstr *MI = MO.getParent();
362 MachineBasicBlock *BB = MO.getParent()->getParent();
363 Register DstReg = MRI->createVirtualRegister(&SubRC);
364
365 if (MO.isReg()) {
366 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
367 Register Reg = MO.getReg();
368 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
369 .addReg(Reg, {}, ComposedSubIdx);
370
371 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
372 MO.isKill(), MO.isDead(), MO.isUndef(),
373 MO.isEarlyClobber(), 0, MO.isDebug(),
374 MO.isInternalRead());
375 }
376
377 assert(MO.isImm());
378
379 APInt Imm(64, MO.getImm());
380
381 switch (SubIdx) {
382 default:
383 llvm_unreachable("do not know to split immediate with this sub index.");
384 case AMDGPU::sub0:
385 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
386 case AMDGPU::sub1:
387 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
388 }
389}
390
391static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
392 switch (Opc) {
393 case AMDGPU::G_AND:
394 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
395 case AMDGPU::G_OR:
396 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
397 case AMDGPU::G_XOR:
398 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
399 default:
400 llvm_unreachable("not a bit op");
401 }
402}
403
404bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
405 Register DstReg = I.getOperand(0).getReg();
406 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
407
408 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
409 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
410 DstRB->getID() != AMDGPU::VCCRegBankID)
411 return false;
412
413 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
414 STI.isWave64());
415 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
416
417 // Dead implicit-def of scc
418 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
419 true, // isImp
420 false, // isKill
421 true)); // isDead
422 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
423 return true;
424}
425
// Select scalar G_ADD/G_SUB. 32-bit values map directly onto the scalar or
// vector add/sub (with or without an explicit carry-out, depending on the
// subtarget); 64-bit adds are decomposed into lo/hi 32-bit halves chained
// through the carry and reassembled with REG_SEQUENCE. Vector types fail.
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      // Scalar 32-bit add/sub; the scc def is marked dead.
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
      return true;
    }

    if (STI.hasAddNoCarryInsts()) {
      // Carry-less VALU form: mutate in place, appending the trailing
      // immediate operand and an implicit EXEC use.
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      constrainSelectedInstRegOperands(I, TII, TRI, RBI);
      return true;
    }

    // Fallback: carry-out form with a fresh, dead carry register.
    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    return true;
  }

  assert(!Sub && "illegal sub should not reach here");

  // 64-bit add: pick 64/32-bit classes for the result and its halves.
  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    // Low add sets scc; the high S_ADDC_U32 consumes it as carry-in.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    // VALU path: explicit carry register between the two halves.
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
  }

  // Reassemble the 64-bit result from the two halves.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}
530
// Select G_UADDO/G_USUBO/G_UADDE/G_USUBE. If the carry-out is a VCC value,
// the instruction is mutated in place into the VALU carry(-out/-in) form;
// otherwise the scalar forms are emitted with the carries routed through
// SCC copies.
bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg(); // arithmetic result
  Register Dst1Reg = I.getOperand(1).getReg(); // carry/borrow out
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    // VALU path: mutate in place, appending an implicit EXEC use and the
    // trailing immediate operand.
    unsigned NoCarryOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    return true;
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    // Feed the incoming carry into SCC for the S_ADDC/S_SUBB below.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    // The carry-out is consumed: copy SCC into it and give it a 32-bit
    // scalar class if it doesn't have one yet.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}
591
// Select G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 to the V_MAD_*64_*32
// family. When the subtarget has the carry-less (NC) variant and the carry
// output is unused, the carry def is dropped and the NC opcode is used; the
// gfx11 intra-forwarding workaround opcode takes precedence when present.
bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
                    MRI->use_nodbg_empty(I.getOperand(1).getReg());

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else if (UseNoCarry)
    Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
                     : AMDGPU::V_MAD_NC_I64_I32_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;

  if (UseNoCarry)
    I.removeOperand(1); // Drop the unused carry-out def.

  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  // The 64-bit result may not overlap the 32-bit sources.
  I.getOperand(0).setIsEarlyClobber(true);
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return true;
}
620
621// TODO: We should probably legalize these to only using 32-bit results.
622bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
623 MachineBasicBlock *BB = I.getParent();
624 Register DstReg = I.getOperand(0).getReg();
625 Register SrcReg = I.getOperand(1).getReg();
626 LLT DstTy = MRI->getType(DstReg);
627 LLT SrcTy = MRI->getType(SrcReg);
628 const unsigned SrcSize = SrcTy.getSizeInBits();
629 unsigned DstSize = DstTy.getSizeInBits();
630
631 // TODO: Should handle any multiple of 32 offset.
632 unsigned Offset = I.getOperand(2).getImm();
633 if (Offset % 32 != 0 || DstSize > 128)
634 return false;
635
636 // 16-bit operations really use 32-bit registers.
637 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
638 if (DstSize == 16)
639 DstSize = 32;
640
641 const TargetRegisterClass *DstRC =
642 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
643 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
644 return false;
645
646 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
647 const TargetRegisterClass *SrcRC =
648 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
649 if (!SrcRC)
650 return false;
651 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
652 DstSize / 32);
653 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
654 if (!SrcRC)
655 return false;
656
657 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
658 *SrcRC, I.getOperand(1));
659 const DebugLoc &DL = I.getDebugLoc();
660 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
661 .addReg(SrcReg, {}, SubReg);
662
663 I.eraseFromParent();
664 return true;
665}
666
667bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
668 MachineBasicBlock *BB = MI.getParent();
669 Register DstReg = MI.getOperand(0).getReg();
670 LLT DstTy = MRI->getType(DstReg);
671 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
672
673 const unsigned SrcSize = SrcTy.getSizeInBits();
674 if (SrcSize < 32)
675 return selectImpl(MI, *CoverageInfo);
676
677 const DebugLoc &DL = MI.getDebugLoc();
678 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
679 const unsigned DstSize = DstTy.getSizeInBits();
680 const TargetRegisterClass *DstRC =
681 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
682 if (!DstRC)
683 return false;
684
685 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
686 MachineInstrBuilder MIB =
687 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
688 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
689 MachineOperand &Src = MI.getOperand(I + 1);
690 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
691 MIB.addImm(SubRegs[I]);
692
693 const TargetRegisterClass *SrcRC
694 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
695 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
696 return false;
697 }
698
699 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
700 return false;
701
702 MI.eraseFromParent();
703 return true;
704}
705
// Select G_UNMERGE_VALUES by copying each destination out of the source via
// subregister indices, tightening the source register class as each index
// is used.
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  // The source is the trailing operand, after all the defs.
  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, {}, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}
750
// Select G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Wide-element build_vectors
// are handled as merges; the v2s16 case tries, in order: folding two
// constant sources into a single move, the imported TableGen patterns, a
// (x, undef) -> copy simplification, and finally a manual pack
// (V_AND + V_LSHL_OR on VALU, or the S_PACK_* family on SALU).
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      // Pack the two 16-bit constants into one 32-bit immediate.
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef) -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    // VALU fallback: mask the low half, then shift/or in the high half.
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
      return true;
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  // Mutate in place into the chosen S_PACK_* form.
  MI.setDesc(TII.get(Opc));
  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}
899
900bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
901 const MachineOperand &MO = I.getOperand(0);
902
903 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
904 // regbank check here is to know why getConstrainedRegClassForOperand failed.
905 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
906 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
907 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
908 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
909 return true;
910 }
911
912 return false;
913}
914
// Select G_INSERT as an INSERT_SUBREG when the inserted value is 32-bit
// aligned, a multiple of 32 bits wide, and at most 128 bits.
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();  // value being inserted into
  Register Src1Reg = I.getOperand(2).getReg();  // value being inserted
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}
973
974bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
975 Register DstReg = MI.getOperand(0).getReg();
976 Register SrcReg = MI.getOperand(1).getReg();
977 Register OffsetReg = MI.getOperand(2).getReg();
978 Register WidthReg = MI.getOperand(3).getReg();
979
980 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
981 "scalar BFX instructions are expanded in regbankselect");
982 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
983 "64-bit vector BFX instructions are expanded in regbankselect");
984
985 const DebugLoc &DL = MI.getDebugLoc();
986 MachineBasicBlock *MBB = MI.getParent();
987
988 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
989 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
990 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
991 .addReg(SrcReg)
992 .addReg(OffsetReg)
993 .addReg(WidthReg);
994 MI.eraseFromParent();
995 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
996 return true;
997}
998
bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  // The manual expansion below is only used for the 16-bank LDS case;
  // otherwise the generated pattern handles the intrinsic.
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  // The M0 input must be an SGPR; the data input and result live in VGPRs.
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  // Both interpolation instructions implicitly read M0, so set it up first.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}
1043
1044// Writelane is special in that it can use SGPR and M0 (which would normally
1045// count as using the constant bus twice - but in this case it is allowed since
1046// the lane selector doesn't count as a use of the constant bus). However, it is
1047// still required to abide by the 1 SGPR rule. Fix this up if we might have
1048// multiple SGPRs.
1049bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1050 // With a constant bus limit of at least 2, there's no issue.
1051 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1052 return selectImpl(MI, *CoverageInfo);
1053
1054 MachineBasicBlock *MBB = MI.getParent();
1055 const DebugLoc &DL = MI.getDebugLoc();
1056 Register VDst = MI.getOperand(0).getReg();
1057 Register Val = MI.getOperand(2).getReg();
1058 Register LaneSelect = MI.getOperand(3).getReg();
1059 Register VDstIn = MI.getOperand(4).getReg();
1060
1061 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1062
1063 std::optional<ValueAndVReg> ConstSelect =
1064 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1065 if (ConstSelect) {
1066 // The selector has to be an inline immediate, so we can use whatever for
1067 // the other operands.
1068 MIB.addReg(Val);
1069 MIB.addImm(ConstSelect->Value.getSExtValue() &
1070 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1071 } else {
1072 std::optional<ValueAndVReg> ConstVal =
1074
1075 // If the value written is an inline immediate, we can get away without a
1076 // copy to m0.
1077 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1078 STI.hasInv2PiInlineImm())) {
1079 MIB.addImm(ConstVal->Value.getSExtValue());
1080 MIB.addReg(LaneSelect);
1081 } else {
1082 MIB.addReg(Val);
1083
1084 // If the lane selector was originally in a VGPR and copied with
1085 // readfirstlane, there's a hazard to read the same SGPR from the
1086 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1087 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1088
1089 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1090 .addReg(LaneSelect);
1091 MIB.addReg(AMDGPU::M0);
1092 }
1093 }
1094
1095 MIB.addReg(VDstIn);
1096
1097 MI.eraseFromParent();
1098 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1099 return true;
1100}
1101
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  // Results: Dst0 = scaled value, Dst1 = VCC-style flag output.
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  // Only f32 and f64 variants exist.
  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  // src0 selects which of numerator/denominator is being scaled.
  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}
1143
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  // Dispatch intrinsics that require manual selection; everything else falls
  // through to the TableGen-generated selector.
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    // All operands are lane masks; pin them to the wave mask register class.
    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    // Prefer the generated patterns; fall back to the manual expansion.
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  case Intrinsic::amdgcn_wave_shuffle:
    return selectWaveShuffleIntrin(I);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}
1234
1236 const GCNSubtarget &ST) {
1237 if (Size != 16 && Size != 32 && Size != 64)
1238 return -1;
1239
1240 if (Size == 16 && !ST.has16BitInsts())
1241 return -1;
1242
1243 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1244 unsigned FakeS16Opc, unsigned S32Opc,
1245 unsigned S64Opc) {
1246 if (Size == 16)
1247 return ST.hasTrue16BitInsts()
1248 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1249 : S16Opc;
1250 if (Size == 32)
1251 return S32Opc;
1252 return S64Opc;
1253 };
1254
1255 switch (P) {
1256 default:
1257 llvm_unreachable("Unknown condition code!");
1258 case CmpInst::ICMP_NE:
1259 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1260 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1261 AMDGPU::V_CMP_NE_U64_e64);
1262 case CmpInst::ICMP_EQ:
1263 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1264 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1265 AMDGPU::V_CMP_EQ_U64_e64);
1266 case CmpInst::ICMP_SGT:
1267 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1268 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1269 AMDGPU::V_CMP_GT_I64_e64);
1270 case CmpInst::ICMP_SGE:
1271 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1272 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1273 AMDGPU::V_CMP_GE_I64_e64);
1274 case CmpInst::ICMP_SLT:
1275 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1276 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1277 AMDGPU::V_CMP_LT_I64_e64);
1278 case CmpInst::ICMP_SLE:
1279 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1280 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1281 AMDGPU::V_CMP_LE_I64_e64);
1282 case CmpInst::ICMP_UGT:
1283 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1284 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1285 AMDGPU::V_CMP_GT_U64_e64);
1286 case CmpInst::ICMP_UGE:
1287 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1288 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1289 AMDGPU::V_CMP_GE_U64_e64);
1290 case CmpInst::ICMP_ULT:
1291 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1292 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1293 AMDGPU::V_CMP_LT_U64_e64);
1294 case CmpInst::ICMP_ULE:
1295 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1296 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1297 AMDGPU::V_CMP_LE_U64_e64);
1298
1299 case CmpInst::FCMP_OEQ:
1300 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1301 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1302 AMDGPU::V_CMP_EQ_F64_e64);
1303 case CmpInst::FCMP_OGT:
1304 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1305 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1306 AMDGPU::V_CMP_GT_F64_e64);
1307 case CmpInst::FCMP_OGE:
1308 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1309 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1310 AMDGPU::V_CMP_GE_F64_e64);
1311 case CmpInst::FCMP_OLT:
1312 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1313 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1314 AMDGPU::V_CMP_LT_F64_e64);
1315 case CmpInst::FCMP_OLE:
1316 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1317 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1318 AMDGPU::V_CMP_LE_F64_e64);
1319 case CmpInst::FCMP_ONE:
1320 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1321 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1322 AMDGPU::V_CMP_NEQ_F64_e64);
1323 case CmpInst::FCMP_ORD:
1324 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1325 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1326 AMDGPU::V_CMP_O_F64_e64);
1327 case CmpInst::FCMP_UNO:
1328 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1329 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1330 AMDGPU::V_CMP_U_F64_e64);
1331 case CmpInst::FCMP_UEQ:
1332 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1333 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1334 AMDGPU::V_CMP_NLG_F64_e64);
1335 case CmpInst::FCMP_UGT:
1336 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1337 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1338 AMDGPU::V_CMP_NLE_F64_e64);
1339 case CmpInst::FCMP_UGE:
1340 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1341 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1342 AMDGPU::V_CMP_NLT_F64_e64);
1343 case CmpInst::FCMP_ULT:
1344 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1345 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1346 AMDGPU::V_CMP_NGE_F64_e64);
1347 case CmpInst::FCMP_ULE:
1348 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1349 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1350 AMDGPU::V_CMP_NGT_F64_e64);
1351 case CmpInst::FCMP_UNE:
1352 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1353 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1354 AMDGPU::V_CMP_NEQ_F64_e64);
1355 case CmpInst::FCMP_TRUE:
1356 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1357 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1358 AMDGPU::V_CMP_TRU_F64_e64);
1360 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1361 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1362 AMDGPU::V_CMP_F_F64_e64);
1363 }
1364}
1365
// Map a generic compare predicate and operand size onto the corresponding
// scalar S_CMP opcode, or return -1 if there is no scalar form.
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  // 64-bit scalar compares only exist for (in)equality, and only on
  // subtargets that support them.
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  // 16-bit scalar compares are float-only and need SALU float support.
  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}
1477
bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    // Scalar result: emit S_CMP (which writes SCC) and copy SCC into the
    // destination SGPR.
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
    bool Ret =
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  // G_FCMP with a VCC result is not handled by this manual path.
  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder ICmp;
  // t16 instructions take explicit source modifier and op_sel operands.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .addImm(0)              // src0_modifiers
               .add(I.getOperand(2))
               .addImm(0)              // src1_modifiers
               .add(I.getOperand(3))
               .addImm(0); // op_sel
  } else {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .add(I.getOperand(2))
               .add(I.getOperand(3));
  }

  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
1533
// Manual expansion of amdgcn.icmp/amdgcn.fcmp: produce a full wave-sized
// lane mask in an SGPR pair/register rather than a VCC-bank value.
bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  // The intrinsic result must not be a VCC-bank value.
  if (isVCC(Dst, *MRI))
    return false;

  // The result type must match the wave size exactly.
  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    // Invalid predicate: the result is undefined, so select IMPLICIT_DEF.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  // Fold source modifiers and force the sources into VGPRs.
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  // Only add the operands the chosen opcode actually declares.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}
1590
1591// Ballot has to zero bits in input lane-mask that are zero in current exec,
1592// Done as AND with exec. For inputs that are results of instruction that
1593// implicitly use same exec, for example compares in same basic block or SCC to
1594// VCC copy, use copy.
1597 MachineInstr *MI = MRI.getVRegDef(Reg);
1598 if (MI->getParent() != MBB)
1599 return false;
1600
1601 // Lane mask generated by SCC to VCC copy.
1602 if (MI->getOpcode() == AMDGPU::COPY) {
1603 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1604 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1605 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1606 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1607 return true;
1608 }
1609
1610 // Lane mask generated using compare with same exec.
1611 if (isa<GAnyCmp>(MI))
1612 return true;
1613
1614 Register LHS, RHS;
1615 // Look through AND.
1616 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1617 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1619
1620 return false;
1621}
1622
1623bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1624 MachineBasicBlock *BB = I.getParent();
1625 const DebugLoc &DL = I.getDebugLoc();
1626 Register DstReg = I.getOperand(0).getReg();
1627 Register SrcReg = I.getOperand(2).getReg();
1628 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1629 const unsigned WaveSize = STI.getWavefrontSize();
1630
1631 // In the common case, the return type matches the wave size.
1632 // However we also support emitting i64 ballots in wave32 mode.
1633 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1634 return false;
1635
1636 std::optional<ValueAndVReg> Arg =
1638
1639 Register Dst = DstReg;
1640 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1641 if (BallotSize != WaveSize) {
1642 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1643 }
1644
1645 if (Arg) {
1646 const int64_t Value = Arg->Value.getZExtValue();
1647 if (Value == 0) {
1648 // Dst = S_MOV 0
1649 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1650 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1651 } else {
1652 // Dst = COPY EXEC
1653 assert(Value == 1);
1654 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1655 }
1656 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1657 return false;
1658 } else {
1659 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1660 // Dst = COPY SrcReg
1661 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1662 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1663 return false;
1664 } else {
1665 // Dst = S_AND SrcReg, EXEC
1666 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1667 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1668 .addReg(SrcReg)
1669 .addReg(TRI.getExec())
1670 .setOperandDead(3); // Dead scc
1671 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1672 }
1673 }
1674
1675 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1676 if (BallotSize != WaveSize) {
1677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1678 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1679 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1680 .addReg(Dst)
1681 .addImm(AMDGPU::sub0)
1682 .addReg(HiReg)
1683 .addImm(AMDGPU::sub1);
1684 }
1685
1686 I.eraseFromParent();
1687 return true;
1688}
1689
1690bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1691 Register DstReg = I.getOperand(0).getReg();
1692 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1693 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1694 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1695 return false;
1696
1697 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1698
1699 Module *M = MF->getFunction().getParent();
1700 const MDNode *Metadata = I.getOperand(2).getMetadata();
1701 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1702 auto *RelocSymbol = cast<GlobalVariable>(
1703 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1704
1705 MachineBasicBlock *BB = I.getParent();
1706 BuildMI(*BB, &I, I.getDebugLoc(),
1707 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1709
1710 I.eraseFromParent();
1711 return true;
1712}
1713
1714bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1715 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1716
1717 Register DstReg = I.getOperand(0).getReg();
1718 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1719 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1720 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1721
1722 MachineBasicBlock *MBB = I.getParent();
1723 const DebugLoc &DL = I.getDebugLoc();
1724
1725 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1726
1727 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1728 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1729 MIB.addImm(MFI->getLDSSize());
1730 } else {
1731 Module *M = MF->getFunction().getParent();
1732 const GlobalValue *GV =
1733 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1735 }
1736
1737 I.eraseFromParent();
1738 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1739 return true;
1740}
1741
bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  // The result must be a 64-bit SGPR pair.
  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions; they have no meaningful return
  // address, and non-zero depths are unsupported, so return 0 in both cases.
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
      .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass, DL);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
    .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}
1779
1780bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1781 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1782 // SelectionDAG uses for wave32 vs wave64.
1783 MachineBasicBlock *BB = MI.getParent();
1784 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1785 .add(MI.getOperand(1));
1786
1787 Register Reg = MI.getOperand(1).getReg();
1788 MI.eraseFromParent();
1789
1790 if (!MRI->getRegClassOrNull(Reg))
1791 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1792 return true;
1793}
1794
// Select @llvm.amdgcn.ds.ordered.add / @llvm.amdgcn.ds.ordered.swap: pack the
// index / wave_release / wave_done immediates into the offset field of a
// DS_ORDERED_COUNT instruction, routing the m0 value (operand 2) through the
// physical M0 register. Malformed immediates are diagnosed, then recovered
// from with a legal value so selection can continue.
bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Operands 7-9 are the index, wave_release and wave_done immediates.
  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease) {
    // TODO: Move this to IR verifier
    const Function &Fn = MF->getFunction();
    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
        Fn, "ds_ordered_count: wave_done requires wave_release", DL));
  }

  // Low 6 bits of the index operand select the ordered-count index; strip
  // them off so we can validate the remainder below.
  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    // On GFX10+, bits [27:24] of the index operand encode a dword count that
    // must be in [1, 4].
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      const Function &Fn = MF->getFunction();
      Fn.getContext().diagnose(DiagnosticInfoUnsupported(
          Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
      // Recover with a legal value after diagnosing.
      CountDw = 1;
    }
  }

  // Any bits left over after stripping the known fields are invalid.
  if (IndexOperand) {
    const Function &Fn = MF->getFunction();
    Fn.getContext().diagnose(DiagnosticInfoUnsupported(
        Fn, "ds_ordered_count: bad index operand", DL));
  }

  // Instruction field: 0 = add, 1 = swap.
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  // Pack the control fields: Offset0 carries the index scaled by 4, Offset1
  // carries the flag bits, and the two are concatenated into the 16-bit
  // offset immediate below.
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  // Pre-GFX11 also encodes the shader type in the offset.
  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
    Offset1 |= ShaderType << 2;

  unsigned Offset = Offset0 | (Offset1 << 8);

  // The m0 input must live in the physical M0 register.
  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
        .addReg(ValReg)
        .addImm(Offset)
        .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return true;
}
1867
1868static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1869 switch (IntrID) {
1870 case Intrinsic::amdgcn_ds_gws_init:
1871 return AMDGPU::DS_GWS_INIT;
1872 case Intrinsic::amdgcn_ds_gws_barrier:
1873 return AMDGPU::DS_GWS_BARRIER;
1874 case Intrinsic::amdgcn_ds_gws_sema_v:
1875 return AMDGPU::DS_GWS_SEMA_V;
1876 case Intrinsic::amdgcn_ds_gws_sema_br:
1877 return AMDGPU::DS_GWS_SEMA_BR;
1878 case Intrinsic::amdgcn_ds_gws_sema_p:
1879 return AMDGPU::DS_GWS_SEMA_P;
1880 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1881 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1882 default:
1883 llvm_unreachable("not a gws intrinsic");
1884 }
1885}
1886
// Select the ds_gws_* family: compute the m0 base / immediate offset split
// for the GWS resource offset, then emit the DS_GWS_* instruction (with an
// optional data operand that may require even-register alignment padding).
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  // GWS must be supported at all, and sema_release_all additionally needs its
  // own subtarget feature.
  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
                        !STI.hasGWSSemaReleaseAll()))
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(0);
  } else {
    // Split any constant addend off the variable base so it can go in the
    // instruction's offset field.
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    // Shift the variable base into the M0[21:16] position (see the resource
    // id note below).
    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
        .addReg(BaseOffset)
        .addImm(16)
        .setOperandDead(3); // Dead scc

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.

  unsigned Opc = gwsIntrinToOpcode(IID);
  const MCInstrDesc &InstrDesc = TII.get(Opc);

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();

    int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
    const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
    const TargetRegisterClass *SubRC =
        TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);

    if (!SubRC) {
      // 32-bit normal case.
      if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
        return false;

      BuildMI(*MBB, &MI, DL, InstrDesc)
        .addReg(VSrc)
        .addImm(ImmOffset)
        .cloneMemRefs(MI);
    } else {
      // Requires even register alignment, so create 64-bit value and pad the
      // top half with undef.
      Register DataReg = MRI->createVirtualRegister(DataRC);
      if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
        return false;

      Register UndefReg = MRI->createVirtualRegister(SubRC);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
        .addReg(VSrc)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(*MBB, &MI, DL, InstrDesc)
        .addReg(DataReg)
        .addImm(ImmOffset)
        .cloneMemRefs(MI);
    }
  } else {
    // No data operand: just the immediate offset.
    BuildMI(*MBB, &MI, DL, InstrDesc)
      .addImm(ImmOffset)
      .cloneMemRefs(MI);
  }

  MI.eraseFromParent();
  return true;
}
2009
2010bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2011 bool IsAppend) const {
2012 Register PtrBase = MI.getOperand(2).getReg();
2013 LLT PtrTy = MRI->getType(PtrBase);
2014 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2015
2016 unsigned Offset;
2017 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2018
2019 // TODO: Should this try to look through readfirstlane like GWS?
2020 if (!isDSOffsetLegal(PtrBase, Offset)) {
2021 PtrBase = MI.getOperand(2).getReg();
2022 Offset = 0;
2023 }
2024
2025 MachineBasicBlock *MBB = MI.getParent();
2026 const DebugLoc &DL = MI.getDebugLoc();
2027 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2028
2029 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2030 .addReg(PtrBase);
2031 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2032 return false;
2033
2034 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2035 .addImm(Offset)
2036 .addImm(IsGDS ? -1 : 0)
2037 .cloneMemRefs(MI);
2038 MI.eraseFromParent();
2039 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2040 return true;
2041}
2042
2043bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2044 MachineFunction *MF = MI.getMF();
2045 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2046
2047 MFInfo->setInitWholeWave();
2048 return selectImpl(MI, *CoverageInfo);
2049}
2050
// Decode a texfailctrl immediate. Bit 0 is TFE, bit 1 is LWE; any nonzero
// value marks IsTexFail (the flag is only ever set, never cleared here).
// Returns true iff no bits other than TFE/LWE were present.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) != 0;
  LWE = (TexFailCtrl & 0x2) != 0;

  // Valid only if nothing beyond the two known bits was set.
  return (TexFailCtrl & ~uint64_t(0x3)) == 0;
}
2063
// Select a G_AMDGPU_INTRIN_IMAGE_* pseudo to a concrete MIMG instruction:
// compute the vdata/vaddr dword counts, pick an opcode for the subtarget's
// newest supported encoding, and emit the operand list in MIMG order.
//
// NOTE(review): a few statements in this listing appear truncated (dropped
// hyperlinked lines are marked below) -- verify against the upstream source.
bool AMDGPUInstructionSelector::selectImageIntrinsic(
    MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned IntrOpcode = Intr->BaseOpcode;

  // For image atomic: use no-return opcode if result is unused.
  if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
    Register ResultDef = MI.getOperand(0).getReg();
    if (MRI->use_nodbg_empty(ResultDef))
      IntrOpcode = Intr->AtomicNoRetBaseOpcode;
  }

  // NOTE(review): initializer appears dropped from this listing.
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);

  // Intrinsic arguments start after the explicit defs and the intrinsic ID.
  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn = AMDGPU::NoRegister;
  Register VDataOut = AMDGPU::NoRegister;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  // Non-sampler ops are always unorm; sampler ops read the unorm immediate.
  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  // The trailing flags immediate packs a16 (bit 0) and g16 (bit 1).
  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    if (!BaseOpcode->NoReturn)
      VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else if (BaseOpcode->NoReturn) {
      NumVDataDwords = 0;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      // Packed D16 fits two half-dword lanes per dword.
      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  if (Subtarget->hasG16() && IsG16) {
    // NOTE(review): initializer appears dropped from this listing.
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =

    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  // Keep GLC only when the atomic's result is actually used.
  // NOTE(review): the body of this 'if' and the tail of the CPol validity
  // check appear dropped from this listing.
  if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)

  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |

    return false;

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA =
      NumVAddrRegs != 1 &&
      (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
                                   : NumVAddrDwords == NumVAddrRegs);
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  // Texture-fail reporting adds one extra result dword.
  if (IsTexFail)
    ++NumVDataDwords;

  // Pick an opcode for the newest encoding this subtarget supports, falling
  // back through older encodings pre-GFX10.
  int Opcode = -1;
  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
        return false;
      }
    }
    if (Opcode == -1 &&
        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
    .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      // X2 atomics define a wide temp; the real result is its low half.
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
        Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
            .addReg(TmpReg, RegState::Kill, SubReg);
      }

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
    MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 && // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  if (!Subtarget->hasGFX90AInsts()) {
    MIB.addImm(TFE); // tfe
  } else if (TFE) {
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
    return false;
  }

  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
    MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
  return true;
}
2313
2314// We need to handle this here because tablegen doesn't support matching
2315// instructions with multiple outputs.
2316bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2317 MachineInstr &MI) const {
2318 Register Dst0 = MI.getOperand(0).getReg();
2319 Register Dst1 = MI.getOperand(1).getReg();
2320
2321 const DebugLoc &DL = MI.getDebugLoc();
2322 MachineBasicBlock *MBB = MI.getParent();
2323
2324 Register Addr = MI.getOperand(3).getReg();
2325 Register Data0 = MI.getOperand(4).getReg();
2326 Register Data1 = MI.getOperand(5).getReg();
2327 unsigned Offset = MI.getOperand(6).getImm();
2328
2329 unsigned Opc;
2330 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2331 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2332 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2333 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2334 break;
2335 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2336 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2337 break;
2338 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2339 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2340 break;
2341 }
2342
2343 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2344 .addDef(Dst1)
2345 .addUse(Addr)
2346 .addUse(Data0)
2347 .addUse(Data1)
2348 .addImm(Offset)
2349 .cloneMemRefs(MI);
2350
2351 MI.eraseFromParent();
2352 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2353 return true;
2354}
2355
// Dispatch side-effecting intrinsics that need manual selection; anything not
// handled (or only feature-checked) here falls through to the generated
// selectImpl matcher.
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_init_whole_wave:
    return selectInitWholeWave(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
    return selectBufferLoadLds(I);
  // Until we can store both the address space of the global and the LDS
  // arguments by having two MachineMemOperands on an intrinsic, we just trust
  // that the argument is a global pointer (buffer pointers have been handled by
  // a LLVM IR-level lowering).
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_tensor_load_to_lds:
  case Intrinsic::amdgcn_tensor_store_from_lds:
    return selectTensorLoadStore(I, IntrinsicID);
  case Intrinsic::amdgcn_asyncmark:
  case Intrinsic::amdgcn_wait_asyncmark:
    // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
    if (!Subtarget->hasVMemToLDSLoad())
      return false;
    break;
  case Intrinsic::amdgcn_exp_compr:
    // Feature gate only; selection itself is left to selectImpl.
    if (!STI.hasCompressedExport()) {
      Function &F = I.getMF()->getFunction();
      F.getContext().diagnose(
          DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
                                    I.getDebugLoc(), DS_Error));
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
    return selectDSBvhStackIntrinsic(I);
  case Intrinsic::amdgcn_s_alloc_vgpr: {
    // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
    // SCC. We then need to COPY it into the result vreg.
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register ResReg = I.getOperand(0).getReg();

    MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
                                .add(I.getOperand(2));
    (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
        .addReg(AMDGPU::SCC);
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
    return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
  }
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var:
    return selectNamedBarrierInit(I, IntrinsicID);
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    if (!STI.hasSWakeupBarrier()) {
      Function &F = I.getMF()->getFunction();
      F.getContext().diagnose(
          DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
                                    I.getDebugLoc(), DS_Error));
      return false;
    }
    return selectNamedBarrierInst(I, IntrinsicID);
  }
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_get_named_barrier_state:
    return selectNamedBarrierInst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_get_barrier_state:
    return selectSGetBarrierState(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
  }
  return selectImpl(I, *CoverageInfo);
}
2458
// Select G_SELECT: scalar conditions go through SCC + S_CSELECT; VCC
// conditions become a V_CNDMASK on a 32-bit value.
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    // Scalar path: materialize the condition into SCC and use s_cselect.
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc register
    // bank, because it does not cover the register class that we used to represent
    // for it. So we need to manually set the register class here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    // NOTE(review): a line appears dropped from this listing here (likely a
    // constrain call on *Select) -- verify against the upstream source.
    constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return true;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .add(I.getOperand(3))
          .addImm(0)
          .add(I.getOperand(2))
          .add(I.getOperand(1));

  // NOTE(review): a line appears dropped from this listing here (likely a
  // constrain call on *Select) -- verify against the upstream source.
  I.eraseFromParent();
  return true;
}
2508
// Select G_TRUNC. Most truncates become (sub-register) copies; the
// <2 x s32> -> <2 x s16> case needs real packing code.
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
    // True-16 target: a 32->16 truncate is just a copy of the low half.
    assert(STI.useRealTrue16Insts());
    const DebugLoc &DL = I.getDebugLoc();
    MachineBasicBlock *MBB = I.getParent();
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
        .addReg(SrcReg, {}, AMDGPU::lo16);
    I.eraseFromParent();
    return true;
  }

  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
    // Pack the low 16 bits of each 32-bit source element into one register.
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
        .addReg(SrcReg, {}, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
        .addReg(SrcReg, {}, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
      MachineInstr *MovSDWA =
          BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
              .addImm(0)                             // $src0_modifiers
              .addReg(HiReg)                         // $src0
              .addImm(0)                             // $clamp
              .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
              .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
              .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
              .addReg(LoReg, RegState::Implicit);
      // Tie the preserved destination input to the def.
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      // No SDWA: shift the high element up by 16 and OR with the masked low
      // element.
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
      if (IsVALU) {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
            .addImm(16)
            .addReg(HiReg);
      } else {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
            .addReg(HiReg)
            .addImm(16)
            .setOperandDead(3); // Dead scc
      }

      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;

      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
          .addImm(0xffff);
      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
          .addReg(LoReg)
          .addReg(ImmReg);
      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
          .addReg(TmpReg0)
          .addReg(TmpReg1);

      if (!IsVALU) {
        And.setOperandDead(3); // Dead scc
        Or.setOperandDead(3); // Dead scc
      }
    }

    I.eraseFromParent();
    return true;
  }

  if (!DstTy.isScalar())
    return false;

  if (SrcSize > 32) {
    // Wide scalar source: select the low sub-register(s) covering DstSize.
    unsigned SubRegIdx = DstSize < 32
                             ? static_cast<unsigned>(AMDGPU::sub0)
                             : TRI.getSubRegFromChannel(0, DstSize / 32);
    if (SubRegIdx == AMDGPU::NoSubRegister)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    const TargetRegisterClass *SrcWithSubRC
      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcWithSubRC)
      return false;

    if (SrcWithSubRC != SrcRC) {
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
        return false;
    }

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  // Everything else reduces to a plain (sub-register) copy.
  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
2646
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  // NOTE(review): the statement computing \p Mask from \p Size appears to
  // have been dropped from this listing -- verify against the upstream
  // source before relying on this function.
  int SignedMask = static_cast<int>(Mask);
  // A value is an inline immediate here iff it falls in [-16, 64] when
  // reinterpreted as signed.
  return SignedMask >= -16 && SignedMask <= 64;
}
2653
2654// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2655const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2656 Register Reg, const MachineRegisterInfo &MRI,
2657 const TargetRegisterInfo &TRI) const {
2658 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2659 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2660 return RB;
2661
2662 // Ignore the type, since we don't use vcc in artifacts.
2663 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2664 return &RBI.getRegBankFromRegClass(*RC, LLT());
2665 return nullptr;
2666}
2667
// Select G_SEXT / G_ZEXT / G_ANYEXT / G_SEXT_INREG manually, picking between
// a plain copy, a REG_SEQUENCE with undefined high bits, dedicated scalar
// sign-extend instructions, AND-masking, or BFE instructions, based on the
// source register bank and the source/destination sizes.
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  // For G_SEXT_INREG the effective source width comes from the immediate
  // operand, not from the register type.
  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  // Artifact casts should never use vcc.
  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);

  // FIXME: This should probably be illegal and split earlier.
  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
    if (DstSize <= 32)
      return selectCOPY(I);

    // Wide anyext: put the source in the low subregister and leave the high
    // half undefined.
    const TargetRegisterClass *SrcRC =
        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
    const TargetRegisterClass *DstRC =
        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);

    Register UndefReg = MRI->createVirtualRegister(SrcRC);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(SrcReg)
      .addImm(AMDGPU::sub0)
      .addReg(UndefReg)
      .addImm(AMDGPU::sub1);
    I.eraseFromParent();

    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
  }

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
        .addImm(Mask)
        .addReg(SrcReg);
      I.eraseFromParent();
      constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
      return true;
    }

    // General VALU case: bitfield extract starting at bit 0.
    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
      .addReg(SrcReg)
      .addImm(0) // Offset
      .addImm(SrcSize); // Width
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    return true;
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    // A 64-bit G_SEXT_INREG source must stay in a 64-bit class; all other
    // cases operate on a 32-bit scalar source.
    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
      return false;

    // 8- and 16-bit signed extends have dedicated scalar instructions.
    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
    }

    // Using a single 32-bit SALU to calculate the high half is smaller than
    // S_BFE with a literal constant operand.
    if (DstSize > 32 && SrcSize == 32) {
      Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
      if (Signed) {
        // High half is a broadcast of the low half's sign bit.
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
          .addReg(SrcReg, {}, SubReg)
          .addImm(31)
          .setOperandDead(3); // Dead scc
      } else {
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
          .addImm(0);
      }
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(SrcReg, {}, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(HiReg)
        .addImm(AMDGPU::sub1);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
                                          *MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
      // We need a 64-bit register source, but the high bits don't matter.
      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;

      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg, {}, SubReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask)
        .setOperandDead(3); // Dead scc
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
  }

  return false;
}
2819
2823
2825 Register BitcastSrc;
2826 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2827 Reg = BitcastSrc;
2828 return Reg;
2829}
2830
2832 Register &Out) {
2833 Register Trunc;
2834 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2835 return false;
2836
2837 Register LShlSrc;
2838 Register Cst;
2839 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2840 Cst = stripCopy(Cst, MRI);
2841 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2842 Out = stripBitCast(LShlSrc, MRI);
2843 return true;
2844 }
2845 }
2846
2847 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2848 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2849 return false;
2850
2851 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2852 LLT::fixed_vector(2, 16));
2853
2854 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2855 assert(Mask.size() == 2);
2856
2857 if (Mask[0] == 1 && Mask[1] <= 1) {
2858 Out = Shuffle->getOperand(0).getReg();
2859 return true;
2860 }
2861
2862 return false;
2863}
2864
2865bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2866 if (!Subtarget->hasSALUFloatInsts())
2867 return false;
2868
2869 Register Dst = I.getOperand(0).getReg();
2870 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2871 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2872 return false;
2873
2874 Register Src = I.getOperand(1).getReg();
2875
2876 if (MRI->getType(Dst) == LLT::scalar(32) &&
2877 MRI->getType(Src) == LLT::scalar(16)) {
2878 if (isExtractHiElt(*MRI, Src, Src)) {
2879 MachineBasicBlock *BB = I.getParent();
2880 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2881 .addUse(Src);
2882 I.eraseFromParent();
2883 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2884 }
2885 }
2886
2887 return false;
2888}
2889
// Manually select a 64-bit SGPR G_FNEG by flipping (or, when folding a G_FABS
// source, setting) the sign bit of the high 32-bit half with a single SALU op.
bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
  // Only manually handle the f64 SGPR case.
  //
  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
  // the bit ops theoretically have a second result due to the implicit def of
  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
  // that is easy by disabling the check. The result works, but uses a
  // nonsensical sreg32orlds_and_sreg_1 regclass.
  //
  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
  // the variadic REG_SEQUENCE operands.

  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  // Fold fneg(fabs(x)) by negating through the fabs source; the OR below then
  // sets the sign bit instead of toggling it.
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // Split the 64-bit value; only the high half holds the sign bit.
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(Src, {}, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(Src, {}, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
    .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
    .addReg(HiReg)
    .addReg(ConstReg)
    .setOperandDead(3); // Dead scc
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
    .addReg(LoReg)
    .addImm(AMDGPU::sub0)
    .addReg(OpReg)
    .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}
2945
2946// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2947bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2948 Register Dst = MI.getOperand(0).getReg();
2949 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2950 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2951 MRI->getType(Dst) != LLT::scalar(64))
2952 return false;
2953
2954 Register Src = MI.getOperand(1).getReg();
2955 MachineBasicBlock *BB = MI.getParent();
2956 const DebugLoc &DL = MI.getDebugLoc();
2957 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2958 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2959 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2960 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2961
2962 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2963 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2964 return false;
2965
2966 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2967 .addReg(Src, {}, AMDGPU::sub0);
2968 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2969 .addReg(Src, {}, AMDGPU::sub1);
2970 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2971 .addImm(0x7fffffff);
2972
2973 // Clear sign bit.
2974 // TODO: Should this used S_BITSET0_*?
2975 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2976 .addReg(HiReg)
2977 .addReg(ConstReg)
2978 .setOperandDead(3); // Dead scc
2979 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2980 .addReg(LoReg)
2981 .addImm(AMDGPU::sub0)
2982 .addReg(OpReg)
2983 .addImm(AMDGPU::sub1);
2984
2985 MI.eraseFromParent();
2986 return true;
2987}
2988
2989static bool isConstant(const MachineInstr &MI) {
2990 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2991}
2992
// Walk the chain of G_PTR_ADDs feeding \p Load's pointer operand, recording
// for each level the constant immediate offset (if any) and the SGPR/VGPR
// register components of the address expression.
void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  // G_PREFETCH carries its pointer in operand 0; loads carry it in operand 1.
  unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
  const MachineInstr *PtrMI =
      MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo;

  // Operand 1 is the base pointer, operand 2 the offset.
  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    // Non-constant components are bucketed by register bank.
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  // Recurse on the base to collect nested G_PTR_ADD levels.
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}
3028
3029bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3030 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3031}
3032
// Heuristically decide whether \p MI's memory access is uniform, based on its
// single memory operand's underlying IR value.
bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  // NOTE(review): the conditions guarding the next two returns appear to be
  // missing in this revision of the file — verify against upstream.
    return true;

    return true;

  // A prefetch is treated as uniform when its address operand lives in the
  // SGPR bank.
  if (MI.getOpcode() == AMDGPU::G_PREFETCH)
    return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
           AMDGPU::SGPRRegBankID;

  // Otherwise rely on explicit uniformity metadata on the IR instruction.
  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}
3057
3058bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3059 for (const GEPInfo &GEPInfo : AddrInfo) {
3060 if (!GEPInfo.VgprParts.empty())
3061 return true;
3062 }
3063 return false;
3064}
3065
// Insert an M0 initialization before \p I when the subtarget requires M0 to
// be set up for DS (local memory) instructions.
void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  // NOTE(review): the first line of this condition (presumably an address
  // space check on AS) appears to be missing in this revision — verify
  // against upstream.
      STI.ldsRequiresM0Init()) {
    MachineBasicBlock *BB = I.getParent();

    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addImm(-1);
  }
}
3078
// Common path for load/store/atomicrmw selection: make sure M0 is initialized
// when the target needs it, then defer to the tablegen-generated selector.
bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
  MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}
3084
  // NOTE(review): the enclosing function's signature and the def of MI appear
  // to be missing in this revision — verify against upstream.
  if (Reg.isPhysical())
    return false;

  const unsigned Opcode = MI.getOpcode();

  // Look through plain copies to the underlying producer.
  if (Opcode == AMDGPU::COPY)
    return isVCmpResult(MI.getOperand(1).getReg(), MRI);

  // Bitwise combinations of compare results are themselves treated as compare
  // results.
  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
      Opcode == AMDGPU::G_XOR)
    return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
           isVCmpResult(MI.getOperand(2).getReg(), MRI);

  // llvm.amdgcn.class results are treated like compares.
  if (auto *GI = dyn_cast<GIntrinsic>(&MI))
    return GI->is(Intrinsic::amdgcn_class);

  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
}
3105
// Select G_BRCOND into either a SALU branch on SCC (scalar condition) or a
// VALU branch on VCC (divergent condition), copying the condition into the
// appropriate physical register first.
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32RegClass;
  } else {
    // FIXME: Should scc->vcc copies and with exec?

    // Unless the value of CondReg is a result of a V_CMP* instruction then we
    // need to insert an and with exec.
    if (!isVCmpResult(CondReg, *MRI)) {
      const bool Is64 = STI.isWave64();
      const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;

      Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
      BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
          .addReg(CondReg)
          .addReg(Exec)
          .setOperandDead(3); // Dead scc
      CondReg = TmpReg;
    }

    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  // Move the condition into the physical condition register, then branch.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}
3162
3163bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3164 MachineInstr &I) const {
3165 Register DstReg = I.getOperand(0).getReg();
3166 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3167 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3168 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3169 if (IsVGPR)
3170 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3171
3172 return RBI.constrainGenericRegister(
3173 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3174}
3175
// Select G_PTRMASK as AND of the pointer with the mask, trying to avoid bit
// operations on any 32-bit half whose mask bits are known all-ones (in which
// case a plain subregister copy suffices).
bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  Register MaskReg = I.getOperand(2).getReg();
  LLT Ty = MRI->getType(DstReg);
  LLT MaskTy = MRI->getType(MaskReg);
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand written MIR.
    return false;

  // Try to avoid emitting a bit operation when we only need to touch half of
  // the 64-bit pointer.
  APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);

  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;

  // Scalar 64-bit case where both halves really need masking: one S_AND_B64.
  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
      !CanCopyLow32 && !CanCopyHi32) {
    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
      .addReg(SrcReg)
      .addReg(MaskReg)
      .setOperandDead(3); // Dead scc
    I.eraseFromParent();
    constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    return true;
  }

  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
  const TargetRegisterClass *MaskRC =
      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
    return false;

  // 32-bit pointers: a single AND of the appropriate bank.
  if (Ty.getSizeInBits() == 32) {
    assert(MaskTy.getSizeInBits() == 32 &&
           "ptrmask should have been narrowed during legalize");

    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
                     .addReg(SrcReg)
                     .addReg(MaskReg);

    if (!IsVGPR)
      NewOp.setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return true;
  }

  // 64-bit case: process the two halves independently so a fully-set half
  // degenerates to a copy.
  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);

  // Extract the subregisters from the source pointer.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
    .addReg(SrcReg, {}, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
    .addReg(SrcReg, {}, AMDGPU::sub1);

  Register MaskedLo, MaskedHi;

  if (CanCopyLow32) {
    // If all the bits in the low half are 1, we only need a copy for it.
    MaskedLo = LoReg;
  } else {
    // Extract the mask subregister and apply the and.
    Register MaskLo = MRI->createVirtualRegister(&RegRC);
    MaskedLo = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
      .addReg(MaskReg, {}, AMDGPU::sub0);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
      .addReg(LoReg)
      .addReg(MaskLo);
  }

  if (CanCopyHi32) {
    // If all the bits in the high half are 1, we only need a copy for it.
    MaskedHi = HiReg;
  } else {
    Register MaskHi = MRI->createVirtualRegister(&RegRC);
    MaskedHi = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
      .addReg(MaskReg, {}, AMDGPU::sub1);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
      .addReg(HiReg)
      .addReg(MaskHi);
  }

  // Reassemble the 64-bit result.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(MaskedLo)
    .addImm(AMDGPU::sub0)
    .addReg(MaskedHi)
    .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}
3288
3289/// Return the register to use for the index value, and the subregister to use
3290/// for the indirectly accessed register.
3291static std::pair<Register, unsigned>
3293 const TargetRegisterClass *SuperRC, Register IdxReg,
3294 unsigned EltSize, GISelValueTracking &ValueTracking) {
  // Split the index into a variable base plus a constant offset; the constant
  // part is folded into the choice of subregister below.
  Register IdxBaseReg;
  int Offset;

  std::tie(IdxBaseReg, Offset) =
      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
  if (IdxBaseReg == AMDGPU::NoRegister) {
    // This will happen if the index is a known constant. This should ordinarily
    // be legalized out, but handle it as a register just in case.
    assert(Offset == 0);
    IdxBaseReg = IdxReg;
  }

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::pair(IdxReg, SubRegs[0]);
  return std::pair(IdxBaseReg, SubRegs[Offset]);
}
3315
// Select a variable-index G_EXTRACT_VECTOR_ELT via S_MOVRELS (scalar source),
// V_MOVRELS (VGPR source, M0-indexed), or the GPR-index-mode pseudo.
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(2).getReg();

  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
  if (!SrcRC || !DstRC)
    return false;
  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const bool Is64 = DstTy.getSizeInBits() == 64;

  unsigned SubReg;
  // Fold any constant part of the index into the starting subregister.
  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
      *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);

  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
    if (DstTy.getSizeInBits() != 32 && !Is64)
      return false;

    // The index is communicated through M0.
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
    BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
      .addReg(SrcReg, {}, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
    return false;

  if (!STI.useVGPRIndexMode()) {
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
      .addReg(SrcReg, {}, SubReg)
      .addReg(SrcReg, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Subtargets with VGPR index mode use a dedicated pseudo instead of M0.
  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(SrcReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
3391
// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
// Select a variable-index G_INSERT_VECTOR_ELT via the movrel-write pseudos or,
// when available, the GPR-index-mode pseudo.
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register VecReg = MI.getOperand(1).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);
  unsigned VecSize = VecTy.getSizeInBits();
  unsigned ValSize = ValTy.getSizeInBits();

  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);

  assert(VecTy.getElementType() == ValTy);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  const TargetRegisterClass *VecRC =
      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
  const TargetRegisterClass *ValRC =
      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);

  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
      !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
      !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  // Only 32-bit element writes are handled for VGPR vectors.
  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
    return false;

  unsigned SubReg;
  // Fold any constant part of the index into the starting subregister.
  std::tie(IdxReg, SubReg) =
      computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);

  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
                         STI.useVGPRIndexMode();

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  if (!IndexMode) {
    // The index is communicated through M0.
    BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(IdxReg);

    const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
        VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
    BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
        .addReg(VecReg)
        .addReg(ValReg)
        .addImm(SubReg);
    MI.eraseFromParent();
    return true;
  }

  const MCInstrDesc &GPRIDXDesc =
      TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
      .addReg(VecReg)
      .addReg(ValReg)
      .addReg(IdxReg)
      .addImm(SubReg);

  MI.eraseFromParent();
  return true;
}
3465
3466static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3467 switch (Intr) {
3468 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3469 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3470 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3471 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3472 case Intrinsic::amdgcn_load_async_to_lds:
3473 case Intrinsic::amdgcn_global_load_async_lds:
3474 return true;
3475 }
3476 return false;
3477}
3478
// Manually select the raw/struct buffer-load-to-LDS intrinsics into the
// matching BUFFER_LOAD_*_LDS_* instruction, choosing the
// OFFSET/OFFEN/IDXEN/BOTHEN addressing form from the presence of
// vindex/voffset operands.
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  if (!Subtarget->hasVMemToLDSLoad())
    return false;
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == 9;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(4).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  // NOTE(review): the initializer of MaybeVOffset appears truncated in this
  // revision — verify against upstream. A known-zero voffset selects the
  // form without a VGPR offset operand.
  std::optional<ValueAndVReg> MaybeVOffset =
  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();

  // Pick the opcode from the transfer size and the addressing form.
  switch (Size) {
  default:
    return false;
  case 1:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
             : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                          : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    break;
  case 2:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
             : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                          : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    break;
  case 4:
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
             : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                          : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    break;
  case 12:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
             : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                          : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
             : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                          : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  // Operand 2 (the LDS destination operand of the intrinsic) is carried in M0.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .add(MI.getOperand(2));

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));

  if (HasVIndex && HasVOffset) {
    // BOTHEN takes vindex/voffset packed into a 64-bit VGPR pair.
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
      .addReg(VIndex)
      .addImm(AMDGPU::sub0)
      .addReg(VOffset)
      .addImm(AMDGPU::sub1);

    MIB.addReg(IdxReg);
  } else if (HasVIndex) {
    MIB.addReg(VIndex);
  } else if (HasVOffset) {
    MIB.addReg(VOffset);
  }

  MIB.add(MI.getOperand(1)); // rsrc
  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
  // Mask the aux immediate down to the cache-policy bits valid on the
  // generation being targeted.
  MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
                                : AMDGPU::CPol::ALL_pregfx12)); // cpol
  MIB.addImm(
      Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
          ? 1
          : 0); // swz
  MIB.addImm(isAsyncLDSDMA(IntrinsicID));

  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  // Don't set the offset value here because the pointer points to the base of
  // the buffer.
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();

  MachinePointerInfo StorePtrI = LoadPtrI;
  // NOTE(review): several lines in this pointer-info setup appear truncated
  // in this revision — verify against upstream.
  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),

  auto F = LoadMMO->getFlags() &
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());

  // The instruction performs both a buffer load and an LDS store, so attach
  // one memory operand for each side.
  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), LoadMMO->getBaseAlign());

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}
3602
3603/// Match a zero extend from a 32-bit value to 64-bits.
3604Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3605 Register ZExtSrc;
3606 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3607 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3608
3609 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3610 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3611 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3612 return Register();
3613
3614 assert(Def->getNumOperands() == 3 &&
3615 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3616 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3617 return Def->getOperand(1).getReg();
3618 }
3619
3620 return Register();
3621}
3622
3623/// Match a sign extend from a 32-bit value to 64-bits.
3624Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3625 Register SExtSrc;
3626 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3627 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3628
3629 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3630 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3631 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3632 return Register();
3633
3634 assert(Def->getNumOperands() == 3 &&
3635 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3636 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3637 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3638 m_SpecificICst(31))))
3639 return Def->getOperand(1).getReg();
3640
3641 if (VT->signBitIsZero(Reg))
3642 return matchZeroExtendFromS32(Reg);
3643
3644 return Register();
3645}
3646
/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
/// is 32-bit.
/// \returns the matched 32-bit register, or a null Register on failure.
AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
  // Already s32: return the register as-is; otherwise defer to the zext
  // matcher.
  return MRI->getType(Reg) == LLT::scalar(32) ? Reg
                                              : matchZeroExtendFromS32(Reg);
}
3654
/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
/// is 32-bit.
/// \returns the matched 32-bit register, or a null Register on failure.
AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
  // Already s32: return the register as-is; otherwise defer to the sext
  // matcher.
  return MRI->getType(Reg) == LLT::scalar(32) ? Reg
                                              : matchSignExtendFromS32(Reg);
}
3662
/// Match a sign or zero extend from a 32-bit value to 64-bits (or \p Reg
/// itself if it is already 32-bit), choosing the matcher by \p IsSigned.
AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
                                                   bool IsSigned) const {
  if (IsSigned)
    return matchSignExtendFromS32OrS32(Reg);

  return matchZeroExtendFromS32OrS32(Reg);
}
3671
3672Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3673 Register AnyExtSrc;
3674 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3675 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3676
3677 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3678 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3679 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3680 return Register();
3681
3682 assert(Def->getNumOperands() == 3 &&
3683 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3684
3685 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3686 return Def->getOperand(1).getReg();
3687
3688 return Register();
3689}
3690
/// Select a global-to-LDS load intrinsic into a GLOBAL_LOAD_LDS_* instruction.
/// Operand layout of \p MI: (1) address, (2) LDS destination (copied to M0),
/// (3) transfer size in bytes, (4) immediate offset, (5) aux/cpol bits.
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
  if (!Subtarget->hasVMemToLDSLoad())
    return false;

  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();

  // Pick the opcode by transfer width; 12/16-byte forms need hardware
  // support for b96/b128 LDS loads.
  switch (Size) {
  default:
    return false;
  case 1:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    break;
  case 2:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    break;
  case 4:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    break;
  case 12:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    break;
  case 16:
    if (!Subtarget->hasLDSLoadB96_B128())
      return false;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  // The LDS destination address is communicated through M0.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .add(MI.getOperand(2));

  Register Addr = MI.getOperand(1).getReg();
  Register VOffset;
  // Try to split SAddr and VOffset. Global and LDS pointers share the same
  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
  if (!isSGPR(Addr)) {
    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
    if (isSGPR(AddrDef->Reg)) {
      Addr = AddrDef->Reg;
    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
      // (sgpr base) + (zext s32 offset): keep the base in Addr and the
      // 32-bit offset in VOffset.
      Register SAddr =
          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
      if (isSGPR(SAddr)) {
        Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
        if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
          Addr = SAddr;
          VOffset = Off;
        }
      }
    }
  }

  if (isSGPR(Addr)) {
    // SGPR-base form still needs a VGPR offset operand; materialize zero if
    // none was split off above.
    if (!VOffset) {
      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
          .addImm(0);
    }
  }

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
                 .addReg(Addr);

  if (isSGPR(Addr))
    MIB.addReg(VOffset);

  MIB.add(MI.getOperand(4)); // offset

  unsigned Aux = MI.getOperand(5).getImm();
  // Strip compiler-internal virtual cpol bits before emitting.
  MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
  MIB.addImm(isAsyncLDSDMA(IntrinsicID));

  // Rebuild memory operands: the instruction both loads from global memory
  // and stores into LDS, so attach one MMO for each side.
  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(4).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
  auto F = LoadMMO->getFlags() &
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());
  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), Align(4));

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  return true;
}
3792
3793bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3794 Intrinsic::ID IID) const {
3795 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3796 unsigned Opc =
3797 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3798 int NumGroups = 4;
3799
3800 // A lamda function to check whether an operand is a vector of all 0s.
3801 const auto isAllZeros = [&](MachineOperand &Opnd) {
3802 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3803 if (!DefMI)
3804 return false;
3805 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3806 };
3807
3808 // Use _D2 version if both group 2 and 3 are zero-initialized.
3809 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3810 NumGroups = 2;
3811 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3812 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3813 }
3814
3815 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3816 // for now because all existing targets only support up to 4 groups.
3817 MachineBasicBlock *MBB = MI.getParent();
3818 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3819 .add(MI.getOperand(1)) // D# group 0
3820 .add(MI.getOperand(2)); // D# group 1
3821
3822 if (NumGroups >= 4) { // Has at least 4 groups
3823 MIB.add(MI.getOperand(3)) // D# group 2
3824 .add(MI.getOperand(4)); // D# group 3
3825 }
3826
3827 MIB.addImm(0) // r128
3828 .add(MI.getOperand(6)); // cpol
3829
3830 MI.eraseFromParent();
3831 return true;
3832}
3833
3834bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3835 MachineInstr &MI) const {
3836 unsigned OpcodeOpIdx =
3837 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3838 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3839 MI.removeOperand(OpcodeOpIdx);
3840 MI.addImplicitDefUseOperands(*MI.getMF());
3841 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3842 return true;
3843}
3844
3845// FIXME: This should be removed and let the patterns select. We just need the
3846// AGPR/VGPR combination versions.
/// Select an SMFMAC (sparse matrix fused multiply-accumulate) intrinsic by
/// mapping the intrinsic ID onto the matching V_SMFMAC_* VALU opcode and
/// re-shuffling operands into the instruction's expected order.
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  unsigned Opc;
  // One-to-one table from intrinsic ID to VALU opcode.
  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
    break;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
    break;
  default:
    llvm_unreachable("unhandled smfmac intrinsic");
  }

  // Copy VDst_In by value: it is removed from the instruction below and
  // re-appended at the end, where the selected opcode expects it.
  auto VDst_In = MI.getOperand(4);

  MI.setDesc(TII.get(Opc));
  MI.removeOperand(4); // VDst_In
  MI.removeOperand(1); // Intrinsic ID
  MI.addOperand(VDst_In); // Readd VDst_In to the end
  MI.addImplicitDefUseOperands(*MI.getMF());
  const MCInstrDesc &MCID = MI.getDesc();
  // Propagate the early-clobber constraint from the opcode description onto
  // the destination operand.
  if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
    MI.getOperand(0).setIsEarlyClobber(true);
  }
  return true;
}
3951
/// Select amdgcn.permlane16.swap / amdgcn.permlane32.swap into the
/// corresponding V_PERMLANE*_SWAP_B32 instruction, after checking subtarget
/// support for the requested variant.
bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
      !Subtarget->hasPermlane16Swap())
    return false;
  if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
      !Subtarget->hasPermlane32Swap())
    return false;

  unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                        ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                        : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

  // Drop the intrinsic ID operand, rewrite the descriptor, and add an
  // implicit EXEC use.
  MI.removeOperand(2);
  MI.setDesc(TII.get(Opcode));
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &FI = MI.getOperand(4);

  constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
  return true;
}
3975
3976bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3977 Register DstReg = MI.getOperand(0).getReg();
3978 Register SrcReg = MI.getOperand(1).getReg();
3979 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3980 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3981 MachineBasicBlock *MBB = MI.getParent();
3982 const DebugLoc &DL = MI.getDebugLoc();
3983
3984 if (IsVALU) {
3985 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3986 .addImm(Subtarget->getWavefrontSizeLog2())
3987 .addReg(SrcReg);
3988 } else {
3989 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3990 .addReg(SrcReg)
3991 .addImm(Subtarget->getWavefrontSizeLog2())
3992 .setOperandDead(3); // Dead scc
3993 }
3994
3995 const TargetRegisterClass &RC =
3996 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3997 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3998 return false;
3999
4000 MI.eraseFromParent();
4001 return true;
4002}
4003
/// Select a wave shuffle intrinsic into DS_BPERMUTE_B32-based code.
/// On targets with wave-wide bpermute this is a single shift + bpermute;
/// otherwise (wave64 without wave-wide bpermute) both 32-lane halves are
/// permuted and the correct half is picked per lane with V_CNDMASK.
bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
    MachineInstr &MI) const {
  assert(MI.getNumOperands() == 4);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(2).getReg();
  Register IdxReg = MI.getOperand(3).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  unsigned DstSize = DstTy.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);

  // Only 32-bit scalar shuffles are handled here.
  if (DstTy != LLT::scalar(32))
    return false;

  if (!Subtarget->supportsBPermute())
    return false;

  // If we can bpermute across the whole wave, then just do that
  if (Subtarget->supportsWaveWideBPermute()) {
    // ds_bpermute addresses bytes, so scale the lane index by 4.
    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
        .addImm(2)
        .addReg(IdxReg);

    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
        .addReg(ShiftIdxReg)
        .addReg(ValReg)
        .addImm(0);
  } else {
    // Otherwise, we need to make use of whole wave mode
    assert(Subtarget->isWave64());

    // Set inactive lanes to poison
    Register UndefValReg =
        MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);

    Register UndefExecReg = MRI->createVirtualRegister(
        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);

    Register PoisonValReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
        .addImm(0)
        .addReg(ValReg)
        .addImm(0)
        .addReg(UndefValReg)
        .addReg(UndefExecReg);

    // ds_bpermute requires index to be multiplied by 4
    Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
        .addImm(2)
        .addReg(IdxReg);

    Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
        .addImm(0)
        .addReg(ShiftIdxReg)
        .addImm(0)
        .addReg(UndefValReg)
        .addReg(UndefExecReg);

    // Get permutation of each half, then we'll select which one to use
    Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
        .addReg(PoisonIdxReg)
        .addReg(PoisonValReg)
        .addImm(0);

    // Swap the two 32-lane halves so each half can read the other's values.
    Register SwappedValReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
        .addReg(PoisonValReg);

    Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
        .addReg(PoisonIdxReg)
        .addReg(SwappedValReg)
        .addImm(0);

    Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
        .addReg(OppSidePermReg);

    // Select which side to take the permute from
    // We can get away with only using mbcnt_lo here since we're only
    // trying to detect which side of 32 each lane is on, and mbcnt_lo
    // returns 32 for lanes 32-63.
    Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
        .addImm(-1)
        .addImm(0);

    // Bit 5 of (lane ^ index) says whether the requested lane is in the
    // other half.
    Register XORReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
        .addReg(ThreadIDReg)
        .addReg(PoisonIdxReg);

    Register ANDReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
        .addReg(XORReg)
        .addImm(32);

    Register CompareReg = MRI->createVirtualRegister(
        TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
        .addReg(ANDReg)
        .addImm(0);

    // Finally do the selection
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(WWMSwapPermReg)
        .addImm(0)
        .addReg(SameSidePermReg)
        .addReg(CompareReg);
  }

  MI.eraseFromParent();
  return true;
}
4130
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
// Recursively walks a G_AND/G_OR/G_XOR tree rooted at R, collecting up to
// three distinct source registers in Src and computing the 8-entry truth
// table over those sources. Returns {0, 0} if R does not match.
static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
                                              const MachineRegisterInfo &MRI) {
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  // Computes the truth-table bits contributed by operand Op, registering Op
  // in Src if there is still room. Fails (returns false) if Op cannot be
  // expressed over at most three sources.
  auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
    // Define truth table given Src0, Src1, Src2 bits permutations:
    // 0 0 0
    // 0 0 1
    // 0 1 0
    // 0 1 1
    // 1 0 0
    // 1 0 1
    // 1 1 0
    // 1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    // Constant all-ones / all-zeros operands need no source slot.
    if (mi_match(Op, MRI, m_AllOnesInt())) {
      Bits = 0xff;
      return true;
    }
    if (mi_match(Op, MRI, m_ZeroInt())) {
      Bits = 0;
      return true;
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace parent operator
      if (Src[I] == R) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      Register LHS;
      if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
        for (unsigned I = 0; I < Src.size(); ++I) {
          if (Src[I] == LHS) {
            Bits = ~SrcBits[I];
            return true;
          }
        }
      }

      return false;
    }

    // Claim the next free source slot for this operand.
    Bits = SrcBits[Src.size()];
    Src.push_back(Op);
    return true;
  };

  MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
    Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);

    // Back up Src so a failed match does not leave partially-registered
    // operands behind.
    SmallVector<Register, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = std::move(Backup);
      return std::make_pair(0, 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
    auto Op = BitOp3_Op(LHS, Src, MRI);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(RHS, Src, MRI);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(0, 0);
  }

  // Combine the operand truth tables with this node's boolean operation.
  uint8_t TTbl;
  switch (MI->getOpcode()) {
  case TargetOpcode::G_AND:
    TTbl = LHSBits & RHSBits;
    break;
  case TargetOpcode::G_OR:
    TTbl = LHSBits | RHSBits;
    break;
  case TargetOpcode::G_XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  return std::make_pair(NumOpcodes + 1, TTbl);
}
4247
/// Try to fold a tree of G_AND/G_OR/G_XOR rooted at \p MI into a single
/// V_BITOP3_B32/B16 instruction using the truth table computed by
/// BitOp3_Op. Only VALU (VGPR-bank) results are handled.
bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
  if (!Subtarget->hasBitOp3Insts())
    return false;

  Register DstReg = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (!IsVALU)
    return false;

  uint8_t TTbl;
  unsigned NumOpcodes;

  std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);

  // Src.empty() case can happen if all operands are all zero or all ones.
  // Normally it shall be optimized out before reaching this.
  if (NumOpcodes < 2 || Src.empty())
    return false;

  const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
  if (NumOpcodes == 2 && IsB32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // asm more readable. This cannot be modeled with AddedComplexity because
    // selector does not know how many operations did we match.
    if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
        mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
        mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
      return false;
  } else if (NumOpcodes < 4) {
    // For a uniform case threshold should be higher to account for moves
    // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
    // in SGPRs and a readtfirstlane after.
    return false;
  }

  // Pick the 32-bit or 16-bit encoding; true16 targets have dedicated
  // gfx1250 forms.
  unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
  if (!IsB32 && STI.hasTrue16BitInsts())
    Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
                                   : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
  unsigned CBL = STI.getConstantBusLimit(Opc);
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  // Copy SGPR sources beyond the constant bus limit into VGPRs.
  for (unsigned I = 0; I < Src.size(); ++I) {
    const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
    if (RB->getID() != AMDGPU::SGPRRegBankID)
      continue;
    if (CBL > 0) {
      --CBL;
      continue;
    }
    Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
        .addReg(Src[I]);
    Src[I] = NewReg;
  }

  // Last operand can be ignored, turning a ternary operation into a binary.
  // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
  // 'c' with 'a' here without changing the answer. In some pathological
  // cases it should be possible to get an operation with a single operand
  // too if optimizer would not catch it.
  while (Src.size() < 3)
    Src.push_back(Src[0]);

  // The B16 form interleaves source-modifier immediates and a trailing
  // op_sel operand.
  auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
  if (!IsB32)
    MIB.addImm(0); // src_mod0
  MIB.addReg(Src[0]);
  if (!IsB32)
    MIB.addImm(0); // src_mod1
  MIB.addReg(Src[1]);
  if (!IsB32)
    MIB.addImm(0); // src_mod2
  MIB.addReg(Src[2])
     .addImm(TTbl);
  if (!IsB32)
    MIB.addImm(0); // op_sel

  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  MI.eraseFromParent();

  return true;
}
4334
4335bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4336 Register SrcReg = MI.getOperand(0).getReg();
4337 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4338 return false;
4339
4340 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4341 Register SP =
4342 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4343 Register WaveAddr = getWaveAddress(DefMI);
4344 MachineBasicBlock *MBB = MI.getParent();
4345 const DebugLoc &DL = MI.getDebugLoc();
4346
4347 if (!WaveAddr) {
4348 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4349 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4350 .addReg(SrcReg)
4351 .addImm(Subtarget->getWavefrontSizeLog2())
4352 .setOperandDead(3); // Dead scc
4353 }
4354
4355 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4356 .addReg(WaveAddr);
4357
4358 MI.eraseFromParent();
4359 return true;
4360}
4361
4363
  // Top-level selection entry point: dispatch on the generic opcode. Most
  // cases try the TableGen-generated selectImpl() first and fall back to a
  // hand-written selector.
  if (!I.isPreISelOpcode()) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    // BITOP3 folding is tried before patterns since it matches whole trees.
    if (selectBITOP3(I))
      return true;
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_AND_OR_XOR(I);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_PTR_ADD:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32:
    return selectG_AMDGPU_MAD_64_32(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
  case TargetOpcode::G_FREEZE:
    // These are no-ops at the register level and select as plain copies.
    return selectCOPY(I);
  case TargetOpcode::G_FNEG:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FNEG(I);
  case TargetOpcode::G_FABS:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectG_FABS(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return selectG_BUILD_VECTOR(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
  case TargetOpcode::G_INTRINSIC_CONVERGENT:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
  case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP:
    if (selectG_ICMP_or_FCMP(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
  case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
  case TargetOpcode::G_ATOMICRMW_USUB_COND:
  case TargetOpcode::G_ATOMICRMW_USUB_SAT:
  case TargetOpcode::G_ATOMICRMW_FADD:
  case TargetOpcode::G_ATOMICRMW_FMIN:
  case TargetOpcode::G_ATOMICRMW_FMAX:
    return selectG_LOAD_STORE_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_SEXT_INREG:
    // This is a workaround. For extension from type i1, `selectImpl()` uses
    // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
    // i1 can only be hold in a SGPR class.
    if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
        selectImpl(I, *CoverageInfo))
      return true;
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_FPEXT:
    if (selectG_FPEXT(I))
      return true;
    return selectImpl(I, *CoverageInfo);
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTRMASK:
    return selectG_PTRMASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    const AMDGPU::ImageDimIntrinsicInfo *Intr =
    assert(Intr && "not an image intrinsic with image pseudo");
    return selectImageIntrinsic(I, Intr);
  }
  case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
  case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
  case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
    return selectBVHIntersectRayIntrinsic(I);
  case AMDGPU::G_SBFX:
  case AMDGPU::G_UBFX:
    return selectG_SBFX_UBFX(I);
  case AMDGPU::G_SI_CALL:
    I.setDesc(TII.get(AMDGPU::SI_CALL));
    return true;
  case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
    return selectWaveAddress(I);
  case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
    I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
    return true;
  }
  case AMDGPU::G_STACKRESTORE:
    return selectStackRestore(I);
  case AMDGPU::G_PHI:
    return selectPHI(I);
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    return selectCOPY_SCC_VCC(I);
  case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
    return selectCOPY_VCC_SCC(I);
  case AMDGPU::G_AMDGPU_READANYLANE:
    return selectReadAnyLane(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
  default:
    return selectImpl(I, *CoverageInfo);
  }
  return false;
}
4526
/// Complex pattern matcher for vcsrc operands: accepts the root operand
/// unchanged and renders it back onto the selected instruction.
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};

}
4534
4535std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4536 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4537 unsigned Mods = 0;
4538 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4539
4540 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4541 Src = MI->getOperand(1).getReg();
4542 Mods |= SISrcMods::NEG;
4543 MI = getDefIgnoringCopies(Src, *MRI);
4544 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4545 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4546 // denormal mode, but we're implicitly canonicalizing in a source operand.
4547 const ConstantFP *LHS =
4548 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4549 if (LHS && LHS->isZero()) {
4550 Mods |= SISrcMods::NEG;
4551 Src = MI->getOperand(2).getReg();
4552 }
4553 }
4554
4555 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4556 Src = MI->getOperand(1).getReg();
4557 Mods |= SISrcMods::ABS;
4558 }
4559
4560 if (OpSel)
4561 Mods |= SISrcMods::OP_SEL_0;
4562
4563 return std::pair(Src, Mods);
4564}
4565
4566Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4567 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4568 bool ForceVGPR) const {
4569 if ((Mods != 0 || ForceVGPR) &&
4570 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4571
4572 // If we looked through copies to find source modifiers on an SGPR operand,
4573 // we now have an SGPR register source. To avoid potentially violating the
4574 // constant bus restriction, we need to insert a copy to a VGPR.
4575 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4576 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4577 TII.get(AMDGPU::COPY), VGPRSrc)
4578 .addReg(Src);
4579 Src = VGPRSrc;
4580 }
4581
4582 return Src;
4583}
4584
4585///
4586/// This will select either an SGPR or VGPR operand and will save us from
4587/// having to write an extra tablegen pattern.
4589AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4590 return {{
4591 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4592 }};
4593}
4594
4596AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4597 Register Src;
4598 unsigned Mods;
4599 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4600
4601 return {{
4602 [=](MachineInstrBuilder &MIB) {
4603 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4604 },
4605 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4606 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4607 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4608 }};
4609}
4610
4612AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4613 Register Src;
4614 unsigned Mods;
4615 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4616 /*IsCanonicalizing=*/true,
4617 /*AllowAbs=*/false);
4618
4619 return {{
4620 [=](MachineInstrBuilder &MIB) {
4621 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4622 },
4623 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4624 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4625 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4626 }};
4627}
4628
4630AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4631 return {{
4632 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4633 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4634 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4635 }};
4636}
4637
4639AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4640 Register Src;
4641 unsigned Mods;
4642 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4643
4644 return {{
4645 [=](MachineInstrBuilder &MIB) {
4646 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4647 },
4648 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4649 }};
4650}
4651
4653AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4654 MachineOperand &Root) const {
4655 Register Src;
4656 unsigned Mods;
4657 std::tie(Src, Mods) =
4658 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4659
4660 return {{
4661 [=](MachineInstrBuilder &MIB) {
4662 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4663 },
4664 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4665 }};
4666}
4667
4669AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4670 Register Src;
4671 unsigned Mods;
4672 std::tie(Src, Mods) =
4673 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4674 /*AllowAbs=*/false);
4675
4676 return {{
4677 [=](MachineInstrBuilder &MIB) {
4678 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4679 },
4680 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4681 }};
4682}
4683
4685AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4686 Register Reg = Root.getReg();
4687 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4688 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4689 return {};
4690 return {{
4691 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4692 }};
4693}
4694
// Tracks how a register reached while walking a def chain relates to the
// original source operand. The *_START/*_END aliases define the contiguous
// ranges tested by SearchOptions::checkOptions, so enumerator order matters.
enum class SrcStatus {
  IS_SAME,
  IS_UPPER_HALF,
  IS_LOWER_HALF,
  IS_UPPER_HALF_NEG,
  // This means current op = [op_upper, op_lower] and src = -op_lower.
  IS_LOWER_HALF_NEG,
  IS_HI_NEG,
  // This means current op = [op_upper, op_lower] and src = [op_upper,
  // -op_lower].
  IS_LO_NEG,
  IS_BOTH_NEG,
  INVALID,
  NEG_START = IS_UPPER_HALF_NEG,
  NEG_END = IS_BOTH_NEG,
  HALF_START = IS_UPPER_HALF,
  HALF_END = IS_LOWER_HALF_NEG
};
4713/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4714static bool isTruncHalf(const MachineInstr *MI,
4715 const MachineRegisterInfo &MRI) {
4716 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4717 return false;
4718
4719 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4720 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4721 return DstSize * 2 == SrcSize;
4722}
4723
4724/// Test if the MI is logic shift right with half bits,
4725/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4726static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4727 if (MI->getOpcode() != AMDGPU::G_LSHR)
4728 return false;
4729
4730 Register ShiftSrc;
4731 std::optional<ValueAndVReg> ShiftAmt;
4732 if (mi_match(MI->getOperand(0).getReg(), MRI,
4733 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4734 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4735 unsigned Shift = ShiftAmt->Value.getZExtValue();
4736 return Shift * 2 == SrcSize;
4737 }
4738 return false;
4739}
4740
4741/// Test if the MI is shift left with half bits,
4742/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4743static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4744 if (MI->getOpcode() != AMDGPU::G_SHL)
4745 return false;
4746
4747 Register ShiftSrc;
4748 std::optional<ValueAndVReg> ShiftAmt;
4749 if (mi_match(MI->getOperand(0).getReg(), MRI,
4750 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4751 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4752 unsigned Shift = ShiftAmt->Value.getZExtValue();
4753 return Shift * 2 == SrcSize;
4754 }
4755 return false;
4756}
4757
4758/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4759static bool isUnmergeHalf(const MachineInstr *MI,
4760 const MachineRegisterInfo &MRI) {
4761 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4762 return false;
4763 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4764 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4765}
4766
4768
4770 const MachineRegisterInfo &MRI) {
4771 LLT OpTy = MRI.getType(Reg);
4772 if (OpTy.isScalar())
4773 return TypeClass::SCALAR;
4774 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4777}
4778
4780 const MachineRegisterInfo &MRI) {
4781 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4782 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4783 return SrcStatus::INVALID;
4784
4785 switch (S) {
4786 case SrcStatus::IS_SAME:
4787 if (NegType == TypeClass::VECTOR_OF_TWO) {
4788 // Vector of 2:
4789 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4790 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4791 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4792 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4794 }
4795 if (NegType == TypeClass::SCALAR) {
4796 // Scalar:
4797 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4798 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4799 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4800 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4801 return SrcStatus::IS_HI_NEG;
4802 }
4803 break;
4805 if (NegType == TypeClass::VECTOR_OF_TWO) {
4806 // Vector of 2:
4807 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4808 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4809 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4810 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4811 return SrcStatus::IS_LO_NEG;
4812 }
4813 if (NegType == TypeClass::SCALAR) {
4814 // Scalar:
4815 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4816 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4817 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4818 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4819 return SrcStatus::IS_SAME;
4820 }
4821 break;
4823 if (NegType == TypeClass::VECTOR_OF_TWO) {
4824 // Vector of 2:
4825 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4826 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4827 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4828 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4829 return SrcStatus::IS_HI_NEG;
4830 }
4831 if (NegType == TypeClass::SCALAR) {
4832 // Scalar:
4833 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4834 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4835 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4836 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4838 }
4839 break;
4841 if (NegType == TypeClass::VECTOR_OF_TWO) {
4842 // Vector of 2:
4843 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4844 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4845 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4846 // [SrcHi, SrcLo] = [OpHi, OpLo]
4847 return SrcStatus::IS_SAME;
4848 }
4849 if (NegType == TypeClass::SCALAR) {
4850 // Scalar:
4851 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4852 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4853 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4854 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4855 return SrcStatus::IS_LO_NEG;
4856 }
4857 break;
4859 // Vector of 2:
4860 // Src = CurrUpper
4861 // Curr = [CurrUpper, CurrLower]
4862 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4863 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4864 // Src = -OpUpper
4865 //
4866 // Scalar:
4867 // Src = CurrUpper
4868 // Curr = [CurrUpper, CurrLower]
4869 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4870 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4871 // Src = -OpUpper
4874 if (NegType == TypeClass::VECTOR_OF_TWO) {
4875 // Vector of 2:
4876 // Src = CurrLower
4877 // Curr = [CurrUpper, CurrLower]
4878 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4879 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4880 // Src = -OpLower
4882 }
4883 if (NegType == TypeClass::SCALAR) {
4884 // Scalar:
4885 // Src = CurrLower
4886 // Curr = [CurrUpper, CurrLower]
4887 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4888 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4889 // Src = OpLower
4891 }
4892 break;
4894 // Vector of 2:
4895 // Src = -CurrUpper
4896 // Curr = [CurrUpper, CurrLower]
4897 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4898 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4899 // Src = -(-OpUpper) = OpUpper
4900 //
4901 // Scalar:
4902 // Src = -CurrUpper
4903 // Curr = [CurrUpper, CurrLower]
4904 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4905 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4906 // Src = -(-OpUpper) = OpUpper
4909 if (NegType == TypeClass::VECTOR_OF_TWO) {
4910 // Vector of 2:
4911 // Src = -CurrLower
4912 // Curr = [CurrUpper, CurrLower]
4913 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4914 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4915 // Src = -(-OpLower) = OpLower
4917 }
4918 if (NegType == TypeClass::SCALAR) {
4919 // Scalar:
4920 // Src = -CurrLower
4921 // Curr = [CurrUpper, CurrLower]
4922 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4923 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4924 // Src = -OpLower
4926 }
4927 break;
4928 default:
4929 break;
4930 }
4931 llvm_unreachable("unexpected SrcStatus & NegType combination");
4932}
4933
4934static std::optional<std::pair<Register, SrcStatus>>
4935calcNextStatus(std::pair<Register, SrcStatus> Curr,
4936 const MachineRegisterInfo &MRI) {
4937 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4938
4939 unsigned Opc = MI->getOpcode();
4940
4941 // Handle general Opc cases.
4942 switch (Opc) {
4943 case AMDGPU::G_BITCAST:
4944 return std::optional<std::pair<Register, SrcStatus>>(
4945 {MI->getOperand(1).getReg(), Curr.second});
4946 case AMDGPU::COPY:
4947 if (MI->getOperand(1).getReg().isPhysical())
4948 return std::nullopt;
4949 return std::optional<std::pair<Register, SrcStatus>>(
4950 {MI->getOperand(1).getReg(), Curr.second});
4951 case AMDGPU::G_FNEG: {
4952 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4953 if (Stat == SrcStatus::INVALID)
4954 return std::nullopt;
4955 return std::optional<std::pair<Register, SrcStatus>>(
4956 {MI->getOperand(1).getReg(), Stat});
4957 }
4958 default:
4959 break;
4960 }
4961
4962 // Calc next Stat from current Stat.
4963 switch (Curr.second) {
4964 case SrcStatus::IS_SAME:
4965 if (isTruncHalf(MI, MRI))
4966 return std::optional<std::pair<Register, SrcStatus>>(
4967 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4968 else if (isUnmergeHalf(MI, MRI)) {
4969 if (Curr.first == MI->getOperand(0).getReg())
4970 return std::optional<std::pair<Register, SrcStatus>>(
4971 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4972 return std::optional<std::pair<Register, SrcStatus>>(
4973 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4974 }
4975 break;
4977 if (isTruncHalf(MI, MRI)) {
4978 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4979 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4980 // = [OpLowerHi, OpLowerLo]
4981 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4982 // = [-OpLowerHi, OpLowerLo]
4983 // = -OpLower
4984 return std::optional<std::pair<Register, SrcStatus>>(
4985 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4986 }
4987 if (isUnmergeHalf(MI, MRI)) {
4988 if (Curr.first == MI->getOperand(0).getReg())
4989 return std::optional<std::pair<Register, SrcStatus>>(
4990 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4991 return std::optional<std::pair<Register, SrcStatus>>(
4992 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4993 }
4994 break;
4996 if (isShlHalf(MI, MRI))
4997 return std::optional<std::pair<Register, SrcStatus>>(
4998 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4999 break;
5001 if (isLshrHalf(MI, MRI))
5002 return std::optional<std::pair<Register, SrcStatus>>(
5003 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5004 break;
5006 if (isShlHalf(MI, MRI))
5007 return std::optional<std::pair<Register, SrcStatus>>(
5008 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5009 break;
5011 if (isLshrHalf(MI, MRI))
5012 return std::optional<std::pair<Register, SrcStatus>>(
5013 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5014 break;
5015 default:
5016 break;
5017 }
5018 return std::nullopt;
5019}
5020
5021/// This is used to control valid status that current MI supports. For example,
5022/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5023/// bit on VOP3P.
5024/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5025/// for different MI on different arch
5027private:
5028 bool HasNeg = false;
5029 // Assume all complex pattern of VOP3P have opsel.
5030 bool HasOpsel = true;
5031
5032public:
5034 const MachineInstr *MI = MRI.getVRegDef(Reg);
5035 unsigned Opc = MI->getOpcode();
5036
5037 if (Opc == TargetOpcode::G_INTRINSIC) {
5038 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5039 // Only float point intrinsic has neg & neg_hi bits.
5040 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5041 HasNeg = true;
5043 // Keep same for generic op.
5044 HasNeg = true;
5045 }
5046 }
5047 bool checkOptions(SrcStatus Stat) const {
5048 if (!HasNeg &&
5049 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5050 return false;
5051 }
5052 if (!HasOpsel &&
5053 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5054 return false;
5055 }
5056 return true;
5057 }
5058};
5059
5062 int MaxDepth = 3) {
5063 int Depth = 0;
5064 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5066
5067 while (Depth <= MaxDepth && Curr.has_value()) {
5068 Depth++;
5069 if (SO.checkOptions(Curr.value().second))
5070 Statlist.push_back(Curr.value());
5071 Curr = calcNextStatus(Curr.value(), MRI);
5072 }
5073
5074 return Statlist;
5075}
5076
5077static std::pair<Register, SrcStatus>
5079 int MaxDepth = 3) {
5080 int Depth = 0;
5081 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5082 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5083
5084 while (Depth <= MaxDepth && Curr.has_value()) {
5085 Depth++;
5086 SrcStatus Stat = Curr.value().second;
5087 if (SO.checkOptions(Stat)) {
5088 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5090 LastSameOrNeg = Curr.value();
5091 }
5092 Curr = calcNextStatus(Curr.value(), MRI);
5093 }
5094
5095 return LastSameOrNeg;
5096}
5097
5098static bool isSameBitWidth(Register Reg1, Register Reg2,
5099 const MachineRegisterInfo &MRI) {
5100 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5101 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5102 return Width1 == Width2;
5103}
5104
5105static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5106 // SrcStatus::IS_LOWER_HALF remain 0.
5107 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5108 Mods ^= SISrcMods::NEG_HI;
5109 Mods |= SISrcMods::OP_SEL_1;
5110 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5111 Mods |= SISrcMods::OP_SEL_1;
5112 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5113 Mods ^= SISrcMods::NEG_HI;
5114 else if (HiStat == SrcStatus::IS_HI_NEG)
5115 Mods ^= SISrcMods::NEG_HI;
5116
5117 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5118 Mods ^= SISrcMods::NEG;
5119 Mods |= SISrcMods::OP_SEL_0;
5120 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5121 Mods |= SISrcMods::OP_SEL_0;
5122 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5123 Mods |= SISrcMods::NEG;
5124 else if (LoStat == SrcStatus::IS_HI_NEG)
5125 Mods ^= SISrcMods::NEG;
5126
5127 return Mods;
5128}
5129
5130static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5131 Register RootReg, const SIInstrInfo &TII,
5132 const MachineRegisterInfo &MRI) {
5133 auto IsHalfState = [](SrcStatus S) {
5136 };
5137 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5138 IsHalfState(HiStat);
5139}
5140
5141std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5142 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5143 unsigned Mods = 0;
5144 // No modification if Root type is not form of <2 x Type>.
5145 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5146 Mods |= SISrcMods::OP_SEL_1;
5147 return {RootReg, Mods};
5148 }
5149
5150 SearchOptions SO(RootReg, MRI);
5151
5152 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5153
5154 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5156 else if (Stat.second == SrcStatus::IS_HI_NEG)
5157 Mods ^= SISrcMods::NEG_HI;
5158 else if (Stat.second == SrcStatus::IS_LO_NEG)
5159 Mods ^= SISrcMods::NEG;
5160
5161 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5162
5163 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5164 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5165 Mods |= SISrcMods::OP_SEL_1;
5166 return {Stat.first, Mods};
5167 }
5168
5170 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5171
5172 if (StatlistHi.empty()) {
5173 Mods |= SISrcMods::OP_SEL_1;
5174 return {Stat.first, Mods};
5175 }
5176
5178 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5179
5180 if (StatlistLo.empty()) {
5181 Mods |= SISrcMods::OP_SEL_1;
5182 return {Stat.first, Mods};
5183 }
5184
5185 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5186 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5187 if (StatlistHi[I].first == StatlistLo[J].first &&
5188 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5189 StatlistHi[I].first, RootReg, TII, MRI))
5190 return {StatlistHi[I].first,
5191 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5192 }
5193 }
5194 // Packed instructions do not have abs modifiers.
5195 Mods |= SISrcMods::OP_SEL_1;
5196
5197 return {Stat.first, Mods};
5198}
5199
5200// Removed unused function `getAllKindImm` to eliminate dead code.
5201
5202static bool checkRB(Register Reg, unsigned int RBNo,
5203 const AMDGPURegisterBankInfo &RBI,
5204 const MachineRegisterInfo &MRI,
5205 const TargetRegisterInfo &TRI) {
5206 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5207 return RB->getID() == RBNo;
5208}
5209
5210// This function is used to get the correct register bank for returned reg.
5211// Assume:
5212// 1. VOP3P is always legal for VGPR.
5213// 2. RootOp's regbank is legal.
5214// Thus
5215// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5216// 2. If RootOp is VGPR, then NewOp must be VGPR.
5218 const AMDGPURegisterBankInfo &RBI,
5220 const TargetRegisterInfo &TRI,
5221 const SIInstrInfo &TII) {
5222 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5223 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5224 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5225 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5226 return NewReg;
5227
5228 MachineInstr *MI = MRI.getVRegDef(RootReg);
5229 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5230 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5231 return RootReg;
5232 }
5233
5234 MachineBasicBlock *BB = MI->getParent();
5235 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5236
5238 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5239 .addReg(NewReg);
5240
5241 // Only accept VGPR.
5242 return MIB->getOperand(0).getReg();
5243}
5244
5246AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5247 bool IsDOT) const {
5248 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5249 Register Reg;
5250 unsigned Mods;
5251 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5252
5253 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5254 return {{
5255 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5256 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5257 }};
5258}
5259
5261AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5262
5263 return selectVOP3PRetHelper(Root);
5264}
5265
5267AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5268
5269 return selectVOP3PRetHelper(Root, true);
5270}
5271
5273AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5274 MachineOperand &Root) const {
5275 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5276 "expected i1 value");
5277 unsigned Mods = SISrcMods::OP_SEL_1;
5278 if (Root.getImm() != 0)
5279 Mods |= SISrcMods::OP_SEL_0;
5280
5281 return {{
5282 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5283 }};
5284}
5285
5287 MachineInstr *InsertPt,
5288 MachineRegisterInfo &MRI) {
5289 const TargetRegisterClass *DstRegClass;
5290 switch (Elts.size()) {
5291 case 8:
5292 DstRegClass = &AMDGPU::VReg_256RegClass;
5293 break;
5294 case 4:
5295 DstRegClass = &AMDGPU::VReg_128RegClass;
5296 break;
5297 case 2:
5298 DstRegClass = &AMDGPU::VReg_64RegClass;
5299 break;
5300 default:
5301 llvm_unreachable("unhandled Reg sequence size");
5302 }
5303
5304 MachineIRBuilder B(*InsertPt);
5305 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5306 .addDef(MRI.createVirtualRegister(DstRegClass));
5307 for (unsigned i = 0; i < Elts.size(); ++i) {
5308 MIB.addReg(Elts[i]);
5310 }
5311 return MIB->getOperand(0).getReg();
5312}
5313
5314static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5316 MachineInstr *InsertPt,
5317 MachineRegisterInfo &MRI) {
5318 if (ModOpcode == TargetOpcode::G_FNEG) {
5319 Mods |= SISrcMods::NEG;
5320 // Check if all elements also have abs modifier
5321 SmallVector<Register, 8> NegAbsElts;
5322 for (auto El : Elts) {
5323 Register FabsSrc;
5324 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5325 break;
5326 NegAbsElts.push_back(FabsSrc);
5327 }
5328 if (Elts.size() != NegAbsElts.size()) {
5329 // Neg
5330 Src = buildRegSequence(Elts, InsertPt, MRI);
5331 } else {
5332 // Neg and Abs
5333 Mods |= SISrcMods::NEG_HI;
5334 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5335 }
5336 } else {
5337 assert(ModOpcode == TargetOpcode::G_FABS);
5338 // Abs
5339 Mods |= SISrcMods::NEG_HI;
5340 Src = buildRegSequence(Elts, InsertPt, MRI);
5341 }
5342}
5343
5345AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5346 Register Src = Root.getReg();
5347 unsigned Mods = SISrcMods::OP_SEL_1;
5349
5350 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5351 assert(BV->getNumSources() > 0);
5352 // Based on first element decide which mod we match, neg or abs
5353 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5354 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5355 ? AMDGPU::G_FNEG
5356 : AMDGPU::G_FABS;
5357 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5358 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5359 if (ElF32->getOpcode() != ModOpcode)
5360 break;
5361 EltsF32.push_back(ElF32->getOperand(1).getReg());
5362 }
5363
5364 // All elements had ModOpcode modifier
5365 if (BV->getNumSources() == EltsF32.size()) {
5366 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5367 *MRI);
5368 }
5369 }
5370
5371 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5372 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5373}
5374
5376AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5377 Register Src = Root.getReg();
5378 unsigned Mods = SISrcMods::OP_SEL_1;
5379 SmallVector<Register, 8> EltsV2F16;
5380
5381 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5382 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5383 Register FNegSrc;
5384 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5385 break;
5386 EltsV2F16.push_back(FNegSrc);
5387 }
5388
5389 // All elements had ModOpcode modifier
5390 if (CV->getNumSources() == EltsV2F16.size()) {
5391 Mods |= SISrcMods::NEG;
5392 Mods |= SISrcMods::NEG_HI;
5393 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5394 }
5395 }
5396
5397 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5398 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5399}
5400
5402AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5403 Register Src = Root.getReg();
5404 unsigned Mods = SISrcMods::OP_SEL_1;
5405 SmallVector<Register, 8> EltsV2F16;
5406
5407 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5408 assert(CV->getNumSources() > 0);
5409 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5410 // Based on first element decide which mod we match, neg or abs
5411 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5412 ? AMDGPU::G_FNEG
5413 : AMDGPU::G_FABS;
5414
5415 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5416 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5417 if (ElV2F16->getOpcode() != ModOpcode)
5418 break;
5419 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5420 }
5421
5422 // All elements had ModOpcode modifier
5423 if (CV->getNumSources() == EltsV2F16.size()) {
5424 MachineIRBuilder B(*Root.getParent());
5425 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5426 *MRI);
5427 }
5428 }
5429
5430 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5431 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5432}
5433
5435AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5436 std::optional<FPValueAndVReg> FPValReg;
5437 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5438 if (TII.isInlineConstant(FPValReg->Value)) {
5439 return {{[=](MachineInstrBuilder &MIB) {
5440 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5441 }}};
5442 }
5443 // Non-inlineable splat floats should not fall-through for integer immediate
5444 // checks.
5445 return {};
5446 }
5447
5448 APInt ICst;
5449 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5450 if (TII.isInlineConstant(ICst)) {
5451 return {
5452 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5453 }
5454 }
5455
5456 return {};
5457}
5458
5460AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5461 Register Src =
5462 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5463 unsigned Key = 0;
5464
5465 Register ShiftSrc;
5466 std::optional<ValueAndVReg> ShiftAmt;
5467 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5468 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5469 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5470 Key = ShiftAmt->Value.getZExtValue() / 8;
5471 Src = ShiftSrc;
5472 }
5473
5474 return {{
5475 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5476 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5477 }};
5478}
5479
5481AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5482
5483 Register Src =
5484 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5485 unsigned Key = 0;
5486
5487 Register ShiftSrc;
5488 std::optional<ValueAndVReg> ShiftAmt;
5489 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5490 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5491 ShiftAmt->Value.getZExtValue() == 16) {
5492 Src = ShiftSrc;
5493 Key = 1;
5494 }
5495
5496 return {{
5497 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5498 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5499 }};
5500}
5501
5503AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5504 Register Src =
5505 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5506 unsigned Key = 0;
5507
5508 Register S32 = matchZeroExtendFromS32(Src);
5509 if (!S32)
5510 S32 = matchAnyExtendFromS32(Src);
5511
5512 if (S32) {
5513 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5514 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5515 assert(Def->getNumOperands() == 3);
5516 Register DstReg1 = Def->getOperand(1).getReg();
5517 if (mi_match(S32, *MRI,
5518 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5519 Src = Def->getOperand(2).getReg();
5520 Key = 1;
5521 }
5522 }
5523 }
5524
5525 return {{
5526 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5527 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5528 }};
5529}
5530
5532AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5533 Register Src;
5534 unsigned Mods;
5535 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5536
5537 // FIXME: Handle op_sel
5538 return {{
5539 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5540 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5541 }};
5542}
5543
5544// FIXME-TRUE16 remove when fake16 is removed
5546AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5547 Register Src;
5548 unsigned Mods;
5549 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5550 /*IsCanonicalizing=*/true,
5551 /*AllowAbs=*/false,
5552 /*OpSel=*/false);
5553
5554 return {{
5555 [=](MachineInstrBuilder &MIB) {
5556 MIB.addReg(
5557 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5558 },
5559 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5560 }};
5561}
5562
5564AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5565 Register Src;
5566 unsigned Mods;
5567 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5568 /*IsCanonicalizing=*/true,
5569 /*AllowAbs=*/false,
5570 /*OpSel=*/true);
5571
5572 return {{
5573 [=](MachineInstrBuilder &MIB) {
5574 MIB.addReg(
5575 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5576 },
5577 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5578 }};
5579}
5580
5581// Given \p Offset and load specified by the \p Root operand check if \p Offset
5582// is a multiple of the load byte size. If it is update \p Offset to a
5583// pre-scaled value and return true.
5584bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5586 bool IsSigned) const {
5587 if (!Subtarget->hasScaleOffset())
5588 return false;
5589
5590 const MachineInstr &MI = *Root.getParent();
5591 MachineMemOperand *MMO = *MI.memoperands_begin();
5592
5593 if (!MMO->getSize().hasValue())
5594 return false;
5595
5596 uint64_t Size = MMO->getSize().getValue();
5597
5598 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5599 if (!OffsetReg)
5600 OffsetReg = Offset;
5601
5602 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5603 OffsetReg = Def->Reg;
5604
5605 Register Op0;
5606 MachineInstr *Mul;
5607 bool ScaleOffset =
5608 (isPowerOf2_64(Size) &&
5609 mi_match(OffsetReg, *MRI,
5610 m_GShl(m_Reg(Op0),
5613 mi_match(OffsetReg, *MRI,
5615 m_Copy(m_SpecificICst(Size))))) ||
5616 mi_match(
5617 OffsetReg, *MRI,
5618 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5619 m_Reg(Op0), m_SpecificICst(Size))) ||
5620 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5621 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5622 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5623 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5624 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5625 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5626 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5627 mi_match(Mul->getOperand(3).getReg(), *MRI,
5629 m_Copy(m_SpecificICst(Size))))) &&
5630 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5631
5632 if (ScaleOffset)
5633 Offset = Op0;
5634
5635 return ScaleOffset;
5636}
5637
// Match an SMRD addressing mode for \p Root. Which form is matched depends on
// which outputs are requested: base+soffset+imm (SOffset and Offset non-null),
// base+imm (Offset only), or base+soffset (SOffset only). Returns true and
// fills the requested outputs on success.
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                 Register &Base,
                                                 Register *SOffset,
                                                 int64_t *Offset,
                                                 bool *ScaleOffset) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets.
  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  if (AddrInfo.empty())
    return false;

  const GEPInfo &GEPI = AddrInfo[0];
  std::optional<int64_t> EncodedImm;

  if (ScaleOffset)
    *ScaleOffset = false;

  if (SOffset && Offset) {
    EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                              /*HasSOffset=*/true);
    // Look for a (base + soffset) + imm chain: the outer GEP carries the
    // immediate and the inner GEP carries the two SGPR parts.
    if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
        AddrInfo.size() > 1) {
      const GEPInfo &GEPI2 = AddrInfo[1];
      if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
        Register OffsetReg = GEPI2.SgprParts[1];
        if (ScaleOffset)
          *ScaleOffset =
              selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
        OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
        if (OffsetReg) {
          Base = GEPI2.SgprParts[0];
          *SOffset = OffsetReg;
          *Offset = *EncodedImm;
          if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
            return true;

          // For unbuffered smem loads, it is illegal for the Immediate Offset
          // to be negative if the resulting (Offset + (M0 or SOffset or zero)
          // is negative. Handle the case where the Immediate Offset + SOffset
          // is negative.
          auto SKnown = VT->getKnownBits(*SOffset);
          if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
            return false;

          return true;
        }
      }
    }
    return false;
  }

  EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                            /*HasSOffset=*/false);
  // Immediate-only form: the whole offset encodes in the instruction.
  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
    Base = GEPI.SgprParts[0];
    *Offset = *EncodedImm;
    return true;
  }

  // SGPR offset is unsigned.
  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
      GEPI.Imm != 0) {
    // If we make it this far we have a load with an 32-bit immediate offset.
    // It is OK to select this using a sgpr offset, because we have already
    // failed trying to select this load into one of the _IMM variants since
    // the _IMM Patterns are considered before the _SGPR patterns.
    Base = GEPI.SgprParts[0];
    *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
        .addImm(GEPI.Imm);
    return true;
  }

  // Base + SGPR offset with no immediate part.
  if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
    Register OffsetReg = GEPI.SgprParts[1];
    if (ScaleOffset)
      *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
    OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
    if (OffsetReg) {
      Base = GEPI.SgprParts[0];
      *SOffset = OffsetReg;
      return true;
    }
  }

  return false;
}
5730
5732AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5733 Register Base;
5734 int64_t Offset;
5735 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5736 /* ScaleOffset */ nullptr))
5737 return std::nullopt;
5738
5739 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5740 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5741}
5742
5744AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5745 SmallVector<GEPInfo, 4> AddrInfo;
5746 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5747
5748 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5749 return std::nullopt;
5750
5751 const GEPInfo &GEPInfo = AddrInfo[0];
5752 Register PtrReg = GEPInfo.SgprParts[0];
5753 std::optional<int64_t> EncodedImm =
5754 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5755 if (!EncodedImm)
5756 return std::nullopt;
5757
5758 return {{
5759 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5760 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5761 }};
5762}
5763
5765AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5766 Register Base, SOffset;
5767 bool ScaleOffset;
5768 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5769 &ScaleOffset))
5770 return std::nullopt;
5771
5772 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5773 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5774 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5775 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5776}
5777
5779AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5780 Register Base, SOffset;
5781 int64_t Offset;
5782 bool ScaleOffset;
5783 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5784 return std::nullopt;
5785
5786 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5787 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5788 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5789 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5790 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5791}
5792
5793std::pair<Register, int>
5794AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5795 uint64_t FlatVariant) const {
5796 MachineInstr *MI = Root.getParent();
5797
5798 auto Default = std::pair(Root.getReg(), 0);
5799
5800 if (!STI.hasFlatInstOffsets())
5801 return Default;
5802
5803 Register PtrBase;
5804 int64_t ConstOffset;
5805 bool IsInBounds;
5806 std::tie(PtrBase, ConstOffset, IsInBounds) =
5807 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5808
5809 // Adding the offset to the base address with an immediate in a FLAT
5810 // instruction must not change the memory aperture in which the address falls.
5811 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5812 // instructions.
5813 if (ConstOffset == 0 ||
5814 (FlatVariant == SIInstrFlags::FlatScratch &&
5815 !isFlatScratchBaseLegal(Root.getReg())) ||
5816 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5817 return Default;
5818
5819 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5820 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5821 return Default;
5822
5823 return std::pair(PtrBase, ConstOffset);
5824}
5825
5827AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5828 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5829
5830 return {{
5831 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5832 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5833 }};
5834}
5835
5837AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5838 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5839
5840 return {{
5841 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5842 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5843 }};
5844}
5845
5847AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5848 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5849
5850 return {{
5851 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5852 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5853 }};
5854}
5855
5856// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5858AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5859 unsigned CPolBits,
5860 bool NeedIOffset) const {
5861 Register Addr = Root.getReg();
5862 Register PtrBase;
5863 int64_t ConstOffset;
5864 int64_t ImmOffset = 0;
5865
5866 // Match the immediate offset first, which canonically is moved as low as
5867 // possible.
5868 std::tie(PtrBase, ConstOffset, std::ignore) =
5869 getPtrBaseWithConstantOffset(Addr, *MRI);
5870
5871 if (ConstOffset != 0) {
5872 if (NeedIOffset &&
5873 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5875 Addr = PtrBase;
5876 ImmOffset = ConstOffset;
5877 } else {
5878 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5879 if (isSGPR(PtrBaseDef->Reg)) {
5880 if (ConstOffset > 0) {
5881 // Offset is too large.
5882 //
5883 // saddr + large_offset -> saddr +
5884 // (voffset = large_offset & ~MaxOffset) +
5885 // (large_offset & MaxOffset);
5886 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5887 if (NeedIOffset) {
5888 std::tie(SplitImmOffset, RemainderOffset) =
5889 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5891 }
5892
5893 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5894 : isUInt<32>(RemainderOffset)) {
5895 MachineInstr *MI = Root.getParent();
5896 MachineBasicBlock *MBB = MI->getParent();
5897 Register HighBits =
5898 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5899
5900 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5901 HighBits)
5902 .addImm(RemainderOffset);
5903
5904 if (NeedIOffset)
5905 return {{
5906 [=](MachineInstrBuilder &MIB) {
5907 MIB.addReg(PtrBase);
5908 }, // saddr
5909 [=](MachineInstrBuilder &MIB) {
5910 MIB.addReg(HighBits);
5911 }, // voffset
5912 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5913 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5914 }};
5915 return {{
5916 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5917 [=](MachineInstrBuilder &MIB) {
5918 MIB.addReg(HighBits);
5919 }, // voffset
5920 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5921 }};
5922 }
5923 }
5924
5925 // We are adding a 64 bit SGPR and a constant. If constant bus limit
5926 // is 1 we would need to perform 1 or 2 extra moves for each half of
5927 // the constant and it is better to do a scalar add and then issue a
5928 // single VALU instruction to materialize zero. Otherwise it is less
5929 // instructions to perform VALU adds with immediates or inline literals.
5930 unsigned NumLiterals =
5931 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5932 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5933 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5934 return std::nullopt;
5935 }
5936 }
5937 }
5938
5939 // Match the variable offset.
5940 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5941 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5942 // Look through the SGPR->VGPR copy.
5943 Register SAddr =
5944 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5945
5946 if (isSGPR(SAddr)) {
5947 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5948
5949 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5950 // inserted later.
5951 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5952 Subtarget->hasSignedGVSOffset());
5953 if (Register VOffset = matchExtendFromS32OrS32(
5954 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5955 if (NeedIOffset)
5956 return {{[=](MachineInstrBuilder &MIB) { // saddr
5957 MIB.addReg(SAddr);
5958 },
5959 [=](MachineInstrBuilder &MIB) { // voffset
5960 MIB.addReg(VOffset);
5961 },
5962 [=](MachineInstrBuilder &MIB) { // offset
5963 MIB.addImm(ImmOffset);
5964 },
5965 [=](MachineInstrBuilder &MIB) { // cpol
5966 MIB.addImm(CPolBits |
5967 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5968 }}};
5969 return {{[=](MachineInstrBuilder &MIB) { // saddr
5970 MIB.addReg(SAddr);
5971 },
5972 [=](MachineInstrBuilder &MIB) { // voffset
5973 MIB.addReg(VOffset);
5974 },
5975 [=](MachineInstrBuilder &MIB) { // cpol
5976 MIB.addImm(CPolBits |
5977 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5978 }}};
5979 }
5980 }
5981 }
5982
5983 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5984 // drop this.
5985 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5986 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5987 return std::nullopt;
5988
5989 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5990 // moves required to copy a 64-bit SGPR to VGPR.
5991 MachineInstr *MI = Root.getParent();
5992 MachineBasicBlock *MBB = MI->getParent();
5993 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5994
5995 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5996 .addImm(0);
5997
5998 if (NeedIOffset)
5999 return {{
6000 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6001 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6002 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6003 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6004 }};
6005 return {{
6006 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6007 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6008 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6009 }};
6010}
6011
6013AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6014 return selectGlobalSAddr(Root, 0);
6015}
6016
6018AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6019 const MachineInstr &I = *Root.getParent();
6020
6021 // We are assuming CPol is always the last operand of the intrinsic.
6022 auto PassedCPol =
6023 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6024 return selectGlobalSAddr(Root, PassedCPol);
6025}
6026
6028AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6029 const MachineInstr &I = *Root.getParent();
6030
6031 // We are assuming CPol is second from last operand of the intrinsic.
6032 auto PassedCPol =
6033 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6034 return selectGlobalSAddr(Root, PassedCPol);
6035}
6036
6038AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6039 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6040}
6041
6043AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6044 MachineOperand &Root) const {
6045 const MachineInstr &I = *Root.getParent();
6046
6047 // We are assuming CPol is always the last operand of the intrinsic.
6048 auto PassedCPol =
6049 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6050 return selectGlobalSAddr(Root, PassedCPol, false);
6051}
6052
6054AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6055 MachineOperand &Root) const {
6056 const MachineInstr &I = *Root.getParent();
6057
6058 // We are assuming CPol is second from last operand of the intrinsic.
6059 auto PassedCPol =
6060 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6061 return selectGlobalSAddr(Root, PassedCPol, false);
6062}
6063
6065AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6066 Register Addr = Root.getReg();
6067 Register PtrBase;
6068 int64_t ConstOffset;
6069 int64_t ImmOffset = 0;
6070
6071 // Match the immediate offset first, which canonically is moved as low as
6072 // possible.
6073 std::tie(PtrBase, ConstOffset, std::ignore) =
6074 getPtrBaseWithConstantOffset(Addr, *MRI);
6075
6076 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6077 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6079 Addr = PtrBase;
6080 ImmOffset = ConstOffset;
6081 }
6082
6083 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6084 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6085 int FI = AddrDef->MI->getOperand(1).getIndex();
6086 return {{
6087 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6088 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6089 }};
6090 }
6091
6092 Register SAddr = AddrDef->Reg;
6093
6094 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6095 Register LHS = AddrDef->MI->getOperand(1).getReg();
6096 Register RHS = AddrDef->MI->getOperand(2).getReg();
6097 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6098 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6099
6100 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6101 isSGPR(RHSDef->Reg)) {
6102 int FI = LHSDef->MI->getOperand(1).getIndex();
6103 MachineInstr &I = *Root.getParent();
6104 MachineBasicBlock *BB = I.getParent();
6105 const DebugLoc &DL = I.getDebugLoc();
6106 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6107
6108 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6109 .addFrameIndex(FI)
6110 .addReg(RHSDef->Reg)
6111 .setOperandDead(3); // Dead scc
6112 }
6113 }
6114
6115 if (!isSGPR(SAddr))
6116 return std::nullopt;
6117
6118 return {{
6119 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6120 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6121 }};
6122}
6123
6124// Check whether the flat scratch SVS swizzle bug affects this access.
6125bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6126 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6127 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6128 return false;
6129
6130 // The bug affects the swizzling of SVS accesses if there is any carry out
6131 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6132 // voffset to (soffset + inst_offset).
6133 auto VKnown = VT->getKnownBits(VAddr);
6134 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6135 KnownBits::makeConstant(APInt(32, ImmOffset)));
6136 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6137 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6138 return (VMax & 3) + (SMax & 3) >= 4;
6139}
6140
6142AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6143 Register Addr = Root.getReg();
6144 Register PtrBase;
6145 int64_t ConstOffset;
6146 int64_t ImmOffset = 0;
6147
6148 // Match the immediate offset first, which canonically is moved as low as
6149 // possible.
6150 std::tie(PtrBase, ConstOffset, std::ignore) =
6151 getPtrBaseWithConstantOffset(Addr, *MRI);
6152
6153 Register OrigAddr = Addr;
6154 if (ConstOffset != 0 &&
6155 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6157 Addr = PtrBase;
6158 ImmOffset = ConstOffset;
6159 }
6160
6161 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6162 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6163 return std::nullopt;
6164
6165 Register RHS = AddrDef->MI->getOperand(2).getReg();
6166 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6167 return std::nullopt;
6168
6169 Register LHS = AddrDef->MI->getOperand(1).getReg();
6170 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6171
6172 if (OrigAddr != Addr) {
6173 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6174 return std::nullopt;
6175 } else {
6176 if (!isFlatScratchBaseLegalSV(OrigAddr))
6177 return std::nullopt;
6178 }
6179
6180 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6181 return std::nullopt;
6182
6183 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6185 : 0;
6186
6187 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6188 int FI = LHSDef->MI->getOperand(1).getIndex();
6189 return {{
6190 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6191 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6192 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6193 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6194 }};
6195 }
6196
6197 if (!isSGPR(LHS))
6198 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6199 LHS = Def->Reg;
6200
6201 if (!isSGPR(LHS))
6202 return std::nullopt;
6203
6204 return {{
6205 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6206 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6207 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6208 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6209 }};
6210}
6211
6213AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6214 MachineInstr *MI = Root.getParent();
6215 MachineBasicBlock *MBB = MI->getParent();
6216 MachineFunction *MF = MBB->getParent();
6217 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6218
6219 int64_t Offset = 0;
6220 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6221 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
6222 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6223
6224 // TODO: Should this be inside the render function? The iterator seems to
6225 // move.
6226 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6227 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6228 HighBits)
6229 .addImm(Offset & ~MaxOffset);
6230
6231 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6232 MIB.addReg(Info->getScratchRSrcReg());
6233 },
6234 [=](MachineInstrBuilder &MIB) { // vaddr
6235 MIB.addReg(HighBits);
6236 },
6237 [=](MachineInstrBuilder &MIB) { // soffset
6238 // Use constant zero for soffset and rely on eliminateFrameIndex
6239 // to choose the appropriate frame register if need be.
6240 MIB.addImm(0);
6241 },
6242 [=](MachineInstrBuilder &MIB) { // offset
6243 MIB.addImm(Offset & MaxOffset);
6244 }}};
6245 }
6246
6247 assert(Offset == 0 || Offset == -1);
6248
6249 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6250 // offsets.
6251 std::optional<int> FI;
6252 Register VAddr = Root.getReg();
6253
6254 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6255 Register PtrBase;
6256 int64_t ConstOffset;
6257 std::tie(PtrBase, ConstOffset, std::ignore) =
6258 getPtrBaseWithConstantOffset(VAddr, *MRI);
6259 if (ConstOffset != 0) {
6260 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6261 (!STI.privateMemoryResourceIsRangeChecked() ||
6262 VT->signBitIsZero(PtrBase))) {
6263 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6264 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6265 FI = PtrBaseDef->getOperand(1).getIndex();
6266 else
6267 VAddr = PtrBase;
6268 Offset = ConstOffset;
6269 }
6270 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6271 FI = RootDef->getOperand(1).getIndex();
6272 }
6273
6274 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6275 MIB.addReg(Info->getScratchRSrcReg());
6276 },
6277 [=](MachineInstrBuilder &MIB) { // vaddr
6278 if (FI)
6279 MIB.addFrameIndex(*FI);
6280 else
6281 MIB.addReg(VAddr);
6282 },
6283 [=](MachineInstrBuilder &MIB) { // soffset
6284 // Use constant zero for soffset and rely on eliminateFrameIndex
6285 // to choose the appropriate frame register if need be.
6286 MIB.addImm(0);
6287 },
6288 [=](MachineInstrBuilder &MIB) { // offset
6289 MIB.addImm(Offset);
6290 }}};
6291}
6292
6293bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6294 int64_t Offset) const {
6295 if (!isUInt<16>(Offset))
6296 return false;
6297
6298 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6299 return true;
6300
6301 // On Southern Islands instruction with a negative base value and an offset
6302 // don't seem to work.
6303 return VT->signBitIsZero(Base);
6304}
6305
6306bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6307 int64_t Offset1,
6308 unsigned Size) const {
6309 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6310 return false;
6311 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6312 return false;
6313
6314 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6315 return true;
6316
6317 // On Southern Islands instruction with a negative base value and an offset
6318 // don't seem to work.
6319 return VT->signBitIsZero(Base);
6320}
6321
6322// Return whether the operation has NoUnsignedWrap property.
6323static bool isNoUnsignedWrap(MachineInstr *Addr) {
6324 return Addr->getOpcode() == TargetOpcode::G_OR ||
6325 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6327}
6328
6329// Check that the base address of flat scratch load/store in the form of `base +
6330// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6331// requirement). We always treat the first operand as the base address here.
6332bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6333 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6334
6335 if (isNoUnsignedWrap(AddrMI))
6336 return true;
6337
6338 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6339 // values.
6340 if (STI.hasSignedScratchOffsets())
6341 return true;
6342
6343 Register LHS = AddrMI->getOperand(1).getReg();
6344 Register RHS = AddrMI->getOperand(2).getReg();
6345
6346 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6347 std::optional<ValueAndVReg> RhsValReg =
6349 // If the immediate offset is negative and within certain range, the base
6350 // address cannot also be negative. If the base is also negative, the sum
6351 // would be either negative or much larger than the valid range of scratch
6352 // memory a thread can access.
6353 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6354 RhsValReg->Value.getSExtValue() > -0x40000000)
6355 return true;
6356 }
6357
6358 return VT->signBitIsZero(LHS);
6359}
6360
6361// Check address value in SGPR/VGPR are legal for flat scratch in the form
6362// of: SGPR + VGPR.
6363bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6364 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6365
6366 if (isNoUnsignedWrap(AddrMI))
6367 return true;
6368
6369 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6370 // values.
6371 if (STI.hasSignedScratchOffsets())
6372 return true;
6373
6374 Register LHS = AddrMI->getOperand(1).getReg();
6375 Register RHS = AddrMI->getOperand(2).getReg();
6376 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6377}
6378
6379// Check address value in SGPR/VGPR are legal for flat scratch in the form
6380// of: SGPR + VGPR + Imm.
6381bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6382 Register Addr) const {
6383 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6384 // values.
6385 if (STI.hasSignedScratchOffsets())
6386 return true;
6387
6388 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6389 Register Base = AddrMI->getOperand(1).getReg();
6390 std::optional<DefinitionAndSourceRegister> BaseDef =
6392 std::optional<ValueAndVReg> RHSOffset =
6394 assert(RHSOffset);
6395
6396 // If the immediate offset is negative and within certain range, the base
6397 // address cannot also be negative. If the base is also negative, the sum
6398 // would be either negative or much larger than the valid range of scratch
6399 // memory a thread can access.
6400 if (isNoUnsignedWrap(BaseDef->MI) &&
6401 (isNoUnsignedWrap(AddrMI) ||
6402 (RHSOffset->Value.getSExtValue() < 0 &&
6403 RHSOffset->Value.getSExtValue() > -0x40000000)))
6404 return true;
6405
6406 Register LHS = BaseDef->MI->getOperand(1).getReg();
6407 Register RHS = BaseDef->MI->getOperand(2).getReg();
6408 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6409}
6410
6411bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6412 unsigned ShAmtBits) const {
6413 assert(MI.getOpcode() == TargetOpcode::G_AND);
6414
6415 std::optional<APInt> RHS =
6416 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6417 if (!RHS)
6418 return false;
6419
6420 if (RHS->countr_one() >= ShAmtBits)
6421 return true;
6422
6423 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6424 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6425}
6426
6428AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6429 MachineOperand &Root) const {
6430 Register Reg = Root.getReg();
6431 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6432
6433 std::optional<DefinitionAndSourceRegister> Def =
6435 assert(Def && "this shouldn't be an optional result");
6436 Reg = Def->Reg;
6437
6438 if (Register WaveBase = getWaveAddress(Def->MI)) {
6439 return {{
6440 [=](MachineInstrBuilder &MIB) { // rsrc
6441 MIB.addReg(Info->getScratchRSrcReg());
6442 },
6443 [=](MachineInstrBuilder &MIB) { // soffset
6444 MIB.addReg(WaveBase);
6445 },
6446 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6447 }};
6448 }
6449
6450 int64_t Offset = 0;
6451
6452 // FIXME: Copy check is a hack
6454 if (mi_match(Reg, *MRI,
6455 m_GPtrAdd(m_Reg(BasePtr),
6457 if (!TII.isLegalMUBUFImmOffset(Offset))
6458 return {};
6459 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6460 Register WaveBase = getWaveAddress(BasePtrDef);
6461 if (!WaveBase)
6462 return {};
6463
6464 return {{
6465 [=](MachineInstrBuilder &MIB) { // rsrc
6466 MIB.addReg(Info->getScratchRSrcReg());
6467 },
6468 [=](MachineInstrBuilder &MIB) { // soffset
6469 MIB.addReg(WaveBase);
6470 },
6471 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6472 }};
6473 }
6474
6475 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6476 !TII.isLegalMUBUFImmOffset(Offset))
6477 return {};
6478
6479 return {{
6480 [=](MachineInstrBuilder &MIB) { // rsrc
6481 MIB.addReg(Info->getScratchRSrcReg());
6482 },
6483 [=](MachineInstrBuilder &MIB) { // soffset
6484 MIB.addImm(0);
6485 },
6486 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6487 }};
6488}
6489
// Match a DS address as (base register, byte offset). Falls back to
// (root, 0) when no legal constant offset can be folded.
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset, std::ignore) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    if (isDSOffsetLegal(PtrBase, Offset)) {
      // (add n0, c0)
      return std::pair(PtrBase, Offset);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO


  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  // No folding possible: use the root address with a zero offset.
  return std::pair(Root.getReg(), 0);
}
6516
// Complex-pattern renderer for a DS address: one base register operand plus
// one immediate byte offset, produced by selectDS1Addr1OffsetImpl.
// NOTE(review): the return-type line preceding this definition (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },   // base address
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // byte offset
  }};
}
6527
// ds_read2/ds_write2 addressing with 4-byte elements (64-bit total access).
// NOTE(review): the return-type line (presumably ComplexRendererFns) was lost
// in extraction — verify against upstream.
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 4);
}
6532
// ds_read2/ds_write2 addressing with 8-byte elements (128-bit total access).
// NOTE(review): the return-type line (presumably ComplexRendererFns) was lost
// in extraction — verify against upstream.
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 8);
}
6537
// Shared renderer for ds_read2/ds_write2: a base register plus two element
// offsets (offset0 and offset0+1) scaled by \p Size in the Impl.
// NOTE(review): the return-type line (presumably ComplexRendererFns) was lost
// in extraction — verify against upstream.
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
                                              unsigned Size) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },     // base address
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },  // offset0
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } // offset1
  }};
}
6550
6551std::pair<Register, unsigned>
6552AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6553 unsigned Size) const {
6554 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6555 int64_t ConstAddr = 0;
6556
6557 Register PtrBase;
6558 int64_t Offset;
6559 std::tie(PtrBase, Offset, std::ignore) =
6560 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6561
6562 if (Offset) {
6563 int64_t OffsetValue0 = Offset;
6564 int64_t OffsetValue1 = Offset + Size;
6565 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6566 // (add n0, c0)
6567 return std::pair(PtrBase, OffsetValue0 / Size);
6568 }
6569 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6570 // TODO
6571
6572 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6573 // TODO
6574
6575 }
6576
6577 return std::pair(Root.getReg(), 0);
6578}
6579
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset, and if the offset computation is
/// known to be inbounds. There may be intervening copies between \p Root and
/// the identified constant. Returns \p Root, 0, false if this does not match
/// the pattern.
std::tuple<Register, int64_t, bool>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0, false};

  MachineOperand &RHS = RootI->getOperand(2);
  std::optional<ValueAndVReg> MaybeOffset =
  // NOTE(review): the initializer line was lost in extraction (presumably a
  // call looking through copies to a constant on RHS.getReg()); verify
  // against upstream.
  if (!MaybeOffset)
    return {Root, 0, false};
  // Propagate the inbounds flag so callers may rely on no-wrap offset math.
  bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
          IsInBounds};
}
6601
  // NOTE(review): the signature line was lost in extraction; per the
  // cross-reference index this is `static void addZeroImm(MachineInstrBuilder
  // &MIB)`, used below as a renderer for trailing always-zero operands.
  MIB.addImm(0);
}
6605
/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
// NOTE(review): the first signature line was lost in extraction (per the
// cross-reference index: `static Register buildRSRC(MachineIRBuilder &B,
// MachineRegisterInfo &MRI,`); verify against upstream.
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  // The two high dwords of the descriptor hold the format constants.
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  // The low 64 bits hold the base pointer, or 0 when none was supplied.
  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}
6650
// Build the SRD used with addr64 MUBUF addressing.
// NOTE(review): the first signature line was lost in extraction (per the
// cross-reference index this is `static Register buildAddr64RSrc(
// MachineIRBuilder &B, MachineRegisterInfo &MRI,`); verify against upstream.
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  // Low format dword is 0 here; only the high "default" bits are kept.
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}
6659
// Build the SRD used with offset-mode MUBUF addressing.
// NOTE(review): the first signature line was lost in extraction (per the
// cross-reference index this is `static Register buildOffsetSrc(
// MachineIRBuilder &B, MachineRegisterInfo &MRI,`); verify against upstream.
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  // Low format dword is all-ones here, unlike the addr64 variant.
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}
6668
6669AMDGPUInstructionSelector::MUBUFAddressData
6670AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6671 MUBUFAddressData Data;
6672 Data.N0 = Src;
6673
6674 Register PtrBase;
6675 int64_t Offset;
6676
6677 std::tie(PtrBase, Offset, std::ignore) =
6678 getPtrBaseWithConstantOffset(Src, *MRI);
6679 if (isUInt<32>(Offset)) {
6680 Data.N0 = PtrBase;
6681 Data.Offset = Offset;
6682 }
6683
6684 if (MachineInstr *InputAdd
6685 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6686 Data.N2 = InputAdd->getOperand(1).getReg();
6687 Data.N3 = InputAdd->getOperand(2).getReg();
6688
6689 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6690 // FIXME: Don't know this was defined by operand 0
6691 //
6692 // TODO: Remove this when we have copy folding optimizations after
6693 // RegBankSelect.
6694 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6695 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6696 }
6697
6698 return Data;
6699}
6700
6701/// Return if the addr64 mubuf mode should be used for the given address.
6702bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6703 // (ptr_add N2, N3) -> addr64, or
6704 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6705 if (Addr.N2)
6706 return true;
6707
6708 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6709 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6710}
6711
6712/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6713/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6714/// component.
6715void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6716 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6717 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6718 return;
6719
6720 // Illegal offset, store it in soffset.
6721 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6722 B.buildInstr(AMDGPU::S_MOV_B32)
6723 .addDef(SOffset)
6724 .addImm(ImmOffset);
6725 ImmOffset = 0;
6726}
6727
6728bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6729 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6730 Register &SOffset, int64_t &Offset) const {
6731 // FIXME: Predicates should stop this from reaching here.
6732 // addr64 bit was removed for volcanic islands.
6733 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6734 return false;
6735
6736 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6737 if (!shouldUseAddr64(AddrData))
6738 return false;
6739
6740 Register N0 = AddrData.N0;
6741 Register N2 = AddrData.N2;
6742 Register N3 = AddrData.N3;
6743 Offset = AddrData.Offset;
6744
6745 // Base pointer for the SRD.
6746 Register SRDPtr;
6747
6748 if (N2) {
6749 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6750 assert(N3);
6751 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6752 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6753 // addr64, and construct the default resource from a 0 address.
6754 VAddr = N0;
6755 } else {
6756 SRDPtr = N3;
6757 VAddr = N2;
6758 }
6759 } else {
6760 // N2 is not divergent.
6761 SRDPtr = N2;
6762 VAddr = N3;
6763 }
6764 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6765 // Use the default null pointer in the resource
6766 VAddr = N0;
6767 } else {
6768 // N0 -> offset, or
6769 // (N0 + C1) -> offset
6770 SRDPtr = N0;
6771 }
6772
6773 MachineIRBuilder B(*Root.getParent());
6774 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6775 splitIllegalMUBUFOffset(B, SOffset, Offset);
6776 return true;
6777}
6778
6779bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6780 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6781 int64_t &Offset) const {
6782
6783 // FIXME: Pattern should not reach here.
6784 if (STI.useFlatForGlobal())
6785 return false;
6786
6787 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6788 if (shouldUseAddr64(AddrData))
6789 return false;
6790
6791 // N0 -> offset, or
6792 // (N0 + C1) -> offset
6793 Register SRDPtr = AddrData.N0;
6794 Offset = AddrData.Offset;
6795
6796 // TODO: Look through extensions for 32-bit soffset.
6797 MachineIRBuilder B(*Root.getParent());
6798
6799 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6800 splitIllegalMUBUFOffset(B, SOffset, Offset);
6801 return true;
6802}
6803
// Complex-pattern renderer for the addr64 MUBUF form: rsrc, vaddr, soffset,
// offset plus zeroed cpol/tfe/swz trailing operands.
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          // Restricted-soffset subtargets encode "no soffset" as null.
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm  // swz
  }};
}
6839
// Complex-pattern renderer for the offset MUBUF form: rsrc, soffset, offset
// plus zeroed cpol/tfe/swz trailing operands.
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          // Restricted-soffset subtargets encode "no soffset" as null.
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm, // swz
  }};
}
6867
// Render a buffer soffset operand, normalizing a known-zero soffset to the
// null register on subtargets that require it.
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {

  Register SOffset = Root.getReg();

  if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
    SOffset = AMDGPU::SGPR_NULL;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
}
6878
/// Get an immediate that must be 32-bits, and treated as zero extended.
static std::optional<uint64_t>
  // NOTE(review): the declarator line was lost in extraction (per the call
  // sites below: `getConstantZext32Val(Register Reg, const
  // MachineRegisterInfo &MRI) {`); verify against upstream.
  // getIConstantVRegVal sexts any values, so see if that matters.
  std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return std::nullopt;
  return Lo_32(*OffsetVal);
}
6888
// Fold a constant buffer offset into the SMRD encoded-immediate form.
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  std::optional<uint64_t> OffsetVal =
      Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  // Only usable if the subtarget can encode this value as a buffer offset.
  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}
6903
// SEA ISLANDS only: fold a constant buffer offset using the 32-bit literal
// SMRD encoding.
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  std::optional<int64_t> EncodedImm =
  // NOTE(review): the initializer line was lost in extraction (presumably the
  // 32-bit literal-offset encoding helper applied to *OffsetVal); verify
  // against upstream.
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}
6919
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  Register SOffset;
  unsigned Offset;
  std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
      *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
  if (!SOffset)
    return std::nullopt;

  std::optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
  if (!EncodedOffset)
    return std::nullopt;

  assert(MRI->getType(SOffset) == LLT::scalar(32));
  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
6940
6941std::pair<Register, unsigned>
6942AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6943 bool &Matched) const {
6944 Matched = false;
6945
6946 Register Src;
6947 unsigned Mods;
6948 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6949
6950 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6951 assert(MRI->getType(Src) == LLT::scalar(16));
6952
6953 // Only change Src if src modifier could be gained. In such cases new Src
6954 // could be sgpr but this does not violate constant bus restriction for
6955 // instruction that is being selected.
6956 Src = stripBitCast(Src, *MRI);
6957
6958 const auto CheckAbsNeg = [&]() {
6959 // Be careful about folding modifiers if we already have an abs. fneg is
6960 // applied last, so we don't want to apply an earlier fneg.
6961 if ((Mods & SISrcMods::ABS) == 0) {
6962 unsigned ModsTmp;
6963 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6964
6965 if ((ModsTmp & SISrcMods::NEG) != 0)
6966 Mods ^= SISrcMods::NEG;
6967
6968 if ((ModsTmp & SISrcMods::ABS) != 0)
6969 Mods |= SISrcMods::ABS;
6970 }
6971 };
6972
6973 CheckAbsNeg();
6974
6975 // op_sel/op_sel_hi decide the source type and source.
6976 // If the source's op_sel_hi is set, it indicates to do a conversion from
6977 // fp16. If the sources's op_sel is set, it picks the high half of the
6978 // source register.
6979
6980 Mods |= SISrcMods::OP_SEL_1;
6981
6982 if (isExtractHiElt(*MRI, Src, Src)) {
6983 Mods |= SISrcMods::OP_SEL_0;
6984 CheckAbsNeg();
6985 }
6986
6987 Matched = true;
6988 }
6989
6990 return {Src, Mods};
6991}
6992
// Variant of selectVOP3PMadMixMods that only succeeds when the fpext-from-f16
// form was actually matched.
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
  if (!Matched)
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
7008
// Render a mad-mix source + modifiers; succeeds regardless of whether the
// fpext form matched (Matched result is deliberately ignored).
// NOTE(review): the return-type line (presumably
// InstructionSelector::ComplexRendererFns) was lost in extraction — verify
// against upstream.
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}
7021
7022bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7023 MachineInstr &I, Intrinsic::ID IntrID) const {
7024 MachineBasicBlock *MBB = I.getParent();
7025 const DebugLoc &DL = I.getDebugLoc();
7026 Register CCReg = I.getOperand(0).getReg();
7027
7028 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7029 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7030
7031 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7032 .addImm(I.getOperand(2).getImm());
7033
7034 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7035
7036 I.eraseFromParent();
7037 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7038 *MRI);
7039}
7040
7041bool AMDGPUInstructionSelector::selectSGetBarrierState(
7042 MachineInstr &I, Intrinsic::ID IntrID) const {
7043 MachineBasicBlock *MBB = I.getParent();
7044 const DebugLoc &DL = I.getDebugLoc();
7045 const MachineOperand &BarOp = I.getOperand(2);
7046 std::optional<int64_t> BarValImm =
7047 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7048
7049 if (!BarValImm) {
7050 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7051 .addReg(BarOp.getReg());
7052 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7053 }
7054 MachineInstrBuilder MIB;
7055 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7056 : AMDGPU::S_GET_BARRIER_STATE_M0;
7057 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7058
7059 auto DstReg = I.getOperand(0).getReg();
7060 const TargetRegisterClass *DstRC =
7061 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7062 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7063 return false;
7064 MIB.addDef(DstReg);
7065 if (BarValImm) {
7066 MIB.addImm(*BarValImm);
7067 }
7068 I.eraseFromParent();
7069 return true;
7070}
7071
7072unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7073 if (HasInlineConst) {
7074 switch (IntrID) {
7075 default:
7076 llvm_unreachable("not a named barrier op");
7077 case Intrinsic::amdgcn_s_barrier_join:
7078 return AMDGPU::S_BARRIER_JOIN_IMM;
7079 case Intrinsic::amdgcn_s_wakeup_barrier:
7080 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7081 case Intrinsic::amdgcn_s_get_named_barrier_state:
7082 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7083 };
7084 } else {
7085 switch (IntrID) {
7086 default:
7087 llvm_unreachable("not a named barrier op");
7088 case Intrinsic::amdgcn_s_barrier_join:
7089 return AMDGPU::S_BARRIER_JOIN_M0;
7090 case Intrinsic::amdgcn_s_wakeup_barrier:
7091 return AMDGPU::S_WAKEUP_BARRIER_M0;
7092 case Intrinsic::amdgcn_s_get_named_barrier_state:
7093 return AMDGPU::S_GET_BARRIER_STATE_M0;
7094 };
7095 }
7096}
7097
7098bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7099 MachineInstr &I, Intrinsic::ID IntrID) const {
7100 MachineBasicBlock *MBB = I.getParent();
7101 const DebugLoc &DL = I.getDebugLoc();
7102 const MachineOperand &BarOp = I.getOperand(1);
7103 const MachineOperand &CntOp = I.getOperand(2);
7104
7105 // BarID = (BarOp >> 4) & 0x3F
7106 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7107 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7108 .add(BarOp)
7109 .addImm(4u)
7110 .setOperandDead(3); // Dead scc
7111
7112 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7113 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7114 .addReg(TmpReg0)
7115 .addImm(0x3F)
7116 .setOperandDead(3); // Dead scc
7117
7118 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7119 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7120 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7121 .add(CntOp)
7122 .addImm(0x3F)
7123 .setOperandDead(3); // Dead scc
7124
7125 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7126 constexpr unsigned ShAmt = 16;
7127 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7128 .addReg(TmpReg2)
7129 .addImm(ShAmt)
7130 .setOperandDead(3); // Dead scc
7131
7132 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7133 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7134 .addReg(TmpReg1)
7135 .addReg(TmpReg3)
7136 .setOperandDead(3); // Dead scc;
7137
7138 auto CopyMIB =
7139 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7140 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7141
7142 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7143 ? AMDGPU::S_BARRIER_INIT_M0
7144 : AMDGPU::S_BARRIER_SIGNAL_M0;
7145 MachineInstrBuilder MIB;
7146 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7147
7148 I.eraseFromParent();
7149 return true;
7150}
7151
7152bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7153 MachineInstr &I, Intrinsic::ID IntrID) const {
7154 MachineBasicBlock *MBB = I.getParent();
7155 const DebugLoc &DL = I.getDebugLoc();
7156 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7157 ? I.getOperand(2)
7158 : I.getOperand(1);
7159 std::optional<int64_t> BarValImm =
7160 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7161
7162 if (!BarValImm) {
7163 // BarID = (BarOp >> 4) & 0x3F
7164 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7165 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7166 .addReg(BarOp.getReg())
7167 .addImm(4u)
7168 .setOperandDead(3); // Dead scc;
7169
7170 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7171 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7172 .addReg(TmpReg0)
7173 .addImm(0x3F)
7174 .setOperandDead(3); // Dead scc;
7175
7176 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7177 .addReg(TmpReg1);
7178 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7179 }
7180
7181 MachineInstrBuilder MIB;
7182 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7183 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7184
7185 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7186 auto DstReg = I.getOperand(0).getReg();
7187 const TargetRegisterClass *DstRC =
7188 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7189 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7190 return false;
7191 MIB.addDef(DstReg);
7192 }
7193
7194 if (BarValImm) {
7195 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7196 MIB.addImm(BarId);
7197 }
7198
7199 I.eraseFromParent();
7200 return true;
7201}
7202
7203void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7204 const MachineInstr &MI,
7205 int OpIdx) const {
7206 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7207 "Expected G_CONSTANT");
7208 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7209}
7210
7211void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7212 const MachineInstr &MI,
7213 int OpIdx) const {
7214 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7215 "Expected G_CONSTANT");
7216 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7217}
7218
7219void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7220 const MachineInstr &MI,
7221 int OpIdx) const {
7222 const MachineOperand &Op = MI.getOperand(1);
7223 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7224 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7225}
7226
7227void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7228 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7229 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7230 "Expected G_CONSTANT");
7231 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7232}
7233
7234/// This only really exists to satisfy DAG type checking machinery, so is a
7235/// no-op here.
7236void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7237 const MachineInstr &MI,
7238 int OpIdx) const {
7239 const MachineOperand &Op = MI.getOperand(OpIdx);
7240 int64_t Imm;
7241 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7242 MIB.addImm(Imm);
7243 else
7244 MIB.addImm(Op.getImm());
7245}
7246
7247void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7248 const MachineInstr &MI,
7249 int OpIdx) const {
7250 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7251}
7252
7253void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7254 const MachineInstr &MI,
7255 int OpIdx) const {
7256 assert(OpIdx >= 0 && "expected to match an immediate operand");
7257 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7258}
7259
7260void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7261 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7262 assert(OpIdx >= 0 && "expected to match an immediate operand");
7263 MIB.addImm(
7264 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7265}
7266
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
             // NOTE(review): the true-arm line of this conditional was lost
             // in extraction; verify against upstream.
                 : (int64_t)SISrcMods::DST_OP_SEL);
}
7274
7275void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7276 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7277 assert(OpIdx >= 0 && "expected to match an immediate operand");
7278 MIB.addImm(
7279 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7280}
7281
7282void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7283 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7284 assert(OpIdx >= 0 && "expected to match an immediate operand");
7285 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7286 ? (int64_t)(SISrcMods::OP_SEL_0)
7287 : 0);
7288}
7289
7290void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7291 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7292 assert(OpIdx >= 0 && "expected to match an immediate operand");
7293 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7294 : 0);
7295}
7296
7297void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7298 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7299 assert(OpIdx >= 0 && "expected to match an immediate operand");
7300 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7301 : 0);
7302}
7303
7304void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7305 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7306 assert(OpIdx >= 0 && "expected to match an immediate operand");
7307 MIB.addImm(
7308 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7309}
7310
7311void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7312 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7313 assert(OpIdx >= 0 && "expected to match an immediate operand");
7314 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7315 ? (int64_t)SISrcMods::DST_OP_SEL
7316 : 0);
7317}
7318
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  // Mask the cache-policy bits out of the combined immediate.
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
  // NOTE(review): the mask-operand lines of this expression were lost in
  // extraction; verify against upstream.
}
7327
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  // Extract the swizzle bit from the combined immediate.
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
  // NOTE(review): the mask-operand lines of this expression were lost in
  // extraction; verify against upstream.
  MIB.addImm(Swizzle);
}
7337
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  // Extract the cache-policy bits and force GLC on in the rendered value.
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
  // NOTE(review): the mask-operand lines of this expression were lost in
  // extraction; verify against upstream.
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}
7346
7347void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7348 const MachineInstr &MI,
7349 int OpIdx) const {
7350 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7351}
7352
7353void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7354 const MachineInstr &MI,
7355 int OpIdx) const {
7356 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7357 int ExpVal = APF.getExactLog2Abs();
7358 assert(ExpVal != INT_MIN);
7359 MIB.addImm(ExpVal);
7360}
7361
7362void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7363 const MachineInstr &MI,
7364 int OpIdx) const {
7365 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7366 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7367 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7368 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7369 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7370}
7371
7372void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7373 const MachineInstr &MI,
7374 int OpIdx) const {
7375 unsigned Mods = SISrcMods::OP_SEL_1;
7376 if (MI.getOperand(OpIdx).getImm())
7377 Mods ^= SISrcMods::NEG;
7378 MIB.addImm((int64_t)Mods);
7379}
7380
void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
                                                    const MachineInstr &MI,
                                                    int OpIdx) const {
  // Default packed modifiers select op_sel_hi.
  unsigned Mods = SISrcMods::OP_SEL_1;
  if (MI.getOperand(OpIdx).getImm())
  // NOTE(review): the statement applying the negate modifier(s) was lost in
  // extraction; verify against upstream.
  MIB.addImm((int64_t)Mods);
}
7389
7390void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7391 const MachineInstr &MI,
7392 int OpIdx) const {
7393 unsigned Val = MI.getOperand(OpIdx).getImm();
7394 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7395 if (Val == 1) // neg
7396 Mods ^= SISrcMods::NEG;
7397 if (Val == 2) // abs
7398 Mods ^= SISrcMods::ABS;
7399 if (Val == 3) // neg and abs
7400 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7401 MIB.addImm((int64_t)Mods);
7402}
7403
void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  uint32_t V = MI.getOperand(2).getImm();
  // NOTE(review): the lines deriving the scope field from V were lost in
  // extraction; verify against upstream.
  if (!Subtarget->hasSafeCUPrefetch())
    V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
  MIB.addImm(V);
}
7414
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  unsigned Val = MI.getOperand(OpIdx).getImm();
  unsigned New = 0;
  if (Val & 0x1)
  // NOTE(review): the modifier-setting statement for bit 0 was lost in
  // extraction; verify against upstream.
  if (Val & 0x2)
  // NOTE(review): the modifier-setting statement for bit 1 was lost in
  // extraction; verify against upstream.
  MIB.addImm(New);
}
7426
// True if \p Imm can be encoded as an inline constant operand.
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}
7430
// True if the FP value \p Imm can be encoded as an inline constant operand.
bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n = G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1564
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:693
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:678
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
bool isFPPredicate() const
Definition InstrTypes.h:782
bool isIntPredicate() const
Definition InstrTypes.h:783
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
LLVM_ABI DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.cpp:48
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:917
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1483
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:56
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:653
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:494
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:314
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:439
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:469
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:501
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr RegState getUndefRegState(bool B)
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.