1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48#define GET_GLOBALISEL_PREDICATES_INIT
49#include "AMDGPUGenGlobalISel.inc"
50#undef GET_GLOBALISEL_PREDICATES_INIT
51#define GET_GLOBALISEL_TEMPORARIES_INIT
52#include "AMDGPUGenGlobalISel.inc"
53#undef GET_GLOBALISEL_TEMPORARIES_INIT
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
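// Rewrite a copy-like intrinsic (wqm, softwqm, strict.wwm, strict.wqm) into
// its pseudo opcode: drop the intrinsic-ID operand, add an implicit EXEC use,
// and constrain source and destination to one common register class.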
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
120 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
121 return false;
122 const MCInstrDesc &MCID = MI.getDesc();
123 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
124 MI.getOperand(0).setIsEarlyClobber(true);
125 }
126 return true;
127}
128
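// Select a generic COPY. Copies whose destination is a lane mask (VCC bank)
// need special care because the high bits of the source cannot be trusted;
// a rough sketch of the non-constant SGPR-source case:
//   %masked = S_AND_B32 1, %src
//   %dst    = V_CMP_NE_U32_e64 0, %masked
// Constant sources instead become an S_MOV of 0 or -1.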
129bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
130 const DebugLoc &DL = I.getDebugLoc();
131 MachineBasicBlock *BB = I.getParent();
132 I.setDesc(TII.get(TargetOpcode::COPY));
133
134 const MachineOperand &Src = I.getOperand(1);
135 MachineOperand &Dst = I.getOperand(0);
136 Register DstReg = Dst.getReg();
137 Register SrcReg = Src.getReg();
138
139 if (isVCC(DstReg, *MRI)) {
140 if (SrcReg == AMDGPU::SCC) {
141 const TargetRegisterClass *RC
142 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
143 if (!RC)
144 return true;
145 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
146 }
147
148 if (!isVCC(SrcReg, *MRI)) {
149 // TODO: Should probably leave the copy and let copyPhysReg expand it.
150 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
151 return false;
152
153 const TargetRegisterClass *SrcRC
154 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
155
156 std::optional<ValueAndVReg> ConstVal =
157 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
158 if (ConstVal) {
159 unsigned MovOpc =
160 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
161 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
162 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
163 } else {
164 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
165
166 // We can't trust the high bits at this point, so clear them.
167
168 // TODO: Skip masking high bits if def is known boolean.
169
170 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
171 assert(Subtarget->useRealTrue16Insts());
172 const int64_t NoMods = 0;
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
174 .addImm(NoMods)
175 .addImm(1)
176 .addImm(NoMods)
177 .addReg(SrcReg)
178 .addImm(NoMods);
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
180 .addImm(NoMods)
181 .addImm(0)
182 .addImm(NoMods)
183 .addReg(MaskedReg)
184 .addImm(NoMods);
185 } else {
186 bool IsSGPR = TRI.isSGPRClass(SrcRC);
187 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
188 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
189 .addImm(1)
190 .addReg(SrcReg);
191 if (IsSGPR)
192 And.setOperandDead(3); // Dead scc
193
194 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
195 .addImm(0)
196 .addReg(MaskedReg);
197 }
198 }
199
200 if (!MRI->getRegClassOrNull(SrcReg))
201 MRI->setRegClass(SrcReg, SrcRC);
202 I.eraseFromParent();
203 return true;
204 }
205
206 const TargetRegisterClass *RC =
207 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
208 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
209 return false;
210
211 return true;
212 }
213
214 for (const MachineOperand &MO : I.operands()) {
215 if (MO.getReg().isPhysical())
216 continue;
217
218 const TargetRegisterClass *RC =
219 TRI.getConstrainedRegClassForOperand(MO, *MRI);
220 if (!RC)
221 continue;
222 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
223 }
224 return true;
225}
226
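// Copy a lane mask (VCC bank) into an SCC-backed 32-bit boolean: set SCC by
// comparing the mask against zero (or by ORing it with itself when 64-bit
// scalar compares are unavailable), then copy SCC into the destination.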
227bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
228 const DebugLoc &DL = I.getDebugLoc();
229 MachineBasicBlock *BB = I.getParent();
230 Register VCCReg = I.getOperand(1).getReg();
231 MachineInstr *Cmp;
232
233 // Set SCC as a side effect with S_CMP or S_OR.
234 if (STI.hasScalarCompareEq64()) {
235 unsigned CmpOpc =
236 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
237 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
238 } else {
239 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
240 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
241 .addReg(VCCReg)
242 .addReg(VCCReg);
243 }
244
245 if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
246 return false;
247
248 Register DstReg = I.getOperand(0).getReg();
249 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
250
251 I.eraseFromParent();
252 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
253}
254
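// Copy an SCC-style boolean into a lane mask. Constant inputs fold directly
// to 0 or EXEC; otherwise the value is copied into SCC and S_CSELECT picks
// between EXEC and 0.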
255bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
256 const DebugLoc &DL = I.getDebugLoc();
257 MachineBasicBlock *BB = I.getParent();
258
259 Register DstReg = I.getOperand(0).getReg();
260 Register SrcReg = I.getOperand(1).getReg();
261 std::optional<ValueAndVReg> Arg =
262 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
263
264 if (Arg) {
265 const int64_t Value = Arg->Value.getZExtValue();
266 if (Value == 0) {
267 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
268 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
269 } else {
270 assert(Value == 1);
271 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
272 }
273 I.eraseFromParent();
274 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
275 }
276
277 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
278 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
279
280 unsigned SelectOpcode =
281 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
282 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
283 .addReg(TRI.getExec())
284 .addImm(0);
285
286 I.eraseFromParent();
287 return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
288}
289
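// Lower a read-any-lane operation to V_READFIRSTLANE_B32: the value is
// expected to be uniform across the active lanes, so reading the first
// active lane is as good as any.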
290bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 Register SrcReg = I.getOperand(1).getReg();
293
294 const DebugLoc &DL = I.getDebugLoc();
295 MachineBasicBlock *BB = I.getParent();
296
297 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
298 .addReg(SrcReg);
299
300 I.eraseFromParent();
301 return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
302}
303
304bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
305 const Register DefReg = I.getOperand(0).getReg();
306 const LLT DefTy = MRI->getType(DefReg);
307
308 // S1 G_PHIs should not be selected in instruction-select, instead:
309 // - divergent S1 G_PHI should go through lane mask merging algorithm
310 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
311 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
312 if (DefTy == LLT::scalar(1))
313 return false;
314
315 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
316
317 const RegClassOrRegBank &RegClassOrBank =
318 MRI->getRegClassOrRegBank(DefReg);
319
320 const TargetRegisterClass *DefRC =
321 dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank);
322 if (!DefRC) {
323 if (!DefTy.isValid()) {
324 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
325 return false;
326 }
327
328 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
329 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
330 if (!DefRC) {
331 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
332 return false;
333 }
334 }
335
336 // If inputs have register bank, assign corresponding reg class.
337 // Note: registers don't need to have the same reg bank.
338 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
339 const Register SrcReg = I.getOperand(i).getReg();
340
341 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
342 if (RB) {
343 const LLT SrcTy = MRI->getType(SrcReg);
344 const TargetRegisterClass *SrcRC =
345 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
346 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
347 return false;
348 }
349 }
350
351 I.setDesc(TII.get(TargetOpcode::PHI));
352 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
353}
354
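// Extract the 32-bit sub0/sub1 half of a 64-bit operand: register operands
// get a COPY of the requested subregister, immediates are split into their
// low and high 32 bits.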
355MachineOperand
356AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
357 const TargetRegisterClass &SubRC,
358 unsigned SubIdx) const {
359
360 MachineInstr *MI = MO.getParent();
361 MachineBasicBlock *BB = MO.getParent()->getParent();
362 Register DstReg = MRI->createVirtualRegister(&SubRC);
363
364 if (MO.isReg()) {
365 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
366 Register Reg = MO.getReg();
367 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
368 .addReg(Reg, 0, ComposedSubIdx);
369
370 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
371 MO.isKill(), MO.isDead(), MO.isUndef(),
372 MO.isEarlyClobber(), 0, MO.isDebug(),
373 MO.isInternalRead());
374 }
375
376 assert(MO.isImm());
377
378 APInt Imm(64, MO.getImm());
379
380 switch (SubIdx) {
381 default:
382 llvm_unreachable("do not know to split immediate with this sub index.");
383 case AMDGPU::sub0:
384 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
385 case AMDGPU::sub1:
386 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
387 }
388}
389
390static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
391 switch (Opc) {
392 case AMDGPU::G_AND:
393 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
394 case AMDGPU::G_OR:
395 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
396 case AMDGPU::G_XOR:
397 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
398 default:
399 llvm_unreachable("not a bit op");
400 }
401}
402
403bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
404 Register DstReg = I.getOperand(0).getReg();
405 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
406
407 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
408 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
409 DstRB->getID() != AMDGPU::VCCRegBankID)
410 return false;
411
412 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
413 STI.isWave64());
414 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
415
416 // Dead implicit-def of scc
417 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
418 true, // isImp
419 false, // isKill
420 true)); // isDead
421 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422}
423
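// Select integer add/sub. 32-bit cases map directly to S_ADD/S_SUB or the
// VALU add/sub (carry-less where the subtarget supports it). A 64-bit add is
// split into halves, roughly:
//   %lo, %carry = V_ADD_CO_U32_e64 %a.sub0, %b.sub0
//   %hi         = V_ADDC_U32_e64   %a.sub1, %b.sub1, %carry
//   %dst        = REG_SEQUENCE %lo, sub0, %hi, sub1
// (or S_ADD_U32 / S_ADDC_U32 on the SALU path).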
424bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
425 MachineBasicBlock *BB = I.getParent();
426 MachineFunction *MF = BB->getParent();
427 Register DstReg = I.getOperand(0).getReg();
428 const DebugLoc &DL = I.getDebugLoc();
429 LLT Ty = MRI->getType(DstReg);
430 if (Ty.isVector())
431 return false;
432
433 unsigned Size = Ty.getSizeInBits();
434 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
435 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
436 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
437
438 if (Size == 32) {
439 if (IsSALU) {
440 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
441 MachineInstr *Add =
442 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
443 .add(I.getOperand(1))
444 .add(I.getOperand(2))
445 .setOperandDead(3); // Dead scc
446 I.eraseFromParent();
447 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
448 }
449
450 if (STI.hasAddNoCarry()) {
451 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
452 I.setDesc(TII.get(Opc));
453 I.addOperand(*MF, MachineOperand::CreateImm(0));
454 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
455 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
456 }
457
458 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
459
460 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
461 MachineInstr *Add
462 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
463 .addDef(UnusedCarry, RegState::Dead)
464 .add(I.getOperand(1))
465 .add(I.getOperand(2))
466 .addImm(0);
467 I.eraseFromParent();
468 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
469 }
470
471 assert(!Sub && "illegal sub should not reach here");
472
473 const TargetRegisterClass &RC
474 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
475 const TargetRegisterClass &HalfRC
476 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
477
478 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
479 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
480 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
481 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
482
483 Register DstLo = MRI->createVirtualRegister(&HalfRC);
484 Register DstHi = MRI->createVirtualRegister(&HalfRC);
485
486 if (IsSALU) {
487 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
488 .add(Lo1)
489 .add(Lo2);
490 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
491 .add(Hi1)
492 .add(Hi2)
493 .setOperandDead(3); // Dead scc
494 } else {
495 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
496 Register CarryReg = MRI->createVirtualRegister(CarryRC);
497 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
498 .addDef(CarryReg)
499 .add(Lo1)
500 .add(Lo2)
501 .addImm(0);
502 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
503 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
504 .add(Hi1)
505 .add(Hi2)
506 .addReg(CarryReg, RegState::Kill)
507 .addImm(0);
508
509 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
510 return false;
511 }
512
513 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
514 .addReg(DstLo)
515 .addImm(AMDGPU::sub0)
516 .addReg(DstHi)
517 .addImm(AMDGPU::sub1);
518
519
520 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
521 return false;
522
523 I.eraseFromParent();
524 return true;
525}
526
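// Select overflow add/sub (G_UADDO/G_USUBO/G_UADDE/G_USUBE). A VCC carry-out
// uses the VALU carry instructions; an SGPR carry is modeled through SCC,
// with copies in and out around S_ADD(C)_U32 / S_SUB(B)_U32.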
527bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
528 MachineInstr &I) const {
529 MachineBasicBlock *BB = I.getParent();
530 MachineFunction *MF = BB->getParent();
531 const DebugLoc &DL = I.getDebugLoc();
532 Register Dst0Reg = I.getOperand(0).getReg();
533 Register Dst1Reg = I.getOperand(1).getReg();
534 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
535 I.getOpcode() == AMDGPU::G_UADDE;
536 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
537 I.getOpcode() == AMDGPU::G_USUBE;
538
539 if (isVCC(Dst1Reg, *MRI)) {
540 unsigned NoCarryOpc =
541 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
542 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
543 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
544 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
545 I.addOperand(*MF, MachineOperand::CreateImm(0));
546 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
547 }
548
549 Register Src0Reg = I.getOperand(2).getReg();
550 Register Src1Reg = I.getOperand(3).getReg();
551
552 if (HasCarryIn) {
553 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
554 .addReg(I.getOperand(4).getReg());
555 }
556
557 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
558 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
559
560 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
561 .add(I.getOperand(2))
562 .add(I.getOperand(3));
563
564 if (MRI->use_nodbg_empty(Dst1Reg)) {
565 CarryInst.setOperandDead(3); // Dead scc
566 } else {
567 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
568 .addReg(AMDGPU::SCC);
569 if (!MRI->getRegClassOrNull(Dst1Reg))
570 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
571 }
572
573 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
574 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
575 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
576 return false;
577
578 if (HasCarryIn &&
579 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
580 AMDGPU::SReg_32RegClass, *MRI))
581 return false;
582
583 I.eraseFromParent();
584 return true;
585}
586
587bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
588 MachineInstr &I) const {
589 MachineBasicBlock *BB = I.getParent();
590 MachineFunction *MF = BB->getParent();
591 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
592 bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() &&
593 MRI->use_nodbg_empty(I.getOperand(1).getReg());
594
595 unsigned Opc;
596 if (Subtarget->hasMADIntraFwdBug())
597 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
598 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
599 else if (UseNoCarry)
600 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
601 : AMDGPU::V_MAD_NC_I64_I32_e64;
602 else
603 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
604
605 if (UseNoCarry)
606 I.removeOperand(1);
607
608 I.setDesc(TII.get(Opc));
609 I.addOperand(*MF, MachineOperand::CreateImm(0));
610 I.addImplicitDefUseOperands(*MF);
611 I.getOperand(0).setIsEarlyClobber(true);
612 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
613}
614
615// TODO: We should probably legalize these to only using 32-bit results.
616bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
617 MachineBasicBlock *BB = I.getParent();
618 Register DstReg = I.getOperand(0).getReg();
619 Register SrcReg = I.getOperand(1).getReg();
620 LLT DstTy = MRI->getType(DstReg);
621 LLT SrcTy = MRI->getType(SrcReg);
622 const unsigned SrcSize = SrcTy.getSizeInBits();
623 unsigned DstSize = DstTy.getSizeInBits();
624
625 // TODO: Should handle any multiple of 32 offset.
626 unsigned Offset = I.getOperand(2).getImm();
627 if (Offset % 32 != 0 || DstSize > 128)
628 return false;
629
630 // 16-bit operations really use 32-bit registers.
631 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
632 if (DstSize == 16)
633 DstSize = 32;
634
635 const TargetRegisterClass *DstRC =
636 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
637 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
638 return false;
639
640 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
641 const TargetRegisterClass *SrcRC =
642 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
643 if (!SrcRC)
644 return false;
645 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
646 DstSize / 32);
647 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
648 if (!SrcRC)
649 return false;
650
651 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
652 *SrcRC, I.getOperand(1));
653 const DebugLoc &DL = I.getDebugLoc();
654 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
655 .addReg(SrcReg, 0, SubReg);
656
657 I.eraseFromParent();
658 return true;
659}
660
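// Select G_MERGE_VALUES (and wide build-vector style merges) as a single
// REG_SEQUENCE, assigning each 32-bit-or-wider source its split subregister
// index.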
661bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
662 MachineBasicBlock *BB = MI.getParent();
663 Register DstReg = MI.getOperand(0).getReg();
664 LLT DstTy = MRI->getType(DstReg);
665 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
666
667 const unsigned SrcSize = SrcTy.getSizeInBits();
668 if (SrcSize < 32)
669 return selectImpl(MI, *CoverageInfo);
670
671 const DebugLoc &DL = MI.getDebugLoc();
672 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
673 const unsigned DstSize = DstTy.getSizeInBits();
674 const TargetRegisterClass *DstRC =
675 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
676 if (!DstRC)
677 return false;
678
679 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
680 MachineInstrBuilder MIB =
681 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
682 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
683 MachineOperand &Src = MI.getOperand(I + 1);
684 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
685 MIB.addImm(SubRegs[I]);
686
687 const TargetRegisterClass *SrcRC
688 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
689 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
690 return false;
691 }
692
693 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
694 return false;
695
696 MI.eraseFromParent();
697 return true;
698}
699
700bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
701 MachineBasicBlock *BB = MI.getParent();
702 const int NumDst = MI.getNumOperands() - 1;
703
704 MachineOperand &Src = MI.getOperand(NumDst);
705
706 Register SrcReg = Src.getReg();
707 Register DstReg0 = MI.getOperand(0).getReg();
708 LLT DstTy = MRI->getType(DstReg0);
709 LLT SrcTy = MRI->getType(SrcReg);
710
711 const unsigned DstSize = DstTy.getSizeInBits();
712 const unsigned SrcSize = SrcTy.getSizeInBits();
713 const DebugLoc &DL = MI.getDebugLoc();
714 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
715
716 const TargetRegisterClass *SrcRC =
717 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
718 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
719 return false;
720
721 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
722 // source, and this relies on the fact that the same subregister indices are
723 // used for both.
724 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
725 for (int I = 0, E = NumDst; I != E; ++I) {
726 MachineOperand &Dst = MI.getOperand(I);
727 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
728 .addReg(SrcReg, 0, SubRegs[I]);
729
730 // Make sure the subregister index is valid for the source register.
731 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
732 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
733 return false;
734
735 const TargetRegisterClass *DstRC =
736 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
737 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
738 return false;
739 }
740
741 MI.eraseFromParent();
742 return true;
743}
744
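// Select v2s16 G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Two constant operands
// fold into one 32-bit move; the SALU path uses the S_PACK_*_B32_B16 family
// when the operands are (shifted) halves, and the VALU path masks the low
// half and combines it with V_LSHL_OR_B32.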
745bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
746 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
747 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
748
749 Register Src0 = MI.getOperand(1).getReg();
750 Register Src1 = MI.getOperand(2).getReg();
751 LLT SrcTy = MRI->getType(Src0);
752 const unsigned SrcSize = SrcTy.getSizeInBits();
753
754 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
755 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
756 return selectG_MERGE_VALUES(MI);
757 }
758
759 // Selection logic below is for V2S16 only.
760 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
761 Register Dst = MI.getOperand(0).getReg();
762 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
763 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
764 SrcTy != LLT::scalar(32)))
765 return selectImpl(MI, *CoverageInfo);
766
767 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
768 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
769 return false;
770
771 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
772 DstBank->getID() == AMDGPU::VGPRRegBankID);
773 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
774
775 const DebugLoc &DL = MI.getDebugLoc();
776 MachineBasicBlock *BB = MI.getParent();
777
778 // First, before trying TableGen patterns, check if both sources are
779 // constants. In those cases, we can trivially compute the final constant
780 // and emit a simple move.
781 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
782 if (ConstSrc1) {
783 auto ConstSrc0 =
784 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
785 if (ConstSrc0) {
786 const int64_t K0 = ConstSrc0->Value.getSExtValue();
787 const int64_t K1 = ConstSrc1->Value.getSExtValue();
788 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
789 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
790 uint32_t Imm = Lo16 | (Hi16 << 16);
791
792 // VALU
793 if (IsVector) {
794 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
795 MI.eraseFromParent();
796 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
797 }
798
799 // SALU
800 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
801 MI.eraseFromParent();
802 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
803 }
804 }
805
806 // Now try TableGen patterns.
807 if (selectImpl(MI, *CoverageInfo))
808 return true;
809
810 // TODO: This should probably be a combine somewhere
811 // (build_vector $src0, undef) -> copy $src0
812 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
813 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
814 MI.setDesc(TII.get(AMDGPU::COPY));
815 MI.removeOperand(2);
816 const auto &RC =
817 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
818 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
819 RBI.constrainGenericRegister(Src0, RC, *MRI);
820 }
821
822 // TODO: Can be improved?
823 if (IsVector) {
824 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
825 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
826 .addImm(0xFFFF)
827 .addReg(Src0);
828 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
829 return false;
830
831 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
832 .addReg(Src1)
833 .addImm(16)
834 .addReg(TmpReg);
835 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
836 return false;
837
838 MI.eraseFromParent();
839 return true;
840 }
841
842 Register ShiftSrc0;
843 Register ShiftSrc1;
844
845 // With multiple uses of the shift, this will duplicate the shift and
846 // increase register pressure.
847 //
848 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
849 // => (S_PACK_HH_B32_B16 $src0, $src1)
850 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
851 // => (S_PACK_HL_B32_B16 $src0, $src1)
852 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
853 // => (S_PACK_LH_B32_B16 $src0, $src1)
854 // (build_vector $src0, $src1)
855 // => (S_PACK_LL_B32_B16 $src0, $src1)
856
857 bool Shift0 = mi_match(
858 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
859
860 bool Shift1 = mi_match(
861 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
862
863 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
864 if (Shift0 && Shift1) {
865 Opc = AMDGPU::S_PACK_HH_B32_B16;
866 MI.getOperand(1).setReg(ShiftSrc0);
867 MI.getOperand(2).setReg(ShiftSrc1);
868 } else if (Shift1) {
869 Opc = AMDGPU::S_PACK_LH_B32_B16;
870 MI.getOperand(2).setReg(ShiftSrc1);
871 } else if (Shift0) {
872 auto ConstSrc1 =
873 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
874 if (ConstSrc1 && ConstSrc1->Value == 0) {
875 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
876 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
877 .addReg(ShiftSrc0)
878 .addImm(16)
879 .setOperandDead(3); // Dead scc
880
881 MI.eraseFromParent();
882 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
883 }
884 if (STI.hasSPackHL()) {
885 Opc = AMDGPU::S_PACK_HL_B32_B16;
886 MI.getOperand(1).setReg(ShiftSrc0);
887 }
888 }
889
890 MI.setDesc(TII.get(Opc));
891 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
892}
893
894bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
895 const MachineOperand &MO = I.getOperand(0);
896
897 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
898 // regbank check here is to know why getConstrainedRegClassForOperand failed.
899 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
900 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
901 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
902 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
903 return true;
904 }
905
906 return false;
907}
908
909bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
910 MachineBasicBlock *BB = I.getParent();
911
912 Register DstReg = I.getOperand(0).getReg();
913 Register Src0Reg = I.getOperand(1).getReg();
914 Register Src1Reg = I.getOperand(2).getReg();
915 LLT Src1Ty = MRI->getType(Src1Reg);
916
917 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
918 unsigned InsSize = Src1Ty.getSizeInBits();
919
920 int64_t Offset = I.getOperand(3).getImm();
921
922 // FIXME: These cases should have been illegal and unnecessary to check here.
923 if (Offset % 32 != 0 || InsSize % 32 != 0)
924 return false;
925
926 // Currently not handled by getSubRegFromChannel.
927 if (InsSize > 128)
928 return false;
929
930 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
931 if (SubReg == AMDGPU::NoSubRegister)
932 return false;
933
934 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
935 const TargetRegisterClass *DstRC =
936 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
937 if (!DstRC)
938 return false;
939
940 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
941 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
942 const TargetRegisterClass *Src0RC =
943 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
944 const TargetRegisterClass *Src1RC =
945 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
946
947 // Deal with weird cases where the class only partially supports the subreg
948 // index.
949 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
950 if (!Src0RC || !Src1RC)
951 return false;
952
953 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
954 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
955 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
956 return false;
957
958 const DebugLoc &DL = I.getDebugLoc();
959 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
960 .addReg(Src0Reg)
961 .addReg(Src1Reg)
962 .addImm(SubReg);
963
964 I.eraseFromParent();
965 return true;
966}
967
968bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
969 Register DstReg = MI.getOperand(0).getReg();
970 Register SrcReg = MI.getOperand(1).getReg();
971 Register OffsetReg = MI.getOperand(2).getReg();
972 Register WidthReg = MI.getOperand(3).getReg();
973
974 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
975 "scalar BFX instructions are expanded in regbankselect");
976 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
977 "64-bit vector BFX instructions are expanded in regbankselect");
978
979 const DebugLoc &DL = MI.getDebugLoc();
980 MachineBasicBlock *MBB = MI.getParent();
981
982 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
983 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
985 .addReg(SrcReg)
986 .addReg(OffsetReg)
987 .addReg(WidthReg);
988 MI.eraseFromParent();
989 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
990}
991
992bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
993 if (STI.getLDSBankCount() != 16)
994 return selectImpl(MI, *CoverageInfo);
995
996 Register Dst = MI.getOperand(0).getReg();
997 Register Src0 = MI.getOperand(2).getReg();
998 Register M0Val = MI.getOperand(6).getReg();
999 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1000 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1001 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1002 return false;
1003
1004 // This requires 2 instructions. It is possible to write a pattern to support
1005 // this, but the generated isel emitter doesn't correctly deal with multiple
1006 // output instructions using the same physical register input. The copy to m0
1007 // is incorrectly placed before the second instruction.
1008 //
1009 // TODO: Match source modifiers.
1010
1011 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 const DebugLoc &DL = MI.getDebugLoc();
1013 MachineBasicBlock *MBB = MI.getParent();
1014
1015 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1016 .addReg(M0Val);
1017 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1018 .addImm(2)
1019 .addImm(MI.getOperand(4).getImm()) // $attr
1020 .addImm(MI.getOperand(3).getImm()); // $attrchan
1021
1022 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1023 .addImm(0) // $src0_modifiers
1024 .addReg(Src0) // $src0
1025 .addImm(MI.getOperand(4).getImm()) // $attr
1026 .addImm(MI.getOperand(3).getImm()) // $attrchan
1027 .addImm(0) // $src2_modifiers
1028 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1029 .addImm(MI.getOperand(5).getImm()) // $high
1030 .addImm(0) // $clamp
1031 .addImm(0); // $omod
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037// Writelane is special in that it can use SGPR and M0 (which would normally
1038// count as using the constant bus twice - but in this case it is allowed since
1039// the lane selector doesn't count as a use of the constant bus). However, it is
1040// still required to abide by the 1 SGPR rule. Fix this up if we might have
1041// multiple SGPRs.
1042bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1043 // With a constant bus limit of at least 2, there's no issue.
1044 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1045 return selectImpl(MI, *CoverageInfo);
1046
1047 MachineBasicBlock *MBB = MI.getParent();
1048 const DebugLoc &DL = MI.getDebugLoc();
1049 Register VDst = MI.getOperand(0).getReg();
1050 Register Val = MI.getOperand(2).getReg();
1051 Register LaneSelect = MI.getOperand(3).getReg();
1052 Register VDstIn = MI.getOperand(4).getReg();
1053
1054 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1055
1056 std::optional<ValueAndVReg> ConstSelect =
1057 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1058 if (ConstSelect) {
1059 // The selector has to be an inline immediate, so we can use whatever for
1060 // the other operands.
1061 MIB.addReg(Val);
1062 MIB.addImm(ConstSelect->Value.getSExtValue() &
1063 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1064 } else {
1065 std::optional<ValueAndVReg> ConstVal =
1066 getIConstantVRegValWithLookThrough(Val, *MRI);
1067
1068 // If the value written is an inline immediate, we can get away without a
1069 // copy to m0.
1070 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1071 STI.hasInv2PiInlineImm())) {
1072 MIB.addImm(ConstVal->Value.getSExtValue());
1073 MIB.addReg(LaneSelect);
1074 } else {
1075 MIB.addReg(Val);
1076
1077 // If the lane selector was originally in a VGPR and copied with
1078 // readfirstlane, there's a hazard to read the same SGPR from the
1079 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1080 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1081
1082 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1083 .addReg(LaneSelect);
1084 MIB.addReg(AMDGPU::M0);
1085 }
1086 }
1087
1088 MIB.addReg(VDstIn);
1089
1090 MI.eraseFromParent();
1091 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1092}
1093
1094// We need to handle this here because tablegen doesn't support matching
1095// instructions with multiple outputs.
1096bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1097 Register Dst0 = MI.getOperand(0).getReg();
1098 Register Dst1 = MI.getOperand(1).getReg();
1099
1100 LLT Ty = MRI->getType(Dst0);
1101 unsigned Opc;
1102 if (Ty == LLT::scalar(32))
1103 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1104 else if (Ty == LLT::scalar(64))
1105 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1106 else
1107 return false;
1108
1109 // TODO: Match source modifiers.
1110
1111 const DebugLoc &DL = MI.getDebugLoc();
1112 MachineBasicBlock *MBB = MI.getParent();
1113
1114 Register Numer = MI.getOperand(3).getReg();
1115 Register Denom = MI.getOperand(4).getReg();
1116 unsigned ChooseDenom = MI.getOperand(5).getImm();
1117
1118 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1119
1120 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1121 .addDef(Dst1)
1122 .addImm(0) // $src0_modifiers
1123 .addUse(Src0) // $src0
1124 .addImm(0) // $src1_modifiers
1125 .addUse(Denom) // $src1
1126 .addImm(0) // $src2_modifiers
1127 .addUse(Numer) // $src2
1128 .addImm(0) // $clamp
1129 .addImm(0); // $omod
1130
1131 MI.eraseFromParent();
1132 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1133}
1134
1135bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1136 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1137 switch (IntrinsicID) {
1138 case Intrinsic::amdgcn_if_break: {
1139 MachineBasicBlock *BB = I.getParent();
1140
1141 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1142 // SelectionDAG uses for wave32 vs wave64.
1143 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1144 .add(I.getOperand(0))
1145 .add(I.getOperand(2))
1146 .add(I.getOperand(3));
1147
1148 Register DstReg = I.getOperand(0).getReg();
1149 Register Src0Reg = I.getOperand(2).getReg();
1150 Register Src1Reg = I.getOperand(3).getReg();
1151
1152 I.eraseFromParent();
1153
1154 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1155 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1156
1157 return true;
1158 }
1159 case Intrinsic::amdgcn_interp_p1_f16:
1160 return selectInterpP1F16(I);
1161 case Intrinsic::amdgcn_wqm:
1162 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1163 case Intrinsic::amdgcn_softwqm:
1164 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1165 case Intrinsic::amdgcn_strict_wwm:
1166 case Intrinsic::amdgcn_wwm:
1167 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1168 case Intrinsic::amdgcn_strict_wqm:
1169 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1170 case Intrinsic::amdgcn_writelane:
1171 return selectWritelane(I);
1172 case Intrinsic::amdgcn_div_scale:
1173 return selectDivScale(I);
1174 case Intrinsic::amdgcn_icmp:
1175 case Intrinsic::amdgcn_fcmp:
1176 if (selectImpl(I, *CoverageInfo))
1177 return true;
1178 return selectIntrinsicCmp(I);
1179 case Intrinsic::amdgcn_ballot:
1180 return selectBallot(I);
1181 case Intrinsic::amdgcn_reloc_constant:
1182 return selectRelocConstant(I);
1183 case Intrinsic::amdgcn_groupstaticsize:
1184 return selectGroupStaticSize(I);
1185 case Intrinsic::returnaddress:
1186 return selectReturnAddress(I);
1187 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1188 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1189 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1190 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1191 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1192 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1193 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1194 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1195 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1196 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1197 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1198 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1199 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1200 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1201 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1202 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1203 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1204 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1205 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1206 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1207 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1208 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1209 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1210 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1211 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1212 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1213 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1214 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1215 return selectSMFMACIntrin(I);
1216 case Intrinsic::amdgcn_permlane16_swap:
1217 case Intrinsic::amdgcn_permlane32_swap:
1218 return selectPermlaneSwapIntrin(I, IntrinsicID);
1219 default:
1220 return selectImpl(I, *CoverageInfo);
1221 }
1222}
1223
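// Map an IR compare predicate plus operand size onto the matching VALU V_CMP
// opcode, picking the 16-, 32- or 64-bit form (and the t16/fake16 variants
// where the subtarget has them). Returns -1 if no suitable opcode exists.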
1224static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1225 const GCNSubtarget &ST) {
1226 if (Size != 16 && Size != 32 && Size != 64)
1227 return -1;
1228
1229 if (Size == 16 && !ST.has16BitInsts())
1230 return -1;
1231
1232 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1233 unsigned FakeS16Opc, unsigned S32Opc,
1234 unsigned S64Opc) {
1235 if (Size == 16)
1236 return ST.hasTrue16BitInsts()
1237 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1238 : S16Opc;
1239 if (Size == 32)
1240 return S32Opc;
1241 return S64Opc;
1242 };
1243
1244 switch (P) {
1245 default:
1246 llvm_unreachable("Unknown condition code!");
1247 case CmpInst::ICMP_NE:
1248 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1249 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1250 AMDGPU::V_CMP_NE_U64_e64);
1251 case CmpInst::ICMP_EQ:
1252 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1253 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1254 AMDGPU::V_CMP_EQ_U64_e64);
1255 case CmpInst::ICMP_SGT:
1256 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1257 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1258 AMDGPU::V_CMP_GT_I64_e64);
1259 case CmpInst::ICMP_SGE:
1260 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1261 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1262 AMDGPU::V_CMP_GE_I64_e64);
1263 case CmpInst::ICMP_SLT:
1264 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1265 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1266 AMDGPU::V_CMP_LT_I64_e64);
1267 case CmpInst::ICMP_SLE:
1268 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1269 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1270 AMDGPU::V_CMP_LE_I64_e64);
1271 case CmpInst::ICMP_UGT:
1272 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1273 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1274 AMDGPU::V_CMP_GT_U64_e64);
1275 case CmpInst::ICMP_UGE:
1276 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1277 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1278 AMDGPU::V_CMP_GE_U64_e64);
1279 case CmpInst::ICMP_ULT:
1280 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1281 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1282 AMDGPU::V_CMP_LT_U64_e64);
1283 case CmpInst::ICMP_ULE:
1284 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1285 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1286 AMDGPU::V_CMP_LE_U64_e64);
1287
1288 case CmpInst::FCMP_OEQ:
1289 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1290 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1291 AMDGPU::V_CMP_EQ_F64_e64);
1292 case CmpInst::FCMP_OGT:
1293 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1294 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1295 AMDGPU::V_CMP_GT_F64_e64);
1296 case CmpInst::FCMP_OGE:
1297 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1298 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1299 AMDGPU::V_CMP_GE_F64_e64);
1300 case CmpInst::FCMP_OLT:
1301 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1302 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1303 AMDGPU::V_CMP_LT_F64_e64);
1304 case CmpInst::FCMP_OLE:
1305 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1306 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1307 AMDGPU::V_CMP_LE_F64_e64);
1308 case CmpInst::FCMP_ONE:
1309 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1310 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1311 AMDGPU::V_CMP_NEQ_F64_e64);
1312 case CmpInst::FCMP_ORD:
1313 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1314 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1315 AMDGPU::V_CMP_O_F64_e64);
1316 case CmpInst::FCMP_UNO:
1317 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1318 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1319 AMDGPU::V_CMP_U_F64_e64);
1320 case CmpInst::FCMP_UEQ:
1321 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1322 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1323 AMDGPU::V_CMP_NLG_F64_e64);
1324 case CmpInst::FCMP_UGT:
1325 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1326 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1327 AMDGPU::V_CMP_NLE_F64_e64);
1328 case CmpInst::FCMP_UGE:
1329 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1330 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1331 AMDGPU::V_CMP_NLT_F64_e64);
1332 case CmpInst::FCMP_ULT:
1333 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1334 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1335 AMDGPU::V_CMP_NGE_F64_e64);
1336 case CmpInst::FCMP_ULE:
1337 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1338 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1339 AMDGPU::V_CMP_NGT_F64_e64);
1340 case CmpInst::FCMP_UNE:
1341 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1342 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1343 AMDGPU::V_CMP_NEQ_F64_e64);
1344 case CmpInst::FCMP_TRUE:
1345 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1346 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1347 AMDGPU::V_CMP_TRU_F64_e64);
1348 case CmpInst::FCMP_FALSE:
1349 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1350 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1351 AMDGPU::V_CMP_F_F64_e64);
1352 }
1353}
1354
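// Scalar counterpart of getV_CMPOpcode: return the S_CMP opcode for this
// predicate and size, or -1 when no SALU form exists (64-bit compares other
// than EQ/NE, or f16 compares without SALU float support).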
1355int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1356 unsigned Size) const {
1357 if (Size == 64) {
1358 if (!STI.hasScalarCompareEq64())
1359 return -1;
1360
1361 switch (P) {
1362 case CmpInst::ICMP_NE:
1363 return AMDGPU::S_CMP_LG_U64;
1364 case CmpInst::ICMP_EQ:
1365 return AMDGPU::S_CMP_EQ_U64;
1366 default:
1367 return -1;
1368 }
1369 }
1370
1371 if (Size == 32) {
1372 switch (P) {
1373 case CmpInst::ICMP_NE:
1374 return AMDGPU::S_CMP_LG_U32;
1375 case CmpInst::ICMP_EQ:
1376 return AMDGPU::S_CMP_EQ_U32;
1377 case CmpInst::ICMP_SGT:
1378 return AMDGPU::S_CMP_GT_I32;
1379 case CmpInst::ICMP_SGE:
1380 return AMDGPU::S_CMP_GE_I32;
1381 case CmpInst::ICMP_SLT:
1382 return AMDGPU::S_CMP_LT_I32;
1383 case CmpInst::ICMP_SLE:
1384 return AMDGPU::S_CMP_LE_I32;
1385 case CmpInst::ICMP_UGT:
1386 return AMDGPU::S_CMP_GT_U32;
1387 case CmpInst::ICMP_UGE:
1388 return AMDGPU::S_CMP_GE_U32;
1389 case CmpInst::ICMP_ULT:
1390 return AMDGPU::S_CMP_LT_U32;
1391 case CmpInst::ICMP_ULE:
1392 return AMDGPU::S_CMP_LE_U32;
1393 case CmpInst::FCMP_OEQ:
1394 return AMDGPU::S_CMP_EQ_F32;
1395 case CmpInst::FCMP_OGT:
1396 return AMDGPU::S_CMP_GT_F32;
1397 case CmpInst::FCMP_OGE:
1398 return AMDGPU::S_CMP_GE_F32;
1399 case CmpInst::FCMP_OLT:
1400 return AMDGPU::S_CMP_LT_F32;
1401 case CmpInst::FCMP_OLE:
1402 return AMDGPU::S_CMP_LE_F32;
1403 case CmpInst::FCMP_ONE:
1404 return AMDGPU::S_CMP_LG_F32;
1405 case CmpInst::FCMP_ORD:
1406 return AMDGPU::S_CMP_O_F32;
1407 case CmpInst::FCMP_UNO:
1408 return AMDGPU::S_CMP_U_F32;
1409 case CmpInst::FCMP_UEQ:
1410 return AMDGPU::S_CMP_NLG_F32;
1411 case CmpInst::FCMP_UGT:
1412 return AMDGPU::S_CMP_NLE_F32;
1413 case CmpInst::FCMP_UGE:
1414 return AMDGPU::S_CMP_NLT_F32;
1415 case CmpInst::FCMP_ULT:
1416 return AMDGPU::S_CMP_NGE_F32;
1417 case CmpInst::FCMP_ULE:
1418 return AMDGPU::S_CMP_NGT_F32;
1419 case CmpInst::FCMP_UNE:
1420 return AMDGPU::S_CMP_NEQ_F32;
1421 default:
1422 llvm_unreachable("Unknown condition code!");
1423 }
1424 }
1425
1426 if (Size == 16) {
1427 if (!STI.hasSALUFloatInsts())
1428 return -1;
1429
1430 switch (P) {
1431 case CmpInst::FCMP_OEQ:
1432 return AMDGPU::S_CMP_EQ_F16;
1433 case CmpInst::FCMP_OGT:
1434 return AMDGPU::S_CMP_GT_F16;
1435 case CmpInst::FCMP_OGE:
1436 return AMDGPU::S_CMP_GE_F16;
1437 case CmpInst::FCMP_OLT:
1438 return AMDGPU::S_CMP_LT_F16;
1439 case CmpInst::FCMP_OLE:
1440 return AMDGPU::S_CMP_LE_F16;
1441 case CmpInst::FCMP_ONE:
1442 return AMDGPU::S_CMP_LG_F16;
1443 case CmpInst::FCMP_ORD:
1444 return AMDGPU::S_CMP_O_F16;
1445 case CmpInst::FCMP_UNO:
1446 return AMDGPU::S_CMP_U_F16;
1447 case CmpInst::FCMP_UEQ:
1448 return AMDGPU::S_CMP_NLG_F16;
1449 case CmpInst::FCMP_UGT:
1450 return AMDGPU::S_CMP_NLE_F16;
1451 case CmpInst::FCMP_UGE:
1452 return AMDGPU::S_CMP_NLT_F16;
1453 case CmpInst::FCMP_ULT:
1454 return AMDGPU::S_CMP_NGE_F16;
1455 case CmpInst::FCMP_ULE:
1456 return AMDGPU::S_CMP_NGT_F16;
1457 case CmpInst::FCMP_UNE:
1458 return AMDGPU::S_CMP_NEQ_F16;
1459 default:
1460 llvm_unreachable("Unknown condition code!");
1461 }
1462 }
1463
1464 return -1;
1465}
1466
1467bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1468
1469 MachineBasicBlock *BB = I.getParent();
1470 const DebugLoc &DL = I.getDebugLoc();
1471
1472 Register SrcReg = I.getOperand(2).getReg();
1473 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1474
1475 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1476
1477 Register CCReg = I.getOperand(0).getReg();
1478 if (!isVCC(CCReg, *MRI)) {
1479 int Opcode = getS_CMPOpcode(Pred, Size);
1480 if (Opcode == -1)
1481 return false;
1482 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1483 .add(I.getOperand(2))
1484 .add(I.getOperand(3));
1485 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1486 .addReg(AMDGPU::SCC);
1487 bool Ret =
1488 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1489 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1490 I.eraseFromParent();
1491 return Ret;
1492 }
1493
1494 if (I.getOpcode() == AMDGPU::G_FCMP)
1495 return false;
1496
1497 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1498 if (Opcode == -1)
1499 return false;
1500
1501 MachineInstrBuilder ICmp;
1502 // t16 instructions
1503 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1504 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1505 .addImm(0)
1506 .add(I.getOperand(2))
1507 .addImm(0)
1508 .add(I.getOperand(3))
1509 .addImm(0); // op_sel
1510 } else {
1511 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1512 .add(I.getOperand(2))
1513 .add(I.getOperand(3));
1514 }
1515
1516 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1517 *TRI.getBoolRC(), *MRI);
1518 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1519 I.eraseFromParent();
1520 return Ret;
1521}
1522
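// Select amdgcn.icmp / amdgcn.fcmp: unlike G_ICMP/G_FCMP these return the
// result as a wave-sized lane mask held in SGPRs, so the destination must not
// be a VCC-bank boolean and the compare is always emitted in VALU form.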
1523bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1524 Register Dst = I.getOperand(0).getReg();
1525 if (isVCC(Dst, *MRI))
1526 return false;
1527
1528 LLT DstTy = MRI->getType(Dst);
1529 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1530 return false;
1531
1532 MachineBasicBlock *BB = I.getParent();
1533 const DebugLoc &DL = I.getDebugLoc();
1534 Register SrcReg = I.getOperand(2).getReg();
1535 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1536
1537 // i1 inputs are not supported in GlobalISel.
1538 if (Size == 1)
1539 return false;
1540
1541 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1542 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1543 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1544 I.eraseFromParent();
1545 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1546 }
1547
1548 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1549 if (Opcode == -1)
1550 return false;
1551
1552 MachineInstrBuilder SelectedMI;
1553 MachineOperand &LHS = I.getOperand(2);
1554 MachineOperand &RHS = I.getOperand(3);
1555 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1556 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1557 Register Src0Reg =
1558 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1559 Register Src1Reg =
1560 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1561 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1562 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1563 SelectedMI.addImm(Src0Mods);
1564 SelectedMI.addReg(Src0Reg);
1565 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1566 SelectedMI.addImm(Src1Mods);
1567 SelectedMI.addReg(Src1Reg);
1568 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1569 SelectedMI.addImm(0); // clamp
1570 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1571 SelectedMI.addImm(0); // op_sel
1572
1573 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1574 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1575 return false;
1576
1577 I.eraseFromParent();
1578 return true;
1579}
1580
1581// Ballot has to zero any bits of the input lane mask that are zero in the
1582// current exec; this is done as an AND with exec. For inputs produced by an
1583// instruction that already implicitly uses the same exec (e.g. a compare in
1584// the same basic block, or an SCC-to-VCC copy), a plain copy is used instead.
1585static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
1586 MachineBasicBlock *MBB) {
1587 MachineInstr *MI = MRI.getVRegDef(Reg);
1588 if (MI->getParent() != MBB)
1589 return false;
1590
1591 // Lane mask generated by SCC to VCC copy.
1592 if (MI->getOpcode() == AMDGPU::COPY) {
1593 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1594 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1595 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1596 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1597 return true;
1598 }
1599
1600 // Lane mask generated using compare with same exec.
1601 if (isa<GAnyCmp>(MI))
1602 return true;
1603
1604 Register LHS, RHS;
1605 // Look through AND.
1606 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1607 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1608 isLaneMaskFromSameBlock(RHS, MRI, MBB);
1609
1610 return false;
1611}
1612
1613bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1614 MachineBasicBlock *BB = I.getParent();
1615 const DebugLoc &DL = I.getDebugLoc();
1616 Register DstReg = I.getOperand(0).getReg();
1617 Register SrcReg = I.getOperand(2).getReg();
1618 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1619 const unsigned WaveSize = STI.getWavefrontSize();
1620
1621 // In the common case, the return type matches the wave size.
1622 // However we also support emitting i64 ballots in wave32 mode.
1623 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1624 return false;
1625
1626 std::optional<ValueAndVReg> Arg =
1627 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1628
1629 Register Dst = DstReg;
1630 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1631 if (BallotSize != WaveSize) {
1632 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1633 }
1634
1635 if (Arg) {
1636 const int64_t Value = Arg->Value.getZExtValue();
1637 if (Value == 0) {
1638 // Dst = S_MOV 0
1639 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1640 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1641 } else {
1642 // Dst = COPY EXEC
1643 assert(Value == 1);
1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1645 }
1646 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1647 return false;
1648 } else {
1649 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1650 // Dst = COPY SrcReg
1651 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1652 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1653 return false;
1654 } else {
1655 // Dst = S_AND SrcReg, EXEC
1656 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1657 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1658 .addReg(SrcReg)
1659 .addReg(TRI.getExec())
1660 .setOperandDead(3); // Dead scc
1661 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1662 return false;
1663 }
1664 }
1665
1666 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1667 if (BallotSize != WaveSize) {
1668 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1669 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1670 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1671 .addReg(Dst)
1672 .addImm(AMDGPU::sub0)
1673 .addReg(HiReg)
1674 .addImm(AMDGPU::sub1);
1675 }
1676
1677 I.eraseFromParent();
1678 return true;
1679}
1680
1681bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1682 Register DstReg = I.getOperand(0).getReg();
1683 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1684 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1685 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1686 return false;
1687
1688 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1689
1690 Module *M = MF->getFunction().getParent();
1691 const MDNode *Metadata = I.getOperand(2).getMetadata();
1692 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1693 auto *RelocSymbol = cast<GlobalVariable>(
1694 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1695
1696 MachineBasicBlock *BB = I.getParent();
1697 BuildMI(*BB, &I, I.getDebugLoc(),
1698 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1699 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1700
1701 I.eraseFromParent();
1702 return true;
1703}
1704
1705bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1706 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1707
1708 Register DstReg = I.getOperand(0).getReg();
1709 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1710 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1711 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1712
1713 MachineBasicBlock *MBB = I.getParent();
1714 const DebugLoc &DL = I.getDebugLoc();
1715
1716 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1717
1718 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1719 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1720 MIB.addImm(MFI->getLDSSize());
1721 } else {
1722 Module *M = MF->getFunction().getParent();
1723 const GlobalValue *GV =
1724 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1725 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1726 }
1727
1728 I.eraseFromParent();
1729 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1730}
1731
1732bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1733 MachineBasicBlock *MBB = I.getParent();
1734 MachineFunction &MF = *MBB->getParent();
1735 const DebugLoc &DL = I.getDebugLoc();
1736
1737 MachineOperand &Dst = I.getOperand(0);
1738 Register DstReg = Dst.getReg();
1739 unsigned Depth = I.getOperand(2).getImm();
1740
1741 const TargetRegisterClass *RC
1742 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1743 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1744 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1745 return false;
1746
1747 // Check for kernel and shader functions
1748 if (Depth != 0 ||
1749 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1750 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1751 .addImm(0);
1752 I.eraseFromParent();
1753 return true;
1754 }
1755
1756 MachineFrameInfo &MFI = MF.getFrameInfo();
1757 // There is a call to @llvm.returnaddress in this function
1758 MFI.setReturnAddressIsTaken(true);
1759
1760 // Get the return address reg and mark it as an implicit live-in
1761 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1762 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1763 AMDGPU::SReg_64RegClass, DL);
1764 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1765 .addReg(LiveIn);
1766 I.eraseFromParent();
1767 return true;
1768}
1769
1770bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1771 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1772 // SelectionDAG uses for wave32 vs wave64.
1773 MachineBasicBlock *BB = MI.getParent();
1774 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1775 .add(MI.getOperand(1));
1776
1777 Register Reg = MI.getOperand(1).getReg();
1778 MI.eraseFromParent();
1779
1780 if (!MRI->getRegClassOrNull(Reg))
1781 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1782 return true;
1783}
1784
1785bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1786 MachineInstr &MI, Intrinsic::ID IntrID) const {
1787 MachineBasicBlock *MBB = MI.getParent();
1788 MachineFunction *MF = MBB->getParent();
1789 const DebugLoc &DL = MI.getDebugLoc();
1790
1791 unsigned IndexOperand = MI.getOperand(7).getImm();
1792 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1793 bool WaveDone = MI.getOperand(9).getImm() != 0;
1794
1795 if (WaveDone && !WaveRelease) {
1796 // TODO: Move this to IR verifier
1797 const Function &Fn = MF->getFunction();
1798 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1799 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1800 }
1801
1802 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1803 IndexOperand &= ~0x3f;
1804 unsigned CountDw = 0;
1805
1806 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1807 CountDw = (IndexOperand >> 24) & 0xf;
1808 IndexOperand &= ~(0xf << 24);
1809
1810 if (CountDw < 1 || CountDw > 4) {
1811 const Function &Fn = MF->getFunction();
1812 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1813 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1814 CountDw = 1;
1815 }
1816 }
1817
1818 if (IndexOperand) {
1819 const Function &Fn = MF->getFunction();
1820 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1821 Fn, "ds_ordered_count: bad index operand", DL));
1822 }
1823
1824 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1825 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1826
1827 unsigned Offset0 = OrderedCountIndex << 2;
1828 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1829
1830 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1831 Offset1 |= (CountDw - 1) << 6;
1832
1833 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1834 Offset1 |= ShaderType << 2;
1835
1836 unsigned Offset = Offset0 | (Offset1 << 8);
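 // For example, ds_ordered_add (Instruction = 0) with OrderedCountIndex = 1,
 // WaveRelease = 1 and WaveDone = 0 gives Offset0 = 0x4 and Offset1 = 0x1
 // (ignoring the subtarget-dependent dword-count and shader-type fields), so
 // the packed value is 0x4 | (0x1 << 8) = 0x104.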
1837
1838 Register M0Val = MI.getOperand(2).getReg();
1839 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1840 .addReg(M0Val);
1841
1842 Register DstReg = MI.getOperand(0).getReg();
1843 Register ValReg = MI.getOperand(3).getReg();
1844 MachineInstrBuilder DS =
1845 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1846 .addReg(ValReg)
1847 .addImm(Offset)
1848 .cloneMemRefs(MI);
1849
1850 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1851 return false;
1852
1853 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1854 MI.eraseFromParent();
1855 return Ret;
1856}
1857
1858static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1859 switch (IntrID) {
1860 case Intrinsic::amdgcn_ds_gws_init:
1861 return AMDGPU::DS_GWS_INIT;
1862 case Intrinsic::amdgcn_ds_gws_barrier:
1863 return AMDGPU::DS_GWS_BARRIER;
1864 case Intrinsic::amdgcn_ds_gws_sema_v:
1865 return AMDGPU::DS_GWS_SEMA_V;
1866 case Intrinsic::amdgcn_ds_gws_sema_br:
1867 return AMDGPU::DS_GWS_SEMA_BR;
1868 case Intrinsic::amdgcn_ds_gws_sema_p:
1869 return AMDGPU::DS_GWS_SEMA_P;
1870 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1871 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1872 default:
1873 llvm_unreachable("not a gws intrinsic");
1874 }
1875}
1876
1877bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1878 Intrinsic::ID IID) const {
1879 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1880 !STI.hasGWSSemaReleaseAll()))
1881 return false;
1882
1883 // intrinsic ID, vsrc, offset
1884 const bool HasVSrc = MI.getNumOperands() == 3;
1885 assert(HasVSrc || MI.getNumOperands() == 2);
1886
1887 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1888 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1889 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1890 return false;
1891
1892 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1893 unsigned ImmOffset;
1894
1895 MachineBasicBlock *MBB = MI.getParent();
1896 const DebugLoc &DL = MI.getDebugLoc();
1897
1898 MachineInstr *Readfirstlane = nullptr;
1899
1900 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1901 // incoming offset, in case there's an add of a constant. We'll have to put it
1902 // back later.
1903 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1904 Readfirstlane = OffsetDef;
1905 BaseOffset = OffsetDef->getOperand(1).getReg();
1906 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1907 }
1908
1909 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1910 // If we have a constant offset, try to use the 0 in m0 as the base.
1911 // TODO: Look into changing the default m0 initialization value. If the
1912 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1913 // the immediate offset.
1914
1915 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1916 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1917 .addImm(0);
1918 } else {
1919 std::tie(BaseOffset, ImmOffset) =
1920 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
1921
1922 if (Readfirstlane) {
1923 // We have the constant offset now, so put the readfirstlane back on the
1924 // variable component.
1925 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1926 return false;
1927
1928 Readfirstlane->getOperand(1).setReg(BaseOffset);
1929 BaseOffset = Readfirstlane->getOperand(0).getReg();
1930 } else {
1931 if (!RBI.constrainGenericRegister(BaseOffset,
1932 AMDGPU::SReg_32RegClass, *MRI))
1933 return false;
1934 }
1935
1936 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1937 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1938 .addReg(BaseOffset)
1939 .addImm(16)
1940 .setOperandDead(3); // Dead scc
1941
1942 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1943 .addReg(M0Base);
1944 }
1945
1946 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1947 // offset field) % 64. Some versions of the programming guide omit the m0
1948 // part, or claim it's from offset 0.
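 // In other words, the variable part of the offset must land in M0[21:16],
 // which is why the non-constant base is shifted left by 16 before being
 // copied into m0 above, while a constant-only offset leaves m0 = 0 and is
 // encoded entirely in the instruction's offset field.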
1949
1950 unsigned Opc = gwsIntrinToOpcode(IID);
1951 const MCInstrDesc &InstrDesc = TII.get(Opc);
1952
1953 if (HasVSrc) {
1954 Register VSrc = MI.getOperand(1).getReg();
1955
1956 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
1957 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
1958 const TargetRegisterClass *SubRC =
1959 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
1960
1961 if (!SubRC) {
1962 // 32-bit normal case.
1963 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
1964 return false;
1965
1966 BuildMI(*MBB, &MI, DL, InstrDesc)
1967 .addReg(VSrc)
1968 .addImm(ImmOffset)
1969 .cloneMemRefs(MI);
1970 } else {
1971 // Requires even register alignment, so create 64-bit value and pad the
1972 // top half with undef.
1973 Register DataReg = MRI->createVirtualRegister(DataRC);
1974 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
1975 return false;
1976
1977 Register UndefReg = MRI->createVirtualRegister(SubRC);
1978 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1979 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
1980 .addReg(VSrc)
1981 .addImm(AMDGPU::sub0)
1982 .addReg(UndefReg)
1983 .addImm(AMDGPU::sub1);
1984
1985 BuildMI(*MBB, &MI, DL, InstrDesc)
1986 .addReg(DataReg)
1987 .addImm(ImmOffset)
1988 .cloneMemRefs(MI);
1989 }
1990 } else {
1991 BuildMI(*MBB, &MI, DL, InstrDesc)
1992 .addImm(ImmOffset)
1993 .cloneMemRefs(MI);
1994 }
1995
1996 MI.eraseFromParent();
1997 return true;
1998}
1999
2000bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2001 bool IsAppend) const {
2002 Register PtrBase = MI.getOperand(2).getReg();
2003 LLT PtrTy = MRI->getType(PtrBase);
2004 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2005
2006 unsigned Offset;
2007 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2008
2009 // TODO: Should this try to look through readfirstlane like GWS?
2010 if (!isDSOffsetLegal(PtrBase, Offset)) {
2011 PtrBase = MI.getOperand(2).getReg();
2012 Offset = 0;
2013 }
2014
2015 MachineBasicBlock *MBB = MI.getParent();
2016 const DebugLoc &DL = MI.getDebugLoc();
2017 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2018
2019 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2020 .addReg(PtrBase);
2021 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2022 return false;
2023
2024 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2025 .addImm(Offset)
2026 .addImm(IsGDS ? -1 : 0)
2027 .cloneMemRefs(MI);
2028 MI.eraseFromParent();
2029 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2030}
2031
2032bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2033 MachineFunction *MF = MI.getMF();
2034 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2035
2036 MFInfo->setInitWholeWave();
2037 return selectImpl(MI, *CoverageInfo);
2038}
2039
2040static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
2041 bool &IsTexFail) {
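 // TexFailCtrl bit 0 is TFE and bit 1 is LWE; e.g. a value of 0x3 enables
 // both, while any other set bit makes this helper return false and the
 // intrinsic gets rejected.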
2042 if (TexFailCtrl)
2043 IsTexFail = true;
2044
2045 TFE = TexFailCtrl & 0x1;
2046 TexFailCtrl &= ~(uint64_t)0x1;
2047 LWE = TexFailCtrl & 0x2;
2048 TexFailCtrl &= ~(uint64_t)0x2;
2049
2050 return TexFailCtrl == 0;
2051}
2052
2053bool AMDGPUInstructionSelector::selectImageIntrinsic(
2054 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2055 MachineBasicBlock *MBB = MI.getParent();
2056 const DebugLoc &DL = MI.getDebugLoc();
2057 unsigned IntrOpcode = Intr->BaseOpcode;
2058
2059 // For image atomic: use no-return opcode if result is unused.
2060 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2061 Register ResultDef = MI.getOperand(0).getReg();
2062 if (MRI->use_nodbg_empty(ResultDef))
2063 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2064 }
2065
2066 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2067 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
2068 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2069 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2070 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2071 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2072 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2073
2074 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2075
2076 Register VDataIn = AMDGPU::NoRegister;
2077 Register VDataOut = AMDGPU::NoRegister;
2078 LLT VDataTy;
2079 int NumVDataDwords = -1;
2080 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2081 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2082
2083 bool Unorm;
2084 if (!BaseOpcode->Sampler)
2085 Unorm = true;
2086 else
2087 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2088
2089 bool TFE;
2090 bool LWE;
2091 bool IsTexFail = false;
2092 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2093 TFE, LWE, IsTexFail))
2094 return false;
2095
2096 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2097 const bool IsA16 = (Flags & 1) != 0;
2098 const bool IsG16 = (Flags & 2) != 0;
2099
2100 // A16 implies 16 bit gradients if subtarget doesn't support G16
2101 if (IsA16 && !STI.hasG16() && !IsG16)
2102 return false;
2103
2104 unsigned DMask = 0;
2105 unsigned DMaskLanes = 0;
2106
2107 if (BaseOpcode->Atomic) {
2108 if (!BaseOpcode->NoReturn)
2109 VDataOut = MI.getOperand(0).getReg();
2110 VDataIn = MI.getOperand(2).getReg();
2111 LLT Ty = MRI->getType(VDataIn);
2112
2113 // Be careful to allow atomic swap on 16-bit element vectors.
2114 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2115 Ty.getSizeInBits() == 128 :
2116 Ty.getSizeInBits() == 64;
2117
2118 if (BaseOpcode->AtomicX2) {
2119 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2120
2121 DMask = Is64Bit ? 0xf : 0x3;
2122 NumVDataDwords = Is64Bit ? 4 : 2;
2123 } else {
2124 DMask = Is64Bit ? 0x3 : 0x1;
2125 NumVDataDwords = Is64Bit ? 2 : 1;
2126 }
2127 } else {
2128 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2129 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2130
2131 if (BaseOpcode->Store) {
2132 VDataIn = MI.getOperand(1).getReg();
2133 VDataTy = MRI->getType(VDataIn);
2134 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2135 } else if (BaseOpcode->NoReturn) {
2136 NumVDataDwords = 0;
2137 } else {
2138 VDataOut = MI.getOperand(0).getReg();
2139 VDataTy = MRI->getType(VDataOut);
2140 NumVDataDwords = DMaskLanes;
2141
2142 if (IsD16 && !STI.hasUnpackedD16VMem())
2143 NumVDataDwords = (DMaskLanes + 1) / 2;
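 // e.g. a D16 load with dmask = 0xf on a packed-D16 subtarget returns four
 // half-words in (4 + 1) / 2 = 2 dwords.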
2144 }
2145 }
2146
2147 // Set G16 opcode
2148 if (Subtarget->hasG16() && IsG16) {
2149 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2150 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
2151 assert(G16MappingInfo);
2152 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2153 }
2154
2155 // TODO: Check this in verifier.
2156 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2157
2158 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2159 // Keep GLC only when the atomic's result is actually used.
2160 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2161 CPol |= AMDGPU::CPol::GLC;
2162 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2163 AMDGPU::CPol::VOLATILE))
2164 return false;
2165
2166 int NumVAddrRegs = 0;
2167 int NumVAddrDwords = 0;
2168 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2169 // Skip the $noregs and 0s inserted during legalization.
2170 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2171 if (!AddrOp.isReg())
2172 continue; // XXX - Break?
2173
2174 Register Addr = AddrOp.getReg();
2175 if (!Addr)
2176 break;
2177
2178 ++NumVAddrRegs;
2179 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2180 }
2181
2182 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2183 // NSA, these should have been packed into a single value in the first
2184 // address register
2185 const bool UseNSA =
2186 NumVAddrRegs != 1 &&
2187 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2188 : NumVAddrDwords == NumVAddrRegs);
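 // For example, a 2D sample whose x and y coordinates are still separate
 // 32-bit VGPRs has NumVAddrRegs == NumVAddrDwords == 2 and may use NSA,
 // whereas coordinates already packed into one wider register
 // (NumVAddrRegs == 1) always use the sequential encoding.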
2189 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2190 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2191 return false;
2192 }
2193
2194 if (IsTexFail)
2195 ++NumVDataDwords;
2196
2197 int Opcode = -1;
2198 if (IsGFX12Plus) {
2199 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2200 NumVDataDwords, NumVAddrDwords);
2201 } else if (IsGFX11Plus) {
2202 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2203 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2204 : AMDGPU::MIMGEncGfx11Default,
2205 NumVDataDwords, NumVAddrDwords);
2206 } else if (IsGFX10Plus) {
2207 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2208 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2209 : AMDGPU::MIMGEncGfx10Default,
2210 NumVDataDwords, NumVAddrDwords);
2211 } else {
2212 if (Subtarget->hasGFX90AInsts()) {
2213 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2214 NumVDataDwords, NumVAddrDwords);
2215 if (Opcode == -1) {
2216 LLVM_DEBUG(
2217 dbgs()
2218 << "requested image instruction is not supported on this GPU\n");
2219 return false;
2220 }
2221 }
2222 if (Opcode == -1 &&
2223 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2224 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2225 NumVDataDwords, NumVAddrDwords);
2226 if (Opcode == -1)
2227 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2228 NumVDataDwords, NumVAddrDwords);
2229 }
2230 if (Opcode == -1)
2231 return false;
2232
2233 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2234 .cloneMemRefs(MI);
2235
2236 if (VDataOut) {
2237 if (BaseOpcode->AtomicX2) {
2238 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2239
2240 Register TmpReg = MRI->createVirtualRegister(
2241 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2242 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2243
2244 MIB.addDef(TmpReg);
2245 if (!MRI->use_empty(VDataOut)) {
2246 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2247 .addReg(TmpReg, RegState::Kill, SubReg);
2248 }
2249
2250 } else {
2251 MIB.addDef(VDataOut); // vdata output
2252 }
2253 }
2254
2255 if (VDataIn)
2256 MIB.addReg(VDataIn); // vdata input
2257
2258 for (int I = 0; I != NumVAddrRegs; ++I) {
2259 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2260 if (SrcOp.isReg()) {
2261 assert(SrcOp.getReg() != 0);
2262 MIB.addReg(SrcOp.getReg());
2263 }
2264 }
2265
2266 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2267 if (BaseOpcode->Sampler)
2268 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2269
2270 MIB.addImm(DMask); // dmask
2271
2272 if (IsGFX10Plus)
2273 MIB.addImm(DimInfo->Encoding);
2274 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2275 MIB.addImm(Unorm);
2276
2277 MIB.addImm(CPol);
2278 MIB.addImm(IsA16 && // a16 or r128
2279 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2280 if (IsGFX10Plus)
2281 MIB.addImm(IsA16 ? -1 : 0);
2282
2283 if (!Subtarget->hasGFX90AInsts()) {
2284 MIB.addImm(TFE); // tfe
2285 } else if (TFE) {
2286 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2287 return false;
2288 }
2289
2290 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2291 MIB.addImm(LWE); // lwe
2292 if (!IsGFX10Plus)
2293 MIB.addImm(DimInfo->DA ? -1 : 0);
2294 if (BaseOpcode->HasD16)
2295 MIB.addImm(IsD16 ? -1 : 0);
2296
2297 MI.eraseFromParent();
2298 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2299 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2300 return true;
2301}
2302
2303// We need to handle this here because tablegen doesn't support matching
2304// instructions with multiple outputs.
2305bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2306 MachineInstr &MI) const {
2307 Register Dst0 = MI.getOperand(0).getReg();
2308 Register Dst1 = MI.getOperand(1).getReg();
2309
2310 const DebugLoc &DL = MI.getDebugLoc();
2311 MachineBasicBlock *MBB = MI.getParent();
2312
2313 Register Addr = MI.getOperand(3).getReg();
2314 Register Data0 = MI.getOperand(4).getReg();
2315 Register Data1 = MI.getOperand(5).getReg();
2316 unsigned Offset = MI.getOperand(6).getImm();
2317
2318 unsigned Opc;
2319 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2320 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2321 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2322 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2323 break;
2324 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2325 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2326 break;
2327 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2328 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2329 break;
2330 }
2331
2332 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2333 .addDef(Dst1)
2334 .addUse(Addr)
2335 .addUse(Data0)
2336 .addUse(Data1)
2337 .addImm(Offset)
2338 .cloneMemRefs(MI);
2339
2340 MI.eraseFromParent();
2341 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2342}
2343
2344bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2345 MachineInstr &I) const {
2346 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2347 switch (IntrinsicID) {
2348 case Intrinsic::amdgcn_end_cf:
2349 return selectEndCfIntrinsic(I);
2350 case Intrinsic::amdgcn_ds_ordered_add:
2351 case Intrinsic::amdgcn_ds_ordered_swap:
2352 return selectDSOrderedIntrinsic(I, IntrinsicID);
2353 case Intrinsic::amdgcn_ds_gws_init:
2354 case Intrinsic::amdgcn_ds_gws_barrier:
2355 case Intrinsic::amdgcn_ds_gws_sema_v:
2356 case Intrinsic::amdgcn_ds_gws_sema_br:
2357 case Intrinsic::amdgcn_ds_gws_sema_p:
2358 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2359 return selectDSGWSIntrinsic(I, IntrinsicID);
2360 case Intrinsic::amdgcn_ds_append:
2361 return selectDSAppendConsume(I, true);
2362 case Intrinsic::amdgcn_ds_consume:
2363 return selectDSAppendConsume(I, false);
2364 case Intrinsic::amdgcn_init_whole_wave:
2365 return selectInitWholeWave(I);
2366 case Intrinsic::amdgcn_raw_buffer_load_lds:
2367 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2368 case Intrinsic::amdgcn_struct_buffer_load_lds:
2369 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2370 return selectBufferLoadLds(I);
2371 // Until we can store both the address space of the global and the LDS
2372 // arguments by having two MachineMemOperands on an intrinsic, we just trust
2373 // that the argument is a global pointer (buffer pointers have been handled by
2374 // an LLVM IR-level lowering).
2375 case Intrinsic::amdgcn_load_to_lds:
2376 case Intrinsic::amdgcn_global_load_lds:
2377 return selectGlobalLoadLds(I);
2378 case Intrinsic::amdgcn_exp_compr:
2379 if (!STI.hasCompressedExport()) {
2380 Function &F = I.getMF()->getFunction();
2381 F.getContext().diagnose(
2382 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget",
2383 I.getDebugLoc(), DS_Error));
2384 return false;
2385 }
2386 break;
2387 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2388 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2389 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2390 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2391 return selectDSBvhStackIntrinsic(I);
2392 case Intrinsic::amdgcn_s_barrier_init:
2393 case Intrinsic::amdgcn_s_barrier_signal_var:
2394 return selectNamedBarrierInit(I, IntrinsicID);
2395 case Intrinsic::amdgcn_s_barrier_join:
2396 case Intrinsic::amdgcn_s_get_named_barrier_state:
2397 return selectNamedBarrierInst(I, IntrinsicID);
2398 case Intrinsic::amdgcn_s_get_barrier_state:
2399 return selectSGetBarrierState(I, IntrinsicID);
2400 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2401 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2402 }
2403 return selectImpl(I, *CoverageInfo);
2404}
2405
2406bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2407 if (selectImpl(I, *CoverageInfo))
2408 return true;
2409
2410 MachineBasicBlock *BB = I.getParent();
2411 const DebugLoc &DL = I.getDebugLoc();
2412
2413 Register DstReg = I.getOperand(0).getReg();
2414 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2415 assert(Size <= 32 || Size == 64);
2416 const MachineOperand &CCOp = I.getOperand(1);
2417 Register CCReg = CCOp.getReg();
2418 if (!isVCC(CCReg, *MRI)) {
2419 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2420 AMDGPU::S_CSELECT_B32;
2421 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2422 .addReg(CCReg);
2423
2424 // The generic constrainSelectedInstRegOperands doesn't work for the scc
2425 // register bank, because it does not cover the register class we use to
2426 // represent it, so manually set the register class here.
2427 if (!MRI->getRegClassOrNull(CCReg))
2428 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2429 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2430 .add(I.getOperand(2))
2431 .add(I.getOperand(3));
2432
2433 bool Ret = false;
2434 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2435 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2436 I.eraseFromParent();
2437 return Ret;
2438 }
2439
2440 // Wide VGPR select should have been split in RegBankSelect.
2441 if (Size > 32)
2442 return false;
2443
2444 MachineInstr *Select =
2445 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2446 .addImm(0)
2447 .add(I.getOperand(3))
2448 .addImm(0)
2449 .add(I.getOperand(2))
2450 .add(I.getOperand(1));
2451
2452 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2453 I.eraseFromParent();
2454 return Ret;
2455}
2456
2457bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2458 Register DstReg = I.getOperand(0).getReg();
2459 Register SrcReg = I.getOperand(1).getReg();
2460 const LLT DstTy = MRI->getType(DstReg);
2461 const LLT SrcTy = MRI->getType(SrcReg);
2462 const LLT S1 = LLT::scalar(1);
2463
2464 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2465 const RegisterBank *DstRB;
2466 if (DstTy == S1) {
2467 // This is a special case. We don't treat s1 for legalization artifacts as
2468 // vcc booleans.
2469 DstRB = SrcRB;
2470 } else {
2471 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2472 if (SrcRB != DstRB)
2473 return false;
2474 }
2475
2476 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2477
2478 unsigned DstSize = DstTy.getSizeInBits();
2479 unsigned SrcSize = SrcTy.getSizeInBits();
2480
2481 const TargetRegisterClass *SrcRC =
2482 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2483 const TargetRegisterClass *DstRC =
2484 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2485 if (!SrcRC || !DstRC)
2486 return false;
2487
2488 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2489 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2490 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2491 return false;
2492 }
2493
2494 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2495 assert(STI.useRealTrue16Insts());
2496 const DebugLoc &DL = I.getDebugLoc();
2497 MachineBasicBlock *MBB = I.getParent();
2498 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2499 .addReg(SrcReg, 0, AMDGPU::lo16);
2500 I.eraseFromParent();
2501 return true;
2502 }
2503
2504 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2505 MachineBasicBlock *MBB = I.getParent();
2506 const DebugLoc &DL = I.getDebugLoc();
2507
2508 Register LoReg = MRI->createVirtualRegister(DstRC);
2509 Register HiReg = MRI->createVirtualRegister(DstRC);
2510 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2511 .addReg(SrcReg, 0, AMDGPU::sub0);
2512 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2513 .addReg(SrcReg, 0, AMDGPU::sub1);
2514
2515 if (IsVALU && STI.hasSDWA()) {
2516 // Write the low 16-bits of the high element into the high 16-bits of the
2517 // low element.
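 // i.e. dst_sel = WORD_1 with dst_unused = UNUSED_PRESERVE writes HiReg's low
 // word into the high 16 bits of DstReg while keeping the low 16 bits coming
 // from the tied LoReg operand, packing both halves into one VGPR.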
2518 MachineInstr *MovSDWA =
2519 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2520 .addImm(0) // $src0_modifiers
2521 .addReg(HiReg) // $src0
2522 .addImm(0) // $clamp
2523 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2524 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2525 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2526 .addReg(LoReg, RegState::Implicit);
2527 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2528 } else {
2529 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2530 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2531 Register ImmReg = MRI->createVirtualRegister(DstRC);
2532 if (IsVALU) {
2533 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2534 .addImm(16)
2535 .addReg(HiReg);
2536 } else {
2537 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2538 .addReg(HiReg)
2539 .addImm(16)
2540 .setOperandDead(3); // Dead scc
2541 }
2542
2543 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2544 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2545 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2546
2547 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2548 .addImm(0xffff);
2549 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2550 .addReg(LoReg)
2551 .addReg(ImmReg);
2552 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2553 .addReg(TmpReg0)
2554 .addReg(TmpReg1);
2555
2556 if (!IsVALU) {
2557 And.setOperandDead(3); // Dead scc
2558 Or.setOperandDead(3); // Dead scc
2559 }
2560 }
2561
2562 I.eraseFromParent();
2563 return true;
2564 }
2565
2566 if (!DstTy.isScalar())
2567 return false;
2568
2569 if (SrcSize > 32) {
2570 unsigned SubRegIdx = DstSize < 32
2571 ? static_cast<unsigned>(AMDGPU::sub0)
2572 : TRI.getSubRegFromChannel(0, DstSize / 32);
2573 if (SubRegIdx == AMDGPU::NoSubRegister)
2574 return false;
2575
2576 // Deal with weird cases where the class only partially supports the subreg
2577 // index.
2578 const TargetRegisterClass *SrcWithSubRC
2579 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2580 if (!SrcWithSubRC)
2581 return false;
2582
2583 if (SrcWithSubRC != SrcRC) {
2584 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2585 return false;
2586 }
2587
2588 I.getOperand(1).setSubReg(SubRegIdx);
2589 }
2590
2591 I.setDesc(TII.get(TargetOpcode::COPY));
2592 return true;
2593}
2594
2595/// \returns true if a bitmask for \p Size bits will be an inline immediate.
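/// For example, Size = 4 yields Mask = 0xf, a valid inline immediate, so a
/// single AND is used; Size = 16 yields 0xffff, which would need a literal,
/// so the callers fall back to a BFE instead.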
2596static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2597 Mask = maskTrailingOnes<unsigned>(Size);
2598 int SignedMask = static_cast<int>(Mask);
2599 return SignedMask >= -16 && SignedMask <= 64;
2600}
2601
2602// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2603const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2604 Register Reg, const MachineRegisterInfo &MRI,
2605 const TargetRegisterInfo &TRI) const {
2606 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2607 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2608 return RB;
2609
2610 // Ignore the type, since we don't use vcc in artifacts.
2611 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2612 return &RBI.getRegBankFromRegClass(*RC, LLT());
2613 return nullptr;
2614}
2615
2616bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2617 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2618 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2619 const DebugLoc &DL = I.getDebugLoc();
2620 MachineBasicBlock &MBB = *I.getParent();
2621 const Register DstReg = I.getOperand(0).getReg();
2622 const Register SrcReg = I.getOperand(1).getReg();
2623
2624 const LLT DstTy = MRI->getType(DstReg);
2625 const LLT SrcTy = MRI->getType(SrcReg);
2626 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2627 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2628 const unsigned DstSize = DstTy.getSizeInBits();
2629 if (!DstTy.isScalar())
2630 return false;
2631
2632 // Artifact casts should never use vcc.
2633 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2634
2635 // FIXME: This should probably be illegal and split earlier.
2636 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2637 if (DstSize <= 32)
2638 return selectCOPY(I);
2639
2640 const TargetRegisterClass *SrcRC =
2641 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2642 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2643 const TargetRegisterClass *DstRC =
2644 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2645
2646 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2647 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2648 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2649 .addReg(SrcReg)
2650 .addImm(AMDGPU::sub0)
2651 .addReg(UndefReg)
2652 .addImm(AMDGPU::sub1);
2653 I.eraseFromParent();
2654
2655 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2656 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2657 }
2658
2659 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2660 // 64-bit should have been split up in RegBankSelect
2661
2662 // Try to use an and with a mask if it will save code size.
2663 unsigned Mask;
2664 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2665 MachineInstr *ExtI =
2666 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2667 .addImm(Mask)
2668 .addReg(SrcReg);
2669 I.eraseFromParent();
2670 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2671 }
2672
2673 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2674 MachineInstr *ExtI =
2675 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2676 .addReg(SrcReg)
2677 .addImm(0) // Offset
2678 .addImm(SrcSize); // Width
2679 I.eraseFromParent();
2680 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2681 }
2682
2683 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2684 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2685 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2686 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2687 return false;
2688
2689 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2690 const unsigned SextOpc = SrcSize == 8 ?
2691 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2692 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2693 .addReg(SrcReg);
2694 I.eraseFromParent();
2695 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2696 }
2697
2698 // Using a single 32-bit SALU to calculate the high half is smaller than
2699 // S_BFE with a literal constant operand.
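 // e.g. a sign extension from s32 to s64 becomes S_ASHR_I32 Src, 31 for the
 // high word plus a REG_SEQUENCE, instead of a 64-bit BFE with a literal.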
2700 if (DstSize > 32 && SrcSize == 32) {
2701 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2702 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2703 if (Signed) {
2704 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2705 .addReg(SrcReg, 0, SubReg)
2706 .addImm(31)
2707 .setOperandDead(3); // Dead scc
2708 } else {
2709 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2710 .addImm(0);
2711 }
2712 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2713 .addReg(SrcReg, 0, SubReg)
2714 .addImm(AMDGPU::sub0)
2715 .addReg(HiReg)
2716 .addImm(AMDGPU::sub1);
2717 I.eraseFromParent();
2718 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2719 *MRI);
2720 }
2721
2722 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2723 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2724
2725 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
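 // e.g. extracting an 8-bit field at offset 0 is encoded as 8 << 16 = 0x80000,
 // which is exactly the immediate the addImm(SrcSize << 16) calls below build.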
2726 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2727 // We need a 64-bit register source, but the high bits don't matter.
2728 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2729 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2730 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2731
2732 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2733 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2734 .addReg(SrcReg, 0, SubReg)
2735 .addImm(AMDGPU::sub0)
2736 .addReg(UndefReg)
2737 .addImm(AMDGPU::sub1);
2738
2739 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2740 .addReg(ExtReg)
2741 .addImm(SrcSize << 16);
2742
2743 I.eraseFromParent();
2744 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2745 }
2746
2747 unsigned Mask;
2748 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2749 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2750 .addReg(SrcReg)
2751 .addImm(Mask)
2752 .setOperandDead(3); // Dead scc
2753 } else {
2754 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2755 .addReg(SrcReg)
2756 .addImm(SrcSize << 16);
2757 }
2758
2759 I.eraseFromParent();
2760 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2761 }
2762
2763 return false;
2764}
2765
2766static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
2767 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
2768}
2769
2770static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
2771 Register BitcastSrc;
2772 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2773 Reg = BitcastSrc;
2774 return Reg;
2775}
2776
2777static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2778 Register &Out) {
2779 Register Trunc;
2780 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2781 return false;
2782
2783 Register LShlSrc;
2784 Register Cst;
2785 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2786 Cst = stripCopy(Cst, MRI);
2787 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2788 Out = stripBitCast(LShlSrc, MRI);
2789 return true;
2790 }
2791 }
2792
2793 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2794 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2795 return false;
2796
2797 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2798 LLT::fixed_vector(2, 16));
2799
2800 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2801 assert(Mask.size() == 2);
2802
2803 if (Mask[0] == 1 && Mask[1] <= 1) {
2804 Out = Shuffle->getOperand(0).getReg();
2805 return true;
2806 }
2807
2808 return false;
2809}
2810
2811bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2812 if (!Subtarget->hasSALUFloatInsts())
2813 return false;
2814
2815 Register Dst = I.getOperand(0).getReg();
2816 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2817 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2818 return false;
2819
2820 Register Src = I.getOperand(1).getReg();
2821
2822 if (MRI->getType(Dst) == LLT::scalar(32) &&
2823 MRI->getType(Src) == LLT::scalar(16)) {
2824 if (isExtractHiElt(*MRI, Src, Src)) {
2825 MachineBasicBlock *BB = I.getParent();
2826 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2827 .addUse(Src);
2828 I.eraseFromParent();
2829 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2830 }
2831 }
2832
2833 return false;
2834}
2835
2836bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2837 // Only manually handle the f64 SGPR case.
2838 //
2839 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2840 // the bit ops theoretically have a second result due to the implicit def of
2841 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2842 // that is easy by disabling the check. The result works, but uses a
2843 // nonsensical sreg32orlds_and_sreg_1 regclass.
2844 //
2845 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2846 // the variadic REG_SEQUENCE operands.
2847
2848 Register Dst = MI.getOperand(0).getReg();
2849 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2850 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2851 MRI->getType(Dst) != LLT::scalar(64))
2852 return false;
2853
2854 Register Src = MI.getOperand(1).getReg();
2855 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2856 if (Fabs)
2857 Src = Fabs->getOperand(1).getReg();
2858
2859 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2860 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2861 return false;
2862
2863 MachineBasicBlock *BB = MI.getParent();
2864 const DebugLoc &DL = MI.getDebugLoc();
2865 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2866 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2867 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2868 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2869
2870 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2871 .addReg(Src, 0, AMDGPU::sub0);
2872 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2873 .addReg(Src, 0, AMDGPU::sub1);
2874 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2875 .addImm(0x80000000);
2876
2877 // Set or toggle sign bit.
2878 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2879 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2880 .addReg(HiReg)
2881 .addReg(ConstReg)
2882 .setOperandDead(3); // Dead scc
2883 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2884 .addReg(LoReg)
2885 .addImm(AMDGPU::sub0)
2886 .addReg(OpReg)
2887 .addImm(AMDGPU::sub1);
2888 MI.eraseFromParent();
2889 return true;
2890}
2891
2892// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2893bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2894 Register Dst = MI.getOperand(0).getReg();
2895 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2896 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2897 MRI->getType(Dst) != LLT::scalar(64))
2898 return false;
2899
2900 Register Src = MI.getOperand(1).getReg();
2901 MachineBasicBlock *BB = MI.getParent();
2902 const DebugLoc &DL = MI.getDebugLoc();
2903 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2904 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2905 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2906 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2907
2908 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2909 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2910 return false;
2911
2912 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2913 .addReg(Src, 0, AMDGPU::sub0);
2914 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2915 .addReg(Src, 0, AMDGPU::sub1);
2916 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2917 .addImm(0x7fffffff);
2918
2919 // Clear sign bit.
2920 // TODO: Should this use S_BITSET0_*?
2921 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2922 .addReg(HiReg)
2923 .addReg(ConstReg)
2924 .setOperandDead(3); // Dead scc
2925 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2926 .addReg(LoReg)
2927 .addImm(AMDGPU::sub0)
2928 .addReg(OpReg)
2929 .addImm(AMDGPU::sub1);
2930
2931 MI.eraseFromParent();
2932 return true;
2933}
2934
2935static bool isConstant(const MachineInstr &MI) {
2936 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2937}
2938
2939void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2940 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2941
2942 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2943 const MachineInstr *PtrMI =
2944 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2945
2946 assert(PtrMI);
2947
2948 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2949 return;
2950
2951 GEPInfo GEPInfo;
2952
2953 for (unsigned i = 1; i != 3; ++i) {
2954 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2955 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2956 assert(OpDef);
2957 if (i == 2 && isConstant(*OpDef)) {
2958 // TODO: Could handle constant base + variable offset, but a combine
2959 // probably should have commuted it.
2960 assert(GEPInfo.Imm == 0);
2961 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2962 continue;
2963 }
2964 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2965 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2966 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2967 else
2968 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2969 }
2970
2971 AddrInfo.push_back(GEPInfo);
2972 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2973}
2974
2975bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2976 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2977}
2978
2979bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2980 if (!MI.hasOneMemOperand())
2981 return false;
2982
2983 const MachineMemOperand *MMO = *MI.memoperands_begin();
2984 const Value *Ptr = MMO->getValue();
2985
2986 // UndefValue means this is a load of a kernel input. These are uniform.
2987 // Sometimes LDS instructions have constant pointers.
2988 // If Ptr is null, then that means this mem operand contains a
2989 // PseudoSourceValue like GOT.
2990 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr))
2991 return true;
2992
2993 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2994 return true;
2995
2996 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2997 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2998 AMDGPU::SGPRRegBankID;
2999
3000 const Instruction *I = dyn_cast<Instruction>(Ptr);
3001 return I && I->getMetadata("amdgpu.uniform");
3002}
3003
3004bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3005 for (const GEPInfo &GEPInfo : AddrInfo) {
3006 if (!GEPInfo.VgprParts.empty())
3007 return true;
3008 }
3009 return false;
3010}
3011
3012void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3013 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3014 unsigned AS = PtrTy.getAddressSpace();
3015 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
3016 STI.ldsRequiresM0Init()) {
3017 MachineBasicBlock *BB = I.getParent();
3018
3019 // If DS instructions require M0 initialization, insert it before selecting.
3020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3021 .addImm(-1);
3022 }
3023}
3024
3025bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3026 MachineInstr &I) const {
3027 initM0(I);
3028 return selectImpl(I, *CoverageInfo);
3029}
3030
3031static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
3032 if (Reg.isPhysical())
3033 return false;
3034
3035 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
3036 const unsigned Opcode = MI.getOpcode();
3037
3038 if (Opcode == AMDGPU::COPY)
3039 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3040
3041 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3042 Opcode == AMDGPU::G_XOR)
3043 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3044 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3045
3046 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3047 return GI->is(Intrinsic::amdgcn_class);
3048
3049 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3050}
3051
3052bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3053 MachineBasicBlock *BB = I.getParent();
3054 MachineOperand &CondOp = I.getOperand(0);
3055 Register CondReg = CondOp.getReg();
3056 const DebugLoc &DL = I.getDebugLoc();
3057
3058 unsigned BrOpcode;
3059 Register CondPhysReg;
3060 const TargetRegisterClass *ConstrainRC;
3061
3062 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3063 // whether the branch is uniform when selecting the instruction. In
3064 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3065 // RegBankSelect knows what it's doing if the branch condition is scc, even
3066 // though it currently does not.
3067 if (!isVCC(CondReg, *MRI)) {
3068 if (MRI->getType(CondReg) != LLT::scalar(32))
3069 return false;
3070
3071 CondPhysReg = AMDGPU::SCC;
3072 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3073 ConstrainRC = &AMDGPU::SReg_32RegClass;
3074 } else {
3075 // FIXME: Should scc->vcc copies be ANDed with exec?
3076
3077 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3078 // need to insert an and with exec.
3079 if (!isVCmpResult(CondReg, *MRI)) {
3080 const bool Is64 = STI.isWave64();
3081 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3082 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3083
3084 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3085 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3086 .addReg(CondReg)
3087 .addReg(Exec)
3088 .setOperandDead(3); // Dead scc
3089 CondReg = TmpReg;
3090 }
3091
3092 CondPhysReg = TRI.getVCC();
3093 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3094 ConstrainRC = TRI.getBoolRC();
3095 }
3096
3097 if (!MRI->getRegClassOrNull(CondReg))
3098 MRI->setRegClass(CondReg, ConstrainRC);
3099
3100 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3101 .addReg(CondReg);
3102 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3103 .addMBB(I.getOperand(1).getMBB());
3104
3105 I.eraseFromParent();
3106 return true;
3107}
3108
3109bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3110 MachineInstr &I) const {
3111 Register DstReg = I.getOperand(0).getReg();
3112 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3113 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3114 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3115 if (IsVGPR)
3116 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3117
3118 return RBI.constrainGenericRegister(
3119 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3120}
3121
3122bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3123 Register DstReg = I.getOperand(0).getReg();
3124 Register SrcReg = I.getOperand(1).getReg();
3125 Register MaskReg = I.getOperand(2).getReg();
3126 LLT Ty = MRI->getType(DstReg);
3127 LLT MaskTy = MRI->getType(MaskReg);
3128 MachineBasicBlock *BB = I.getParent();
3129 const DebugLoc &DL = I.getDebugLoc();
3130
3131 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3132 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3133 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3134 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3135 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3136 return false;
3137
3138 // Try to avoid emitting a bit operation when we only need to touch half of
3139 // the 64-bit pointer.
3140 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3141 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3142 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3143
3144 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3145 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
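 // For example, masking a 64-bit pointer down to 256-byte alignment uses the
 // mask 0xffffffffffffff00: its high 32 bits are all known ones, so
 // CanCopyHi32 is true and only the low half needs a real AND.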
3146
3147 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3148 !CanCopyLow32 && !CanCopyHi32) {
3149 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3150 .addReg(SrcReg)
3151 .addReg(MaskReg)
3152 .setOperandDead(3); // Dead scc
3153 I.eraseFromParent();
3154 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3155 }
3156
3157 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3158 const TargetRegisterClass &RegRC
3159 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3160
3161 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3162 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3163 const TargetRegisterClass *MaskRC =
3164 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3165
3166 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3167 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3168 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3169 return false;
3170
3171 if (Ty.getSizeInBits() == 32) {
3172 assert(MaskTy.getSizeInBits() == 32 &&
3173 "ptrmask should have been narrowed during legalize");
3174
3175 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3176 .addReg(SrcReg)
3177 .addReg(MaskReg);
3178
3179 if (!IsVGPR)
3180 NewOp.setOperandDead(3); // Dead scc
3181 I.eraseFromParent();
3182 return true;
3183 }
3184
3185 Register HiReg = MRI->createVirtualRegister(&RegRC);
3186 Register LoReg = MRI->createVirtualRegister(&RegRC);
3187
3188 // Extract the subregisters from the source pointer.
3189 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3190 .addReg(SrcReg, 0, AMDGPU::sub0);
3191 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3192 .addReg(SrcReg, 0, AMDGPU::sub1);
3193
3194 Register MaskedLo, MaskedHi;
3195
3196 if (CanCopyLow32) {
3197 // If all the bits in the low half are 1, we only need a copy for it.
3198 MaskedLo = LoReg;
3199 } else {
3200 // Extract the mask subregister and apply the and.
3201 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3202 MaskedLo = MRI->createVirtualRegister(&RegRC);
3203
3204 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3205 .addReg(MaskReg, 0, AMDGPU::sub0);
3206 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3207 .addReg(LoReg)
3208 .addReg(MaskLo);
3209 }
3210
3211 if (CanCopyHi32) {
3212 // If all the bits in the high half are 1, we only need a copy for it.
3213 MaskedHi = HiReg;
3214 } else {
3215 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3216 MaskedHi = MRI->createVirtualRegister(&RegRC);
3217
3218 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3219 .addReg(MaskReg, 0, AMDGPU::sub1);
3220 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3221 .addReg(HiReg)
3222 .addReg(MaskHi);
3223 }
3224
3225 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3226 .addReg(MaskedLo)
3227 .addImm(AMDGPU::sub0)
3228 .addReg(MaskedHi)
3229 .addImm(AMDGPU::sub1);
3230 I.eraseFromParent();
3231 return true;
3232}
3233
3234/// Return the register to use for the index value, and the subregister to use
3235/// for the indirectly accessed register.
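/// For example, with 32-bit elements (EltSize == 4) and a constant offset of 3
/// folded out of the index, the result is the remaining base index register
/// paired with the AMDGPU::sub3 subregister, provided the offset is in bounds.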
3236static std::pair<Register, unsigned>
3237computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3238 const TargetRegisterClass *SuperRC, Register IdxReg,
3239 unsigned EltSize, GISelValueTracking &ValueTracking) {
3240 Register IdxBaseReg;
3241 int Offset;
3242
3243 std::tie(IdxBaseReg, Offset) =
3244 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3245 if (IdxBaseReg == AMDGPU::NoRegister) {
3246 // This will happen if the index is a known constant. This should ordinarily
3247 // be legalized out, but handle it as a register just in case.
3248 assert(Offset == 0);
3249 IdxBaseReg = IdxReg;
3250 }
3251
3252 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3253
3254 // Skip out of bounds offsets, or else we would end up using an undefined
3255 // register.
3256 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3257 return std::pair(IdxReg, SubRegs[0]);
3258 return std::pair(IdxBaseReg, SubRegs[Offset]);
3259}
3260
3261bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3262 MachineInstr &MI) const {
3263 Register DstReg = MI.getOperand(0).getReg();
3264 Register SrcReg = MI.getOperand(1).getReg();
3265 Register IdxReg = MI.getOperand(2).getReg();
3266
3267 LLT DstTy = MRI->getType(DstReg);
3268 LLT SrcTy = MRI->getType(SrcReg);
3269
3270 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3271 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3272 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3273
3274 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3275 // into a waterfall loop.
3276 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3277 return false;
3278
3279 const TargetRegisterClass *SrcRC =
3280 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3281 const TargetRegisterClass *DstRC =
3282 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3283 if (!SrcRC || !DstRC)
3284 return false;
3285 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3286 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3287 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3288 return false;
3289
3290 MachineBasicBlock *BB = MI.getParent();
3291 const DebugLoc &DL = MI.getDebugLoc();
3292 const bool Is64 = DstTy.getSizeInBits() == 64;
3293
3294 unsigned SubReg;
3295 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3296 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3297
3298 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3299 if (DstTy.getSizeInBits() != 32 && !Is64)
3300 return false;
3301
3302 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3303 .addReg(IdxReg);
3304
3305 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3306 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3307 .addReg(SrcReg, 0, SubReg)
3308 .addReg(SrcReg, RegState::Implicit);
3309 MI.eraseFromParent();
3310 return true;
3311 }
3312
3313 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3314 return false;
3315
3316 if (!STI.useVGPRIndexMode()) {
3317 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3318 .addReg(IdxReg);
3319 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3320 .addReg(SrcReg, 0, SubReg)
3321 .addReg(SrcReg, RegState::Implicit);
3322 MI.eraseFromParent();
3323 return true;
3324 }
3325
3326 const MCInstrDesc &GPRIDXDesc =
3327 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3328 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3329 .addReg(SrcReg)
3330 .addReg(IdxReg)
3331 .addImm(SubReg);
3332
3333 MI.eraseFromParent();
3334 return true;
3335}
3336
3337// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3338bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3339 MachineInstr &MI) const {
3340 Register DstReg = MI.getOperand(0).getReg();
3341 Register VecReg = MI.getOperand(1).getReg();
3342 Register ValReg = MI.getOperand(2).getReg();
3343 Register IdxReg = MI.getOperand(3).getReg();
3344
3345 LLT VecTy = MRI->getType(DstReg);
3346 LLT ValTy = MRI->getType(ValReg);
3347 unsigned VecSize = VecTy.getSizeInBits();
3348 unsigned ValSize = ValTy.getSizeInBits();
3349
3350 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3351 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3352 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3353
3354 assert(VecTy.getElementType() == ValTy);
3355
3356 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3357 // into a waterfall loop.
3358 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3359 return false;
3360
3361 const TargetRegisterClass *VecRC =
3362 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3363 const TargetRegisterClass *ValRC =
3364 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3365
3366 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3367 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3368 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3369 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3370 return false;
3371
3372 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3373 return false;
3374
3375 unsigned SubReg;
3376 std::tie(IdxReg, SubReg) =
3377 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3378
3379 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3380 STI.useVGPRIndexMode();
3381
3382 MachineBasicBlock *BB = MI.getParent();
3383 const DebugLoc &DL = MI.getDebugLoc();
3384
3385 if (!IndexMode) {
3386 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3387 .addReg(IdxReg);
3388
3389 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3390 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3391 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3392 .addReg(VecReg)
3393 .addReg(ValReg)
3394 .addImm(SubReg);
3395 MI.eraseFromParent();
3396 return true;
3397 }
3398
3399 const MCInstrDesc &GPRIDXDesc =
3400 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3401 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3402 .addReg(VecReg)
3403 .addReg(ValReg)
3404 .addReg(IdxReg)
3405 .addImm(SubReg);
3406
3407 MI.eraseFromParent();
3408 return true;
3409}
3410
3411bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3412 if (!Subtarget->hasVMemToLDSLoad())
3413 return false;
3414 unsigned Opc;
3415 unsigned Size = MI.getOperand(3).getImm();
3416
3417 // The struct intrinsic variants add one additional operand over raw.
3418 const bool HasVIndex = MI.getNumOperands() == 9;
3419 Register VIndex;
3420 int OpOffset = 0;
3421 if (HasVIndex) {
3422 VIndex = MI.getOperand(4).getReg();
3423 OpOffset = 1;
3424 }
3425
3426 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3427 std::optional<ValueAndVReg> MaybeVOffset =
3428 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3429 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
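  // A VGPR offset operand is needed unless the offset is a known zero
  // constant; this selects between the BOTHEN/OFFEN and IDXEN/OFFSET forms.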
3430
3431 switch (Size) {
3432 default:
3433 return false;
3434 case 1:
3435 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3436 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3437 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3438 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3439 break;
3440 case 2:
3441 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3442 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3443 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3444 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3445 break;
3446 case 4:
3447 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3448 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3449 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3450 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3451 break;
3452 case 12:
3453 if (!Subtarget->hasLDSLoadB96_B128())
3454 return false;
3455
3456 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3457 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3458 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3459 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3460 break;
3461 case 16:
3462 if (!Subtarget->hasLDSLoadB96_B128())
3463 return false;
3464
3465 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3466 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3467 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3468 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3469 break;
3470 }
3471
3472 MachineBasicBlock *MBB = MI.getParent();
3473 const DebugLoc &DL = MI.getDebugLoc();
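  // The LDS destination base address comes from operand 2 of the intrinsic and
  // is passed to the buffer instruction implicitly through M0.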
3474 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3475 .add(MI.getOperand(2));
3476
3477 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3478
3479 if (HasVIndex && HasVOffset) {
3480 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3481 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3482 .addReg(VIndex)
3483 .addImm(AMDGPU::sub0)
3484 .addReg(VOffset)
3485 .addImm(AMDGPU::sub1);
3486
3487 MIB.addReg(IdxReg);
3488 } else if (HasVIndex) {
3489 MIB.addReg(VIndex);
3490 } else if (HasVOffset) {
3491 MIB.addReg(VOffset);
3492 }
3493
3494 MIB.add(MI.getOperand(1)); // rsrc
3495 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3496 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3497 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
3498 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3499 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3500 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3501 MIB.addImm(
3502 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3503 ? 1
3504 : 0); // swz
3505
3506 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3507 // Don't set the offset value here because the pointer points to the base of
3508 // the buffer.
3509 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3510
3511 MachinePointerInfo StorePtrI = LoadPtrI;
3512 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3513 AMDGPUAS::BUFFER_RESOURCE));
3514 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
3515 StorePtrI.V = nullptr;
3516 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3517 auto F = LoadMMO->getFlags() &
3518 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3519 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3520 Size, LoadMMO->getBaseAlign());
3521
3522 MachineMemOperand *StoreMMO =
3523 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3524 sizeof(int32_t), LoadMMO->getBaseAlign());
3525
3526 MIB.setMemRefs({LoadMMO, StoreMMO});
3527
3528 MI.eraseFromParent();
3529 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3530}
3531
3532/// Match a zero extend from a 32-bit value to 64-bits.
3533Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3534 Register ZExtSrc;
3535 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3536 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3537
3538 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3539 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3540 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3541 return Register();
3542
3543 assert(Def->getNumOperands() == 3 &&
3544 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3545 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3546 return Def->getOperand(1).getReg();
3547 }
3548
3549 return Register();
3550}
3551
3552/// Match a sign extend from a 32-bit value to 64-bits.
3553Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3554 Register SExtSrc;
3555 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3556 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3557
3558 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR %x, 31)
3559 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3560 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3561 return Register();
3562
3563 assert(Def->getNumOperands() == 3 &&
3564 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3565 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3566 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3567 m_SpecificICst(31))))
3568 return Def->getOperand(1).getReg();
3569
3570 if (VT->signBitIsZero(Reg))
3571 return matchZeroExtendFromS32(Reg);
3572
3573 return Register();
3574}
3575
3576/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3577/// is 32-bit.
3578Register
3579 AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3580 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3581 : matchZeroExtendFromS32(Reg);
3582}
3583
3584/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3585/// is 32-bit.
3586Register
3587 AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3588 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3589 : matchSignExtendFromS32(Reg);
3590}
3591
3592Register
3593 AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3594 bool IsSigned) const {
3595 if (IsSigned)
3596 return matchSignExtendFromS32OrS32(Reg);
3597
3598 return matchZeroExtendFromS32OrS32(Reg);
3599}
3600
3601Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3602 Register AnyExtSrc;
3603 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3604 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3605
3606 // Match legalized form %anyext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3607 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3608 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3609 return Register();
3610
3611 assert(Def->getNumOperands() == 3 &&
3612 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3613
3614 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3615 return Def->getOperand(1).getReg();
3616
3617 return Register();
3618}
3619
3620bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3621 if (!Subtarget->hasVMemToLDSLoad())
3622 return false;
3623
3624 unsigned Opc;
3625 unsigned Size = MI.getOperand(3).getImm();
3626
3627 switch (Size) {
3628 default:
3629 return false;
3630 case 1:
3631 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3632 break;
3633 case 2:
3634 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3635 break;
3636 case 4:
3637 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3638 break;
3639 case 12:
3640 if (!Subtarget->hasLDSLoadB96_B128())
3641 return false;
3642 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3643 break;
3644 case 16:
3645 if (!Subtarget->hasLDSLoadB96_B128())
3646 return false;
3647 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3648 break;
3649 }
3650
3651 MachineBasicBlock *MBB = MI.getParent();
3652 const DebugLoc &DL = MI.getDebugLoc();
3653 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3654 .add(MI.getOperand(2));
3655
3656 Register Addr = MI.getOperand(1).getReg();
3657 Register VOffset;
3658 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3659 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3660 if (!isSGPR(Addr)) {
3661 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3662 if (isSGPR(AddrDef->Reg)) {
3663 Addr = AddrDef->Reg;
3664 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3665 Register SAddr =
3666 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3667 if (isSGPR(SAddr)) {
3668 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3669 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3670 Addr = SAddr;
3671 VOffset = Off;
3672 }
3673 }
3674 }
3675 }
3676
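  // With a uniform (SGPR) address the SADDR form of the opcode is used; it
  // still takes a VGPR offset operand, so materialize a zero VOffset if the
  // split above did not produce one.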
3677 if (isSGPR(Addr)) {
3678 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3679 if (!VOffset) {
3680 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3681 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3682 .addImm(0);
3683 }
3684 }
3685
3686 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3687 .addReg(Addr);
3688
3689 if (isSGPR(Addr))
3690 MIB.addReg(VOffset);
3691
3692 MIB.add(MI.getOperand(4)); // offset
3693
3694 unsigned Aux = MI.getOperand(5).getImm();
3695 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3696
3697 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3698 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3699 LoadPtrI.Offset = MI.getOperand(4).getImm();
3700 MachinePointerInfo StorePtrI = LoadPtrI;
3701 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3702 AMDGPUAS::GLOBAL_ADDRESS));
3703 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3704 StorePtrI.V = nullptr; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3705 auto F = LoadMMO->getFlags() &
3706 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3707 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3708 Size, LoadMMO->getBaseAlign());
3709 MachineMemOperand *StoreMMO =
3710 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3711 sizeof(int32_t), Align(4));
3712
3713 MIB.setMemRefs({LoadMMO, StoreMMO});
3714
3715 MI.eraseFromParent();
3716 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3717}
3718
3719bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3720 MachineInstr &MI) const {
3721 unsigned OpcodeOpIdx =
3722 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3723 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3724 MI.removeOperand(OpcodeOpIdx);
3725 MI.addImplicitDefUseOperands(*MI.getMF());
3726 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3727}
3728
3729// FIXME: This should be removed and let the patterns select. We just need the
3730// AGPR/VGPR combination versions.
3731bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3732 unsigned Opc;
3733 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3734 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3735 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3736 break;
3737 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3738 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3739 break;
3740 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3741 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3742 break;
3743 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3744 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3745 break;
3746 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3747 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3748 break;
3749 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3750 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3751 break;
3752 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3753 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3754 break;
3755 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3756 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3757 break;
3758 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3759 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3760 break;
3761 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3762 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3763 break;
3764 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3765 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3766 break;
3767 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3768 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3769 break;
3770 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3771 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3772 break;
3773 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3774 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3775 break;
3776 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3777 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3778 break;
3779 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3780 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3781 break;
3782 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3783 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3784 break;
3785 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3786 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3787 break;
3788 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3789 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3790 break;
3791 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3792 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3793 break;
3794 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3795 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3796 break;
3797 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3798 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3799 break;
3800 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3801 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3802 break;
3803 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3804 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3805 break;
3806 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3807 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3808 break;
3809 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3810 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3811 break;
3812 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3813 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3814 break;
3815 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3816 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3817 break;
3818 default:
3819 llvm_unreachable("unhandled smfmac intrinsic");
3820 }
3821
3822 auto VDst_In = MI.getOperand(4);
3823
3824 MI.setDesc(TII.get(Opc));
3825 MI.removeOperand(4); // VDst_In
3826 MI.removeOperand(1); // Intrinsic ID
3827 MI.addOperand(VDst_In); // Readd VDst_In to the end
3828 MI.addImplicitDefUseOperands(*MI.getMF());
3829 const MCInstrDesc &MCID = MI.getDesc();
3830 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
3831 MI.getOperand(0).setIsEarlyClobber(true);
3832 }
3833 return true;
3834}
3835
3836bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3837 MachineInstr &MI, Intrinsic::ID IntrID) const {
3838 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3839 !Subtarget->hasPermlane16Swap())
3840 return false;
3841 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3842 !Subtarget->hasPermlane32Swap())
3843 return false;
3844
3845 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3846 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3847 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3848
3849 MI.removeOperand(2);
3850 MI.setDesc(TII.get(Opcode));
3851 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3852
3853 MachineOperand &FI = MI.getOperand(4);
3854 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0);
3855
3856 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3857}
3858
3859bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3860 Register DstReg = MI.getOperand(0).getReg();
3861 Register SrcReg = MI.getOperand(1).getReg();
3862 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3863 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3864 MachineBasicBlock *MBB = MI.getParent();
3865 const DebugLoc &DL = MI.getDebugLoc();
3866
3867 if (IsVALU) {
3868 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3869 .addImm(Subtarget->getWavefrontSizeLog2())
3870 .addReg(SrcReg);
3871 } else {
3872 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3873 .addReg(SrcReg)
3874 .addImm(Subtarget->getWavefrontSizeLog2())
3875 .setOperandDead(3); // Dead scc
3876 }
3877
3878 const TargetRegisterClass &RC =
3879 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3880 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3881 return false;
3882
3883 MI.eraseFromParent();
3884 return true;
3885}
3886
3887// Match BITOP3 operation and return a number of matched instructions plus
3888// truth table.
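// For example, matching (a & b) ^ c assigns a, b and c the canonical patterns
// 0xf0, 0xcc and 0xaa, so the result is {2, (0xf0 & 0xcc) ^ 0xaa} = {2, 0x6a}.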
3889static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
3890 SmallVectorImpl<Register> &Src,
3891 const MachineRegisterInfo &MRI) {
3892 unsigned NumOpcodes = 0;
3893 uint8_t LHSBits, RHSBits;
3894
3895 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
3896 // Define truth table given Src0, Src1, Src2 bits permutations:
3897 // 0 0 0
3898 // 0 0 1
3899 // 0 1 0
3900 // 0 1 1
3901 // 1 0 0
3902 // 1 0 1
3903 // 1 1 0
3904 // 1 1 1
3905 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3906
3907 if (mi_match(Op, MRI, m_AllOnesInt())) {
3908 Bits = 0xff;
3909 return true;
3910 }
3911 if (mi_match(Op, MRI, m_ZeroInt())) {
3912 Bits = 0;
3913 return true;
3914 }
3915
3916 for (unsigned I = 0; I < Src.size(); ++I) {
3917 // Try to find existing reused operand
3918 if (Src[I] == Op) {
3919 Bits = SrcBits[I];
3920 return true;
3921 }
3922 // Try to replace parent operator
3923 if (Src[I] == R) {
3924 Bits = SrcBits[I];
3925 Src[I] = Op;
3926 return true;
3927 }
3928 }
3929
3930 if (Src.size() == 3) {
3931 // No room left for operands. Try one last time, there can be a 'not' of
3932 // one of our source operands. In this case we can compute the bits
3933 // without growing Src vector.
3934 Register LHS;
3935 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
3936 LHS = getSrcRegIgnoringCopies(LHS, MRI);
3937 for (unsigned I = 0; I < Src.size(); ++I) {
3938 if (Src[I] == LHS) {
3939 Bits = ~SrcBits[I];
3940 return true;
3941 }
3942 }
3943 }
3944
3945 return false;
3946 }
3947
3948 Bits = SrcBits[Src.size()];
3949 Src.push_back(Op);
3950 return true;
3951 };
3952
3953 MachineInstr *MI = MRI.getVRegDef(R);
3954 switch (MI->getOpcode()) {
3955 case TargetOpcode::G_AND:
3956 case TargetOpcode::G_OR:
3957 case TargetOpcode::G_XOR: {
3958 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
3959 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
3960
3961 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3962 if (!getOperandBits(LHS, LHSBits) ||
3963 !getOperandBits(RHS, RHSBits)) {
3964 Src = Backup;
3965 return std::make_pair(0, 0);
3966 }
3967
3968 // Recursion is naturally limited by the size of the operand vector.
3969 auto Op = BitOp3_Op(LHS, Src, MRI);
3970 if (Op.first) {
3971 NumOpcodes += Op.first;
3972 LHSBits = Op.second;
3973 }
3974
3975 Op = BitOp3_Op(RHS, Src, MRI);
3976 if (Op.first) {
3977 NumOpcodes += Op.first;
3978 RHSBits = Op.second;
3979 }
3980 break;
3981 }
3982 default:
3983 return std::make_pair(0, 0);
3984 }
3985
3986 uint8_t TTbl;
3987 switch (MI->getOpcode()) {
3988 case TargetOpcode::G_AND:
3989 TTbl = LHSBits & RHSBits;
3990 break;
3991 case TargetOpcode::G_OR:
3992 TTbl = LHSBits | RHSBits;
3993 break;
3994 case TargetOpcode::G_XOR:
3995 TTbl = LHSBits ^ RHSBits;
3996 break;
3997 default:
3998 break;
3999 }
4000
4001 return std::make_pair(NumOpcodes + 1, TTbl);
4002}
4003
4004bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4005 if (!Subtarget->hasBitOp3Insts())
4006 return false;
4007
4008 Register DstReg = MI.getOperand(0).getReg();
4009 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4010 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4011 if (!IsVALU)
4012 return false;
4013
4014 SmallVector<Register, 3> Src;
4015 uint8_t TTbl;
4016 unsigned NumOpcodes;
4017
4018 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4019
4020 // Src.empty() case can happen if all operands are all zero or all ones.
4021 // Normally it shall be optimized out before reaching this.
4022 if (NumOpcodes < 2 || Src.empty())
4023 return false;
4024
4025 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4026 if (NumOpcodes == 2 && IsB32) {
4027 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4028 // the asm more readable. This cannot be modeled with AddedComplexity because
4029 // the selector does not know how many operations we matched.
4030 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4031 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4032 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4033 return false;
4034 } else if (NumOpcodes < 4) {
4035 // For a uniform case the threshold should be higher to account for moves
4036 // between VGPRs and SGPRs. It needs one operand in a VGPR; the remaining two
4037 // can be in SGPRs, with a readfirstlane afterwards.
4038 return false;
4039 }
4040
4041 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4042 if (!IsB32 && STI.hasTrue16BitInsts())
4043 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4044 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4045 unsigned CBL = STI.getConstantBusLimit(Opc);
4046 MachineBasicBlock *MBB = MI.getParent();
4047 const DebugLoc &DL = MI.getDebugLoc();
4048
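  // A VALU instruction may read only a limited number of SGPR or constant
  // operands; copy any SGPR sources beyond the constant bus limit into VGPRs.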
4049 for (unsigned I = 0; I < Src.size(); ++I) {
4050 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4051 if (RB->getID() != AMDGPU::SGPRRegBankID)
4052 continue;
4053 if (CBL > 0) {
4054 --CBL;
4055 continue;
4056 }
4057 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4058 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4059 .addReg(Src[I]);
4060 Src[I] = NewReg;
4061 }
4062
4063 // The last operand can be ignored, turning a ternary operation into a binary.
4064 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4065 // 'c' with 'a' here without changing the answer. In some pathological
4066 // cases it should even be possible to end up with a single-operand operation,
4067 // if the optimizer does not catch it.
4068 while (Src.size() < 3)
4069 Src.push_back(Src[0]);
4070
4071 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4072 if (!IsB32)
4073 MIB.addImm(0); // src_mod0
4074 MIB.addReg(Src[0]);
4075 if (!IsB32)
4076 MIB.addImm(0); // src_mod1
4077 MIB.addReg(Src[1]);
4078 if (!IsB32)
4079 MIB.addImm(0); // src_mod2
4080 MIB.addReg(Src[2])
4081 .addImm(TTbl);
4082 if (!IsB32)
4083 MIB.addImm(0); // op_sel
4084
4085 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4086 MI.eraseFromParent();
4087
4088 return true;
4089}
4090
4091bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4092 Register SrcReg = MI.getOperand(0).getReg();
4093 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4094 return false;
4095
4096 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4097 Register SP =
4098 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4099 Register WaveAddr = getWaveAddress(DefMI);
4100 MachineBasicBlock *MBB = MI.getParent();
4101 const DebugLoc &DL = MI.getDebugLoc();
4102
4103 if (!WaveAddr) {
4104 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4105 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4106 .addReg(SrcReg)
4107 .addImm(Subtarget->getWavefrontSizeLog2())
4108 .setOperandDead(3); // Dead scc
4109 }
4110
4111 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4112 .addReg(WaveAddr);
4113
4114 MI.eraseFromParent();
4115 return true;
4116}
4117
4118bool AMDGPUInstructionSelector::select(MachineInstr &I) {
4119
4120 if (!I.isPreISelOpcode()) {
4121 if (I.isCopy())
4122 return selectCOPY(I);
4123 return true;
4124 }
4125
4126 switch (I.getOpcode()) {
4127 case TargetOpcode::G_AND:
4128 case TargetOpcode::G_OR:
4129 case TargetOpcode::G_XOR:
4130 if (selectBITOP3(I))
4131 return true;
4132 if (selectImpl(I, *CoverageInfo))
4133 return true;
4134 return selectG_AND_OR_XOR(I);
4135 case TargetOpcode::G_ADD:
4136 case TargetOpcode::G_SUB:
4137 case TargetOpcode::G_PTR_ADD:
4138 if (selectImpl(I, *CoverageInfo))
4139 return true;
4140 return selectG_ADD_SUB(I);
4141 case TargetOpcode::G_UADDO:
4142 case TargetOpcode::G_USUBO:
4143 case TargetOpcode::G_UADDE:
4144 case TargetOpcode::G_USUBE:
4145 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4146 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4147 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4148 return selectG_AMDGPU_MAD_64_32(I);
4149 case TargetOpcode::G_INTTOPTR:
4150 case TargetOpcode::G_BITCAST:
4151 case TargetOpcode::G_PTRTOINT:
4152 case TargetOpcode::G_FREEZE:
4153 return selectCOPY(I);
4154 case TargetOpcode::G_FNEG:
4155 if (selectImpl(I, *CoverageInfo))
4156 return true;
4157 return selectG_FNEG(I);
4158 case TargetOpcode::G_FABS:
4159 if (selectImpl(I, *CoverageInfo))
4160 return true;
4161 return selectG_FABS(I);
4162 case TargetOpcode::G_EXTRACT:
4163 return selectG_EXTRACT(I);
4164 case TargetOpcode::G_MERGE_VALUES:
4165 case TargetOpcode::G_CONCAT_VECTORS:
4166 return selectG_MERGE_VALUES(I);
4167 case TargetOpcode::G_UNMERGE_VALUES:
4168 return selectG_UNMERGE_VALUES(I);
4169 case TargetOpcode::G_BUILD_VECTOR:
4170 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4171 return selectG_BUILD_VECTOR(I);
4172 case TargetOpcode::G_IMPLICIT_DEF:
4173 return selectG_IMPLICIT_DEF(I);
4174 case TargetOpcode::G_INSERT:
4175 return selectG_INSERT(I);
4176 case TargetOpcode::G_INTRINSIC:
4177 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4178 return selectG_INTRINSIC(I);
4179 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4180 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4181 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4182 case TargetOpcode::G_ICMP:
4183 case TargetOpcode::G_FCMP:
4184 if (selectG_ICMP_or_FCMP(I))
4185 return true;
4186 return selectImpl(I, *CoverageInfo);
4187 case TargetOpcode::G_LOAD:
4188 case TargetOpcode::G_ZEXTLOAD:
4189 case TargetOpcode::G_SEXTLOAD:
4190 case TargetOpcode::G_STORE:
4191 case TargetOpcode::G_ATOMIC_CMPXCHG:
4192 case TargetOpcode::G_ATOMICRMW_XCHG:
4193 case TargetOpcode::G_ATOMICRMW_ADD:
4194 case TargetOpcode::G_ATOMICRMW_SUB:
4195 case TargetOpcode::G_ATOMICRMW_AND:
4196 case TargetOpcode::G_ATOMICRMW_OR:
4197 case TargetOpcode::G_ATOMICRMW_XOR:
4198 case TargetOpcode::G_ATOMICRMW_MIN:
4199 case TargetOpcode::G_ATOMICRMW_MAX:
4200 case TargetOpcode::G_ATOMICRMW_UMIN:
4201 case TargetOpcode::G_ATOMICRMW_UMAX:
4202 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4203 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4204 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4205 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4206 case TargetOpcode::G_ATOMICRMW_FADD:
4207 case TargetOpcode::G_ATOMICRMW_FMIN:
4208 case TargetOpcode::G_ATOMICRMW_FMAX:
4209 return selectG_LOAD_STORE_ATOMICRMW(I);
4210 case TargetOpcode::G_SELECT:
4211 return selectG_SELECT(I);
4212 case TargetOpcode::G_TRUNC:
4213 return selectG_TRUNC(I);
4214 case TargetOpcode::G_SEXT:
4215 case TargetOpcode::G_ZEXT:
4216 case TargetOpcode::G_ANYEXT:
4217 case TargetOpcode::G_SEXT_INREG:
4218 // This is a workaround. For extension from type i1, `selectImpl()` uses
4219 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4220 // type i1 can only be held in an SGPR class.
4221 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4222 selectImpl(I, *CoverageInfo))
4223 return true;
4224 return selectG_SZA_EXT(I);
4225 case TargetOpcode::G_FPEXT:
4226 if (selectG_FPEXT(I))
4227 return true;
4228 return selectImpl(I, *CoverageInfo);
4229 case TargetOpcode::G_BRCOND:
4230 return selectG_BRCOND(I);
4231 case TargetOpcode::G_GLOBAL_VALUE:
4232 return selectG_GLOBAL_VALUE(I);
4233 case TargetOpcode::G_PTRMASK:
4234 return selectG_PTRMASK(I);
4235 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4236 return selectG_EXTRACT_VECTOR_ELT(I);
4237 case TargetOpcode::G_INSERT_VECTOR_ELT:
4238 return selectG_INSERT_VECTOR_ELT(I);
4239 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4240 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4241 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4242 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4243 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4244 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4245 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4246 assert(Intr && "not an image intrinsic with image pseudo");
4247 return selectImageIntrinsic(I, Intr);
4248 }
4249 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4250 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4251 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4252 return selectBVHIntersectRayIntrinsic(I);
4253 case AMDGPU::G_SBFX:
4254 case AMDGPU::G_UBFX:
4255 return selectG_SBFX_UBFX(I);
4256 case AMDGPU::G_SI_CALL:
4257 I.setDesc(TII.get(AMDGPU::SI_CALL));
4258 return true;
4259 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4260 return selectWaveAddress(I);
4261 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4262 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4263 return true;
4264 }
4265 case AMDGPU::G_STACKRESTORE:
4266 return selectStackRestore(I);
4267 case AMDGPU::G_PHI:
4268 return selectPHI(I);
4269 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4270 return selectCOPY_SCC_VCC(I);
4271 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4272 return selectCOPY_VCC_SCC(I);
4273 case AMDGPU::G_AMDGPU_READANYLANE:
4274 return selectReadAnyLane(I);
4275 case TargetOpcode::G_CONSTANT:
4276 case TargetOpcode::G_FCONSTANT:
4277 default:
4278 return selectImpl(I, *CoverageInfo);
4279 }
4280 return false;
4281}
4282
4283InstructionSelector::ComplexRendererFns
4284 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4285 return {{
4286 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4287 }};
4288
4289}
4290
4291std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4292 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4293 unsigned Mods = 0;
4294 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4295
4296 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4297 Src = MI->getOperand(1).getReg();
4298 Mods |= SISrcMods::NEG;
4299 MI = getDefIgnoringCopies(Src, *MRI);
4300 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4301 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4302 // denormal mode, but we're implicitly canonicalizing in a source operand.
4303 const ConstantFP *LHS =
4304 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4305 if (LHS && LHS->isZero()) {
4306 Mods |= SISrcMods::NEG;
4307 Src = MI->getOperand(2).getReg();
4308 }
4309 }
4310
4311 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4312 Src = MI->getOperand(1).getReg();
4313 Mods |= SISrcMods::ABS;
4314 }
4315
4316 if (OpSel)
4317 Mods |= SISrcMods::OP_SEL_0;
4318
4319 return std::pair(Src, Mods);
4320}
4321
4322Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4323 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4324 bool ForceVGPR) const {
4325 if ((Mods != 0 || ForceVGPR) &&
4326 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4327
4328 // If we looked through copies to find source modifiers on an SGPR operand,
4329 // we now have an SGPR register source. To avoid potentially violating the
4330 // constant bus restriction, we need to insert a copy to a VGPR.
4331 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4332 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4333 TII.get(AMDGPU::COPY), VGPRSrc)
4334 .addReg(Src);
4335 Src = VGPRSrc;
4336 }
4337
4338 return Src;
4339}
4340
4341///
4342/// This will select either an SGPR or VGPR operand and will save us from
4343/// having to write an extra tablegen pattern.
4344InstructionSelector::ComplexRendererFns
4345 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4346 return {{
4347 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4348 }};
4349}
4350
4351InstructionSelector::ComplexRendererFns
4352 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4353 Register Src;
4354 unsigned Mods;
4355 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4356
4357 return {{
4358 [=](MachineInstrBuilder &MIB) {
4359 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4360 },
4361 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4362 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4363 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4364 }};
4365}
4366
4367InstructionSelector::ComplexRendererFns
4368 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4369 Register Src;
4370 unsigned Mods;
4371 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4372 /*IsCanonicalizing=*/true,
4373 /*AllowAbs=*/false);
4374
4375 return {{
4376 [=](MachineInstrBuilder &MIB) {
4377 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4378 },
4379 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4380 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4381 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4382 }};
4383}
4384
4385InstructionSelector::ComplexRendererFns
4386 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4387 return {{
4388 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4389 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4390 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4391 }};
4392}
4393
4394InstructionSelector::ComplexRendererFns
4395 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4396 Register Src;
4397 unsigned Mods;
4398 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4399
4400 return {{
4401 [=](MachineInstrBuilder &MIB) {
4402 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4403 },
4404 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4405 }};
4406}
4407
4408InstructionSelector::ComplexRendererFns
4409 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4410 MachineOperand &Root) const {
4411 Register Src;
4412 unsigned Mods;
4413 std::tie(Src, Mods) =
4414 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4415
4416 return {{
4417 [=](MachineInstrBuilder &MIB) {
4418 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4419 },
4420 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4421 }};
4422}
4423
4424InstructionSelector::ComplexRendererFns
4425 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4426 Register Src;
4427 unsigned Mods;
4428 std::tie(Src, Mods) =
4429 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4430 /*AllowAbs=*/false);
4431
4432 return {{
4433 [=](MachineInstrBuilder &MIB) {
4434 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4435 },
4436 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4437 }};
4438}
4439
4440InstructionSelector::ComplexRendererFns
4441 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4442 Register Reg = Root.getReg();
4443 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4444 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4445 return {};
4446 return {{
4447 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4448 }};
4449}
4450
4451enum class SrcStatus {
4456 // This means current op = [op_upper, op_lower] and src = -op_lower.
4459 // This means current op = [op_upper, op_lower] and src = [op_upper,
4460 // -op_lower].
4468};
4469/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4470static bool isTruncHalf(const MachineInstr *MI,
4471 const MachineRegisterInfo &MRI) {
4472 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4473 return false;
4474
4475 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4476 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4477 return DstSize * 2 == SrcSize;
4478}
4479
4480/// Test if the MI is a logical shift right by half the bit width,
4481/// such as `%reg0:2n = G_LSHR %reg1:2n, CONST(n)`
4482static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4483 if (MI->getOpcode() != AMDGPU::G_LSHR)
4484 return false;
4485
4486 Register ShiftSrc;
4487 std::optional<ValueAndVReg> ShiftAmt;
4488 if (mi_match(MI->getOperand(0).getReg(), MRI,
4489 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4490 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4491 unsigned Shift = ShiftAmt->Value.getZExtValue();
4492 return Shift * 2 == SrcSize;
4493 }
4494 return false;
4495}
4496
4497/// Test if the MI is a shift left by half the bit width,
4498/// such as `%reg0:2n = G_SHL %reg1:2n, CONST(n)`
4499static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4500 if (MI->getOpcode() != AMDGPU::G_SHL)
4501 return false;
4502
4503 Register ShiftSrc;
4504 std::optional<ValueAndVReg> ShiftAmt;
4505 if (mi_match(MI->getOperand(0).getReg(), MRI,
4506 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4507 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4508 unsigned Shift = ShiftAmt->Value.getZExtValue();
4509 return Shift * 2 == SrcSize;
4510 }
4511 return false;
4512}
4513
4514/// Test if the MI is of the form `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4515static bool isUnmergeHalf(const MachineInstr *MI,
4516 const MachineRegisterInfo &MRI) {
4517 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4518 return false;
4519 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4520 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4521}
4522
4523enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED };
4524
4525static TypeClass isVectorOfTwoOrScalar(Register Reg,
4526 const MachineRegisterInfo &MRI) {
4527 LLT OpTy = MRI.getType(Reg);
4528 if (OpTy.isScalar())
4529 return TypeClass::SCALAR;
4530 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4531 return TypeClass::VECTOR_OF_TWO;
4532 return TypeClass::NONE_OF_LISTED;
4533}
4534
4535static SrcStatus getNegStatus(Register Reg, SrcStatus S,
4536 const MachineRegisterInfo &MRI) {
4537 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4538 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4539 return SrcStatus::INVALID;
4540
4541 switch (S) {
4542 case SrcStatus::IS_SAME:
4543 if (NegType == TypeClass::VECTOR_OF_TWO) {
4544 // Vector of 2:
4545 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4546 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4547 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4548 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4549 return SrcStatus::IS_BOTH_NEG;
4550 }
4551 if (NegType == TypeClass::SCALAR) {
4552 // Scalar:
4553 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4554 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4555 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4556 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4557 return SrcStatus::IS_HI_NEG;
4558 }
4559 break;
4560 case SrcStatus::IS_HI_NEG:
4561 if (NegType == TypeClass::VECTOR_OF_TWO) {
4562 // Vector of 2:
4563 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4564 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4565 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4566 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4567 return SrcStatus::IS_LO_NEG;
4568 }
4569 if (NegType == TypeClass::SCALAR) {
4570 // Scalar:
4571 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4572 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4573 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4574 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4575 return SrcStatus::IS_SAME;
4576 }
4577 break;
4578 case SrcStatus::IS_LO_NEG:
4579 if (NegType == TypeClass::VECTOR_OF_TWO) {
4580 // Vector of 2:
4581 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4582 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4583 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4584 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4585 return SrcStatus::IS_HI_NEG;
4586 }
4587 if (NegType == TypeClass::SCALAR) {
4588 // Scalar:
4589 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4590 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4591 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4592 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4593 return SrcStatus::IS_BOTH_NEG;
4594 }
4595 break;
4596 case SrcStatus::IS_BOTH_NEG:
4597 if (NegType == TypeClass::VECTOR_OF_TWO) {
4598 // Vector of 2:
4599 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4600 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4601 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4602 // [SrcHi, SrcLo] = [OpHi, OpLo]
4603 return SrcStatus::IS_SAME;
4604 }
4605 if (NegType == TypeClass::SCALAR) {
4606 // Scalar:
4607 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4608 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4609 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4610 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4611 return SrcStatus::IS_LO_NEG;
4612 }
4613 break;
4614 case SrcStatus::IS_UPPER_HALF:
4615 // Vector of 2:
4616 // Src = CurrUpper
4617 // Curr = [CurrUpper, CurrLower]
4618 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4619 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4620 // Src = -OpUpper
4621 //
4622 // Scalar:
4623 // Src = CurrUpper
4624 // Curr = [CurrUpper, CurrLower]
4625 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4626 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4627 // Src = -OpUpper
4628 return SrcStatus::IS_UPPER_HALF_NEG;
4629 case SrcStatus::IS_LOWER_HALF:
4630 if (NegType == TypeClass::VECTOR_OF_TWO) {
4631 // Vector of 2:
4632 // Src = CurrLower
4633 // Curr = [CurrUpper, CurrLower]
4634 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4635 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4636 // Src = -OpLower
4637 return SrcStatus::IS_LOWER_HALF_NEG;
4638 }
4639 if (NegType == TypeClass::SCALAR) {
4640 // Scalar:
4641 // Src = CurrLower
4642 // Curr = [CurrUpper, CurrLower]
4643 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4644 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4645 // Src = OpLower
4646 return SrcStatus::IS_LOWER_HALF;
4647 }
4648 break;
4649 case SrcStatus::IS_UPPER_HALF_NEG:
4650 // Vector of 2:
4651 // Src = -CurrUpper
4652 // Curr = [CurrUpper, CurrLower]
4653 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4654 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4655 // Src = -(-OpUpper) = OpUpper
4656 //
4657 // Scalar:
4658 // Src = -CurrUpper
4659 // Curr = [CurrUpper, CurrLower]
4660 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4661 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4662 // Src = -(-OpUpper) = OpUpper
4663 return SrcStatus::IS_UPPER_HALF;
4664 case SrcStatus::IS_LOWER_HALF_NEG:
4665 if (NegType == TypeClass::VECTOR_OF_TWO) {
4666 // Vector of 2:
4667 // Src = -CurrLower
4668 // Curr = [CurrUpper, CurrLower]
4669 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4670 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4671 // Src = -(-OpLower) = OpLower
4672 return SrcStatus::IS_LOWER_HALF;
4673 }
4674 if (NegType == TypeClass::SCALAR) {
4675 // Scalar:
4676 // Src = -CurrLower
4677 // Curr = [CurrUpper, CurrLower]
4678 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4679 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4680 // Src = -OpLower
4681 return SrcStatus::IS_LOWER_HALF_NEG;
4682 }
4683 break;
4684 default:
4685 break;
4686 }
4687 llvm_unreachable("unexpected SrcStatus & NegType combination");
4688}
4689
4690static std::optional<std::pair<Register, SrcStatus>>
4691calcNextStatus(std::pair<Register, SrcStatus> Curr,
4692 const MachineRegisterInfo &MRI) {
4693 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
4694
4695 unsigned Opc = MI->getOpcode();
4696
4697 // Handle general Opc cases.
4698 switch (Opc) {
4699 case AMDGPU::G_BITCAST:
4700 return std::optional<std::pair<Register, SrcStatus>>(
4701 {MI->getOperand(1).getReg(), Curr.second});
4702 case AMDGPU::COPY:
4703 if (MI->getOperand(1).getReg().isPhysical())
4704 return std::nullopt;
4705 return std::optional<std::pair<Register, SrcStatus>>(
4706 {MI->getOperand(1).getReg(), Curr.second});
4707 case AMDGPU::G_FNEG: {
4708 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
4709 if (Stat == SrcStatus::INVALID)
4710 return std::nullopt;
4711 return std::optional<std::pair<Register, SrcStatus>>(
4712 {MI->getOperand(1).getReg(), Stat});
4713 }
4714 default:
4715 break;
4716 }
4717
4718 // Calc next Stat from current Stat.
4719 switch (Curr.second) {
4720 case SrcStatus::IS_SAME:
4721 if (isTruncHalf(MI, MRI))
4722 return std::optional<std::pair<Register, SrcStatus>>(
4723 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4724 else if (isUnmergeHalf(MI, MRI)) {
4725 if (Curr.first == MI->getOperand(0).getReg())
4726 return std::optional<std::pair<Register, SrcStatus>>(
4727 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
4728 return std::optional<std::pair<Register, SrcStatus>>(
4729 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
4730 }
4731 break;
4732 case SrcStatus::IS_HI_NEG:
4733 if (isTruncHalf(MI, MRI)) {
4734 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4735 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
4736 // = [OpLowerHi, OpLowerLo]
4737 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4738 // = [-OpLowerHi, OpLowerLo]
4739 // = -OpLower
4740 return std::optional<std::pair<Register, SrcStatus>>(
4741 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4742 }
4743 if (isUnmergeHalf(MI, MRI)) {
4744 if (Curr.first == MI->getOperand(0).getReg())
4745 return std::optional<std::pair<Register, SrcStatus>>(
4746 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4747 return std::optional<std::pair<Register, SrcStatus>>(
4748 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4749 }
4750 break;
4751 case SrcStatus::IS_UPPER_HALF:
4752 if (isShlHalf(MI, MRI))
4753 return std::optional<std::pair<Register, SrcStatus>>(
4754 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
4755 break;
4756 case SrcStatus::IS_LOWER_HALF:
4757 if (isLshrHalf(MI, MRI))
4758 return std::optional<std::pair<Register, SrcStatus>>(
4759 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
4760 break;
4761 case SrcStatus::IS_UPPER_HALF_NEG:
4762 if (isShlHalf(MI, MRI))
4763 return std::optional<std::pair<Register, SrcStatus>>(
4764 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
4765 break;
4766 case SrcStatus::IS_LOWER_HALF_NEG:
4767 if (isLshrHalf(MI, MRI))
4768 return std::optional<std::pair<Register, SrcStatus>>(
4769 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
4770 break;
4771 default:
4772 break;
4773 }
4774 return std::nullopt;
4775}
4776
4777/// This is used to control which source statuses the current MI supports. For
4778/// example, a non-floating-point intrinsic such as @llvm.amdgcn.sdot2 does not
4779/// support the NEG bit on VOP3P.
4780/// The class can be further extended to recognize support for the SEL, NEG, and
4781/// ABS bits for different MIs on different architectures.
4782class SearchOptions {
4783private:
4784 bool HasNeg = false;
4785 // Assume all VOP3P complex patterns have op_sel.
4786 bool HasOpsel = true;
4787
4788public:
4789 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) {
4790 const MachineInstr *MI = MRI.getVRegDef(Reg);
4791 unsigned Opc = MI->getOpcode();
4792
4793 if (Opc < TargetOpcode::GENERIC_OP_END) {
4794 // Keep same for generic op.
4795 HasNeg = true;
4796 } else if (Opc == TargetOpcode::G_INTRINSIC) {
4797 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
4798 // Only floating-point intrinsics have neg & neg_hi bits.
4799 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
4800 HasNeg = true;
4801 }
4802 }
4803 bool checkOptions(SrcStatus Stat) const {
4804 if (!HasNeg &&
4805 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
4806 return false;
4807 }
4808 if (!HasOpsel &&
4809 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
4810 return false;
4811 }
4812 return true;
4813 }
4814};
4815
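// Walk up to MaxDepth defining instructions starting from Reg and record every
// intermediate (Register, SrcStatus) pair that the search options permit.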
4816static SmallVector<std::pair<Register, SrcStatus>>
4817getSrcStats(Register Reg, const MachineRegisterInfo &MRI, const SearchOptions &SO,
4818 int MaxDepth = 3) {
4819 int Depth = 0;
4820 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
4821 SmallVector<std::pair<Register, SrcStatus>> Statlist;
4822
4823 while (Depth <= MaxDepth && Curr.has_value()) {
4824 Depth++;
4825 if (SO.checkOptions(Curr.value().second))
4826 Statlist.push_back(Curr.value());
4827 Curr = calcNextStatus(Curr.value(), MRI);
4828 }
4829
4830 return Statlist;
4831}
4832
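// Like getSrcStats, but only remember the deepest source whose status is
// IS_SAME or one of the whole-operand negations.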
4833static std::pair<Register, SrcStatus>
4834getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, const SearchOptions &SO,
4835 int MaxDepth = 3) {
4836 int Depth = 0;
4837 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
4838 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
4839
4840 while (Depth <= MaxDepth && Curr.has_value()) {
4841 Depth++;
4842 SrcStatus Stat = Curr.value().second;
4843 if (SO.checkOptions(Stat)) {
4844 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
4845 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG)
4846 LastSameOrNeg = Curr.value();
4847 }
4848 Curr = calcNextStatus(Curr.value(), MRI);
4849 }
4850
4851 return LastSameOrNeg;
4852}
4853
4854static bool isSameBitWidth(Register Reg1, Register Reg2,
4855 const MachineRegisterInfo &MRI) {
4856 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
4857 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
4858 return Width1 == Width2;
4859}
4860
4861static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
4862 // SrcStatus::IS_LOWER_HALF leaves the modifier bits unchanged.
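  // For example, HiStat == IS_UPPER_HALF sets OP_SEL_1 to take the high half
  // for the high operand, while LoStat == IS_LOWER_HALF_NEG only sets NEG on
  // the low operand.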
4863 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
4864 Mods ^= SISrcMods::NEG_HI;
4865 Mods |= SISrcMods::OP_SEL_1;
4866 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
4867 Mods |= SISrcMods::OP_SEL_1;
4868 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
4869 Mods ^= SISrcMods::NEG_HI;
4870 else if (HiStat == SrcStatus::IS_HI_NEG)
4871 Mods ^= SISrcMods::NEG_HI;
4872
4873 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
4874 Mods ^= SISrcMods::NEG;
4875 Mods |= SISrcMods::OP_SEL_0;
4876 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
4877 Mods |= SISrcMods::OP_SEL_0;
4878 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
4879 Mods |= SISrcMods::NEG;
4880 else if (LoStat == SrcStatus::IS_HI_NEG)
4881 Mods ^= SISrcMods::NEG;
4882
4883 return Mods;
4884}
4885
4886static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
4887 Register RootReg, const SIInstrInfo &TII,
4888 const MachineRegisterInfo &MRI) {
4889 auto IsHalfState = [](SrcStatus S) {
4890 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG ||
4891 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG;
4892 };
4893 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
4894 IsHalfState(HiStat);
4895}
4896
4897std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
4898 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
4899 unsigned Mods = 0;
4900 // No modifiers to fold if the Root type is not of the form <2 x Type>.
4901 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
4902 Mods |= SISrcMods::OP_SEL_1;
4903 return {RootReg, Mods};
4904 }
4905
4906 SearchOptions SO(RootReg, MRI);
4907
4908 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
4909
4910 if (Stat.second == SrcStatus::IS_BOTH_NEG)
4911 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
4912 else if (Stat.second == SrcStatus::IS_HI_NEG)
4913 Mods ^= SISrcMods::NEG_HI;
4914 else if (Stat.second == SrcStatus::IS_LO_NEG)
4915 Mods ^= SISrcMods::NEG;
4916
4917 MachineInstr *MI = MRI.getVRegDef(Stat.first);
4918
4919 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
4920 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
4921 Mods |= SISrcMods::OP_SEL_1;
4922 return {Stat.first, Mods};
4923 }
4924
4925   SmallVector<std::pair<Register, SrcStatus>> StatlistHi =
4926       getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
4927
4928 if (StatlistHi.empty()) {
4929 Mods |= SISrcMods::OP_SEL_1;
4930 return {Stat.first, Mods};
4931 }
4932
4933   SmallVector<std::pair<Register, SrcStatus>> StatlistLo =
4934       getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
4935
4936 if (StatlistLo.empty()) {
4937 Mods |= SISrcMods::OP_SEL_1;
4938 return {Stat.first, Mods};
4939 }
4940
4941 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
4942 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
4943 if (StatlistHi[I].first == StatlistLo[J].first &&
4944 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
4945 StatlistHi[I].first, RootReg, TII, MRI))
4946 return {StatlistHi[I].first,
4947 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
4948 }
4949 }
4950 // Packed instructions do not have abs modifiers.
4951 Mods |= SISrcMods::OP_SEL_1;
4952
4953 return {Stat.first, Mods};
4954}
4955
4957
4958static bool checkRB(Register Reg, unsigned int RBNo,
4959 const AMDGPURegisterBankInfo &RBI,
4960 const MachineRegisterInfo &MRI,
4961 const TargetRegisterInfo &TRI) {
4962 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
4963 return RB->getID() == RBNo;
4964}
4965
4966 // This function is used to get the correct register bank for the returned reg.
4967// Assume:
4968// 1. VOP3P is always legal for VGPR.
4969// 2. RootOp's regbank is legal.
4970// Thus
4971// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
4972// 2. If RootOp is VGPR, then NewOp must be VGPR.
4973 static Register getLegalRegBank(Register NewReg, Register RootReg,
4974                                 const AMDGPURegisterBankInfo &RBI,
4975                                 const MachineRegisterInfo &MRI,
4976                                 const TargetRegisterInfo &TRI,
4977 const SIInstrInfo &TII) {
4978   // RootOp can only be VGPR or SGPR (some hand-written cases such as
4979   // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
4980 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
4981 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
4982 return NewReg;
4983
4984 MachineInstr *MI = MRI.getVRegDef(RootReg);
4985 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
4986 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
4987 return RootReg;
4988 }
4989
4990 MachineBasicBlock *BB = MI->getParent();
4991 Register DstReg = MRI.cloneVirtualRegister(RootReg);
4992
4993   MachineInstrBuilder MIB =
4994       BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
4995 .addReg(NewReg);
4996
4997 // Only accept VGPR.
4998 return MIB->getOperand(0).getReg();
4999}
5000
5002AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5003 bool IsDOT) const {
5004 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5005 Register Reg;
5006 unsigned Mods;
5007 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5008
5009 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5010 return {{
5011 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5012 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5013 }};
5014}
5015
5017AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5018
5019 return selectVOP3PRetHelper(Root);
5020}
5021
5023AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5024
5025 return selectVOP3PRetHelper(Root, true);
5026}
5027
5029AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5030 MachineOperand &Root) const {
5031 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5032 "expected i1 value");
5033 unsigned Mods = SISrcMods::OP_SEL_1;
5034 if (Root.getImm() != 0)
5035 Mods |= SISrcMods::OP_SEL_0;
5036
5037 return {{
5038 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5039 }};
5040}
5041
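// Build a REG_SEQUENCE from Elts; the destination register class is chosen
// from the element count (2, 4 or 8 32-bit elements).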
5042 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
5043                                  MachineInstr *InsertPt,
5044                                  MachineRegisterInfo &MRI) {
5045 const TargetRegisterClass *DstRegClass;
5046 switch (Elts.size()) {
5047 case 8:
5048 DstRegClass = &AMDGPU::VReg_256RegClass;
5049 break;
5050 case 4:
5051 DstRegClass = &AMDGPU::VReg_128RegClass;
5052 break;
5053 case 2:
5054 DstRegClass = &AMDGPU::VReg_64RegClass;
5055 break;
5056 default:
5057 llvm_unreachable("unhandled Reg sequence size");
5058 }
5059
5060 MachineIRBuilder B(*InsertPt);
5061 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5062 .addDef(MRI.createVirtualRegister(DstRegClass));
5063 for (unsigned i = 0; i < Elts.size(); ++i) {
5064 MIB.addReg(Elts[i]);
5065     MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
5066   }
5067 return MIB->getOperand(0).getReg();
5068}
5069
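// Fold an fneg and/or fabs that is applied to every element into the NEG and
// NEG_HI source modifiers and rebuild the source from the stripped elements.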
5070static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5071                                  SmallVectorImpl<Register> &Elts, Register &Src,
5072                                  MachineInstr *InsertPt,
5073                                  MachineRegisterInfo &MRI) {
5074 if (ModOpcode == TargetOpcode::G_FNEG) {
5075 Mods |= SISrcMods::NEG;
5076 // Check if all elements also have abs modifier
5077 SmallVector<Register, 8> NegAbsElts;
5078 for (auto El : Elts) {
5079 Register FabsSrc;
5080 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5081 break;
5082 NegAbsElts.push_back(FabsSrc);
5083 }
5084 if (Elts.size() != NegAbsElts.size()) {
5085 // Neg
5086 Src = buildRegSequence(Elts, InsertPt, MRI);
5087 } else {
5088 // Neg and Abs
5089 Mods |= SISrcMods::NEG_HI;
5090 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5091 }
5092 } else {
5093 assert(ModOpcode == TargetOpcode::G_FABS);
5094 // Abs
5095 Mods |= SISrcMods::NEG_HI;
5096 Src = buildRegSequence(Elts, InsertPt, MRI);
5097 }
5098}
5099
5101AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5102 Register Src = Root.getReg();
5103 unsigned Mods = SISrcMods::OP_SEL_1;
5104   SmallVector<Register, 8> EltsF32;
5105
5106 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5107 assert(BV->getNumSources() > 0);
5108     // Based on the first element, decide which mod we match: neg or abs.
5109 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5110 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5111 ? AMDGPU::G_FNEG
5112 : AMDGPU::G_FABS;
5113 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5114 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5115 if (ElF32->getOpcode() != ModOpcode)
5116 break;
5117 EltsF32.push_back(ElF32->getOperand(1).getReg());
5118 }
5119
5120 // All elements had ModOpcode modifier
5121 if (BV->getNumSources() == EltsF32.size()) {
5122 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5123 *MRI);
5124 }
5125 }
5126
5127 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5128 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5129}
5130
5132AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5133 Register Src = Root.getReg();
5134 unsigned Mods = SISrcMods::OP_SEL_1;
5135 SmallVector<Register, 8> EltsV2F16;
5136
5137 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5138 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5139 Register FNegSrc;
5140 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5141 break;
5142 EltsV2F16.push_back(FNegSrc);
5143 }
5144
5145     // All elements had the fneg modifier
5146 if (CV->getNumSources() == EltsV2F16.size()) {
5147 Mods |= SISrcMods::NEG;
5148 Mods |= SISrcMods::NEG_HI;
5149 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5150 }
5151 }
5152
5153 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5154 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5155}
5156
5158AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5159 Register Src = Root.getReg();
5160 unsigned Mods = SISrcMods::OP_SEL_1;
5161 SmallVector<Register, 8> EltsV2F16;
5162
5163 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5164 assert(CV->getNumSources() > 0);
5165 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5166     // Based on the first element, decide which mod we match: neg or abs.
5167 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5168 ? AMDGPU::G_FNEG
5169 : AMDGPU::G_FABS;
5170
5171 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5172 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5173 if (ElV2F16->getOpcode() != ModOpcode)
5174 break;
5175 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5176 }
5177
5178 // All elements had ModOpcode modifier
5179 if (CV->getNumSources() == EltsV2F16.size()) {
5180 MachineIRBuilder B(*Root.getParent());
5181 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5182 *MRI);
5183 }
5184 }
5185
5186 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5187 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5188}
5189
5191AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5192 std::optional<FPValueAndVReg> FPValReg;
5193 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5194 if (TII.isInlineConstant(FPValReg->Value)) {
5195 return {{[=](MachineInstrBuilder &MIB) {
5196 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5197 }}};
5198 }
5199     // Non-inlineable splat floats should not fall through to the integer
5200     // immediate checks.
5201 return {};
5202 }
5203
5204 APInt ICst;
5205 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5206 if (TII.isInlineConstant(ICst)) {
5207 return {
5208 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5209 }
5210 }
5211
5212 return {};
5213}
5214
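// Match an index_key for 8-bit SWMMAC lanes: a 32-bit source shifted right by
// a multiple of 8 selects the unshifted register and encodes the byte index.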
5216AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5217 Register Src =
5218 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5219 unsigned Key = 0;
5220
5221 Register ShiftSrc;
5222 std::optional<ValueAndVReg> ShiftAmt;
5223 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5224 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5225 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5226 Key = ShiftAmt->Value.getZExtValue() / 8;
5227 Src = ShiftSrc;
5228 }
5229
5230 return {{
5231 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5232 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5233 }};
5234}
5235
5237AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5238
5239 Register Src =
5240 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5241 unsigned Key = 0;
5242
5243 Register ShiftSrc;
5244 std::optional<ValueAndVReg> ShiftAmt;
5245 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5246 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5247 ShiftAmt->Value.getZExtValue() == 16) {
5248 Src = ShiftSrc;
5249 Key = 1;
5250 }
5251
5252 return {{
5253 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5254 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5255 }};
5256}
5257
5259AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5260 Register Src =
5261 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5262 unsigned Key = 0;
5263
5264 Register S32 = matchZeroExtendFromS32(Src);
5265 if (!S32)
5266 S32 = matchAnyExtendFromS32(Src);
5267
5268 if (S32) {
5269 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5270 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5271 assert(Def->getNumOperands() == 3);
5272 Register DstReg1 = Def->getOperand(1).getReg();
5273 if (mi_match(S32, *MRI,
5274 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) {
5275 Src = Def->getOperand(2).getReg();
5276 Key = 1;
5277 }
5278 }
5279 }
5280
5281 return {{
5282 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5283 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5284 }};
5285}
5286
5288AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5289 Register Src;
5290 unsigned Mods;
5291 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5292
5293 // FIXME: Handle op_sel
5294 return {{
5295 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5296 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5297 }};
5298}
5299
5300// FIXME-TRUE16 remove when fake16 is removed
5302AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5303 Register Src;
5304 unsigned Mods;
5305 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5306 /*IsCanonicalizing=*/true,
5307 /*AllowAbs=*/false,
5308 /*OpSel=*/false);
5309
5310 return {{
5311 [=](MachineInstrBuilder &MIB) {
5312 MIB.addReg(
5313 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5314 },
5315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5316 }};
5317}
5318
5320AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5321 Register Src;
5322 unsigned Mods;
5323 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5324 /*IsCanonicalizing=*/true,
5325 /*AllowAbs=*/false,
5326 /*OpSel=*/true);
5327
5328 return {{
5329 [=](MachineInstrBuilder &MIB) {
5330 MIB.addReg(
5331 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5332 },
5333 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5334 }};
5335}
5336
5337 // Given \p Offset and the load specified by the \p Root operand, check if
5338 // \p Offset is a multiple of the load byte size. If it is, update \p Offset
5339 // to a pre-scaled value and return true.
5340bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5341                                                   Register &Offset,
5342                                                   bool IsSigned) const {
5343 if (!Subtarget->hasScaleOffset())
5344 return false;
5345
5346 const MachineInstr &MI = *Root.getParent();
5347 MachineMemOperand *MMO = *MI.memoperands_begin();
5348
5349 if (!MMO->getSize().hasValue())
5350 return false;
5351
5352 uint64_t Size = MMO->getSize().getValue();
5353
5354 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5355 if (!OffsetReg)
5356 OffsetReg = Offset;
5357
5358 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5359 OffsetReg = Def->Reg;
5360
5361 Register Op0;
5362 MachineInstr *Mul;
5363 bool ScaleOffset =
5364 (isPowerOf2_64(Size) &&
5365 mi_match(OffsetReg, *MRI,
5366                 m_GShl(m_Reg(Op0),
5367                        m_any_of(m_SpecificICst(Log2_64(Size)),
5368                                 m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
5369       mi_match(OffsetReg, *MRI,
5370                m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
5371                                            m_Copy(m_SpecificICst(Size))))) ||
5372 mi_match(
5373 OffsetReg, *MRI,
5374 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5375 m_Reg(Op0), m_SpecificICst(Size))) ||
5376 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5377 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5378 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5379 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5380 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5381 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5382 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5383 mi_match(Mul->getOperand(3).getReg(), *MRI,
5385 m_Copy(m_SpecificICst(Size))))) &&
5386 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5387
5388 if (ScaleOffset)
5389 Offset = Op0;
5390
5391 return ScaleOffset;
5392}
5393
5394bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5395 Register &Base,
5396 Register *SOffset,
5397 int64_t *Offset,
5398 bool *ScaleOffset) const {
5399 MachineInstr *MI = Root.getParent();
5400 MachineBasicBlock *MBB = MI->getParent();
5401
5402 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5403 // then we can select all ptr + 32-bit offsets.
5404 SmallVector<GEPInfo, 4> AddrInfo;
5405 getAddrModeInfo(*MI, *MRI, AddrInfo);
5406
5407 if (AddrInfo.empty())
5408 return false;
5409
5410 const GEPInfo &GEPI = AddrInfo[0];
5411 std::optional<int64_t> EncodedImm;
5412
5413 if (ScaleOffset)
5414 *ScaleOffset = false;
5415
5416 if (SOffset && Offset) {
5417 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5418 /*HasSOffset=*/true);
5419 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5420 AddrInfo.size() > 1) {
5421 const GEPInfo &GEPI2 = AddrInfo[1];
5422 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5423 Register OffsetReg = GEPI2.SgprParts[1];
5424 if (ScaleOffset)
5425 *ScaleOffset =
5426 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5427 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5428 if (OffsetReg) {
5429 Base = GEPI2.SgprParts[0];
5430 *SOffset = OffsetReg;
5431 *Offset = *EncodedImm;
5432 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5433 return true;
5434
5435 // For unbuffered smem loads, it is illegal for the Immediate Offset
5436           // to be negative if the resulting (Offset + (M0 or SOffset or zero))
5437 // is negative. Handle the case where the Immediate Offset + SOffset
5438 // is negative.
5439 auto SKnown = VT->getKnownBits(*SOffset);
5440 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5441 return false;
5442
5443 return true;
5444 }
5445 }
5446 }
5447 return false;
5448 }
5449
5450 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5451 /*HasSOffset=*/false);
5452 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5453 Base = GEPI.SgprParts[0];
5454 *Offset = *EncodedImm;
5455 return true;
5456 }
5457
5458 // SGPR offset is unsigned.
5459 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5460 GEPI.Imm != 0) {
5461     // If we make it this far we have a load with a 32-bit immediate offset.
5462     // It is OK to select this using an SGPR offset, because we have already
5463 // failed trying to select this load into one of the _IMM variants since
5464 // the _IMM Patterns are considered before the _SGPR patterns.
5465 Base = GEPI.SgprParts[0];
5466 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5467 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5468 .addImm(GEPI.Imm);
5469 return true;
5470 }
5471
5472 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
5473 Register OffsetReg = GEPI.SgprParts[1];
5474 if (ScaleOffset)
5475 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5476 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5477 if (OffsetReg) {
5478 Base = GEPI.SgprParts[0];
5479 *SOffset = OffsetReg;
5480 return true;
5481 }
5482 }
5483
5484 return false;
5485}
5486
5488AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5489 Register Base;
5490 int64_t Offset;
5491 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5492 /* ScaleOffset */ nullptr))
5493 return std::nullopt;
5494
5495 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5496 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5497}
5498
5500AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5501 SmallVector<GEPInfo, 4> AddrInfo;
5502 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5503
5504 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5505 return std::nullopt;
5506
5507 const GEPInfo &GEPInfo = AddrInfo[0];
5508 Register PtrReg = GEPInfo.SgprParts[0];
5509 std::optional<int64_t> EncodedImm =
5510 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5511 if (!EncodedImm)
5512 return std::nullopt;
5513
5514 return {{
5515 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5516 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5517 }};
5518}
5519
5521AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5522 Register Base, SOffset;
5523 bool ScaleOffset;
5524 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5525 &ScaleOffset))
5526 return std::nullopt;
5527
5528 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5529 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5530 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5531 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5532}
5533
5535AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5536 Register Base, SOffset;
5537 int64_t Offset;
5538 bool ScaleOffset;
5539 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5540 return std::nullopt;
5541
5542 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5543 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5544 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5545 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5546 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5547}
5548
5549std::pair<Register, int>
5550AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5551 uint64_t FlatVariant) const {
5552 MachineInstr *MI = Root.getParent();
5553
5554 auto Default = std::pair(Root.getReg(), 0);
5555
5556 if (!STI.hasFlatInstOffsets())
5557 return Default;
5558
5559 Register PtrBase;
5560 int64_t ConstOffset;
5561 bool IsInBounds;
5562 std::tie(PtrBase, ConstOffset, IsInBounds) =
5563 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5564
5565 // Adding the offset to the base address with an immediate in a FLAT
5566 // instruction must not change the memory aperture in which the address falls.
5567 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5568 // instructions.
5569 if (ConstOffset == 0 ||
5570 (FlatVariant == SIInstrFlags::FlatScratch &&
5571 !isFlatScratchBaseLegal(Root.getReg())) ||
5572 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5573 return Default;
5574
5575 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5576 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5577 return Default;
5578
5579 return std::pair(PtrBase, ConstOffset);
5580}
5581
5583AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5584 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5585
5586 return {{
5587 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5588 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5589 }};
5590}
5591
5593AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5594 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5595
5596 return {{
5597 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5598 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5599 }};
5600}
5601
5603AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5604 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5605
5606 return {{
5607 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5608 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5609 }};
5610}
5611
5612// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
5614AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5615 unsigned CPolBits,
5616 bool NeedIOffset) const {
5617 Register Addr = Root.getReg();
5618 Register PtrBase;
5619 int64_t ConstOffset;
5620 int64_t ImmOffset = 0;
5621
5622 // Match the immediate offset first, which canonically is moved as low as
5623 // possible.
5624 std::tie(PtrBase, ConstOffset, std::ignore) =
5625 getPtrBaseWithConstantOffset(Addr, *MRI);
5626
5627 if (ConstOffset != 0) {
5628 if (NeedIOffset &&
5629 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5630                               SIInstrFlags::FlatGlobal)) {
5631       Addr = PtrBase;
5632 ImmOffset = ConstOffset;
5633 } else {
5634 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
5635 if (isSGPR(PtrBaseDef->Reg)) {
5636 if (ConstOffset > 0) {
5637 // Offset is too large.
5638 //
5639 // saddr + large_offset -> saddr +
5640 // (voffset = large_offset & ~MaxOffset) +
5641 // (large_offset & MaxOffset);
5642 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
5643 if (NeedIOffset) {
5644 std::tie(SplitImmOffset, RemainderOffset) =
5645 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5646                                   SIInstrFlags::FlatGlobal);
5647           }
5648
5649 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
5650 : isUInt<32>(RemainderOffset)) {
5651 MachineInstr *MI = Root.getParent();
5652 MachineBasicBlock *MBB = MI->getParent();
5653 Register HighBits =
5654 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5655
5656 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5657 HighBits)
5658 .addImm(RemainderOffset);
5659
5660 if (NeedIOffset)
5661 return {{
5662 [=](MachineInstrBuilder &MIB) {
5663 MIB.addReg(PtrBase);
5664 }, // saddr
5665 [=](MachineInstrBuilder &MIB) {
5666 MIB.addReg(HighBits);
5667 }, // voffset
5668 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
5669 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5670 }};
5671 return {{
5672 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
5673 [=](MachineInstrBuilder &MIB) {
5674 MIB.addReg(HighBits);
5675 }, // voffset
5676 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
5677 }};
5678 }
5679 }
5680
5681           // We are adding a 64-bit SGPR and a constant. If the constant bus limit
5682 // is 1 we would need to perform 1 or 2 extra moves for each half of
5683 // the constant and it is better to do a scalar add and then issue a
5684 // single VALU instruction to materialize zero. Otherwise it is less
5685 // instructions to perform VALU adds with immediates or inline literals.
5686 unsigned NumLiterals =
5687 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
5688 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
5689 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
5690 return std::nullopt;
5691 }
5692 }
5693 }
5694
5695 // Match the variable offset.
5696 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5697 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5698 // Look through the SGPR->VGPR copy.
5699 Register SAddr =
5700 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
5701
5702 if (isSGPR(SAddr)) {
5703 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
5704
5705 // It's possible voffset is an SGPR here, but the copy to VGPR will be
5706 // inserted later.
5707 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
5708 Subtarget->hasSignedGVSOffset());
5709 if (Register VOffset = matchExtendFromS32OrS32(
5710 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
5711 if (NeedIOffset)
5712 return {{[=](MachineInstrBuilder &MIB) { // saddr
5713 MIB.addReg(SAddr);
5714 },
5715 [=](MachineInstrBuilder &MIB) { // voffset
5716 MIB.addReg(VOffset);
5717 },
5718 [=](MachineInstrBuilder &MIB) { // offset
5719 MIB.addImm(ImmOffset);
5720 },
5721 [=](MachineInstrBuilder &MIB) { // cpol
5722 MIB.addImm(CPolBits |
5723 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5724 }}};
5725 return {{[=](MachineInstrBuilder &MIB) { // saddr
5726 MIB.addReg(SAddr);
5727 },
5728 [=](MachineInstrBuilder &MIB) { // voffset
5729 MIB.addReg(VOffset);
5730 },
5731 [=](MachineInstrBuilder &MIB) { // cpol
5732 MIB.addImm(CPolBits |
5733 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
5734 }}};
5735 }
5736 }
5737 }
5738
5739 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
5740 // drop this.
5741 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
5742 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
5743 return std::nullopt;
5744
5745 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
5746 // moves required to copy a 64-bit SGPR to VGPR.
5747 MachineInstr *MI = Root.getParent();
5748 MachineBasicBlock *MBB = MI->getParent();
5749 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5750
5751 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
5752 .addImm(0);
5753
5754 if (NeedIOffset)
5755 return {{
5756 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5757 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5758 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5759 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5760 }};
5761 return {{
5762 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
5763 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
5764 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
5765 }};
5766}
5767
5769AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
5770 return selectGlobalSAddr(Root, 0);
5771}
5772
5774AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
5775 const MachineInstr &I = *Root.getParent();
5776
5777 // We are assuming CPol is always the last operand of the intrinsic.
5778 auto PassedCPol =
5779 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5780 return selectGlobalSAddr(Root, PassedCPol);
5781}
5782
5784AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
5785 const MachineInstr &I = *Root.getParent();
5786
5787 // We are assuming CPol is second from last operand of the intrinsic.
5788 auto PassedCPol =
5789 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5790 return selectGlobalSAddr(Root, PassedCPol);
5791}
5792
5794AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
5795 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
5796}
5797
5799AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
5800 MachineOperand &Root) const {
5801 const MachineInstr &I = *Root.getParent();
5802
5803 // We are assuming CPol is always the last operand of the intrinsic.
5804 auto PassedCPol =
5805 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
5806 return selectGlobalSAddr(Root, PassedCPol, false);
5807}
5808
5810AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
5811 MachineOperand &Root) const {
5812 const MachineInstr &I = *Root.getParent();
5813
5814 // We are assuming CPol is second from last operand of the intrinsic.
5815 auto PassedCPol =
5816 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
5817 return selectGlobalSAddr(Root, PassedCPol, false);
5818}
5819
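// Select an SGPR base plus immediate offset for scratch SADDR addressing,
// folding frame indexes and (frame_index + sgpr) adds where possible.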
5821AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
5822 Register Addr = Root.getReg();
5823 Register PtrBase;
5824 int64_t ConstOffset;
5825 int64_t ImmOffset = 0;
5826
5827 // Match the immediate offset first, which canonically is moved as low as
5828 // possible.
5829 std::tie(PtrBase, ConstOffset, std::ignore) =
5830 getPtrBaseWithConstantOffset(Addr, *MRI);
5831
5832 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
5833 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5834                             SIInstrFlags::FlatScratch)) {
5835     Addr = PtrBase;
5836 ImmOffset = ConstOffset;
5837 }
5838
5839 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5840 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5841 int FI = AddrDef->MI->getOperand(1).getIndex();
5842 return {{
5843 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5844 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5845 }};
5846 }
5847
5848 Register SAddr = AddrDef->Reg;
5849
5850 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
5851 Register LHS = AddrDef->MI->getOperand(1).getReg();
5852 Register RHS = AddrDef->MI->getOperand(2).getReg();
5853 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5854 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
5855
5856 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
5857 isSGPR(RHSDef->Reg)) {
5858 int FI = LHSDef->MI->getOperand(1).getIndex();
5859 MachineInstr &I = *Root.getParent();
5860 MachineBasicBlock *BB = I.getParent();
5861 const DebugLoc &DL = I.getDebugLoc();
5862 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5863
5864 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
5865 .addFrameIndex(FI)
5866 .addReg(RHSDef->Reg)
5867 .setOperandDead(3); // Dead scc
5868 }
5869 }
5870
5871 if (!isSGPR(SAddr))
5872 return std::nullopt;
5873
5874 return {{
5875 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
5876 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
5877 }};
5878}
5879
5880// Check whether the flat scratch SVS swizzle bug affects this access.
5881bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
5882 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
5883 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
5884 return false;
5885
5886 // The bug affects the swizzling of SVS accesses if there is any carry out
5887 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
5888 // voffset to (soffset + inst_offset).
5889 auto VKnown = VT->getKnownBits(VAddr);
5890 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
5891 KnownBits::makeConstant(APInt(32, ImmOffset)));
5892 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5893 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
5894 return (VMax & 3) + (SMax & 3) >= 4;
5895}
5896
5898AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
5899 Register Addr = Root.getReg();
5900 Register PtrBase;
5901 int64_t ConstOffset;
5902 int64_t ImmOffset = 0;
5903
5904 // Match the immediate offset first, which canonically is moved as low as
5905 // possible.
5906 std::tie(PtrBase, ConstOffset, std::ignore) =
5907 getPtrBaseWithConstantOffset(Addr, *MRI);
5908
5909 Register OrigAddr = Addr;
5910 if (ConstOffset != 0 &&
5911 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
5912                             SIInstrFlags::FlatScratch)) {
5913     Addr = PtrBase;
5914 ImmOffset = ConstOffset;
5915 }
5916
5917 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
5918 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5919 return std::nullopt;
5920
5921 Register RHS = AddrDef->MI->getOperand(2).getReg();
5922 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5923 return std::nullopt;
5924
5925 Register LHS = AddrDef->MI->getOperand(1).getReg();
5926 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
5927
5928 if (OrigAddr != Addr) {
5929 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5930 return std::nullopt;
5931 } else {
5932 if (!isFlatScratchBaseLegalSV(OrigAddr))
5933 return std::nullopt;
5934 }
5935
5936 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5937 return std::nullopt;
5938
5939 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
5940                       ? AMDGPU::CPol::SCAL
5941                       : 0;
5942
5943 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5944 int FI = LHSDef->MI->getOperand(1).getIndex();
5945 return {{
5946 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5947 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
5948 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5949 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5950 }};
5951 }
5952
5953 if (!isSGPR(LHS))
5954 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
5955 LHS = Def->Reg;
5956
5957 if (!isSGPR(LHS))
5958 return std::nullopt;
5959
5960 return {{
5961 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
5962 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
5963 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
5964 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
5965 }};
5966}
5967
5969AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
5970 MachineInstr *MI = Root.getParent();
5971 MachineBasicBlock *MBB = MI->getParent();
5972 MachineFunction *MF = MBB->getParent();
5973 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5974
5975 int64_t Offset = 0;
5976 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
5977 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5978 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5979
5980 // TODO: Should this be inside the render function? The iterator seems to
5981 // move.
5982 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5983 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5984 HighBits)
5985 .addImm(Offset & ~MaxOffset);
5986
5987 return {{[=](MachineInstrBuilder &MIB) { // rsrc
5988 MIB.addReg(Info->getScratchRSrcReg());
5989 },
5990 [=](MachineInstrBuilder &MIB) { // vaddr
5991 MIB.addReg(HighBits);
5992 },
5993 [=](MachineInstrBuilder &MIB) { // soffset
5994 // Use constant zero for soffset and rely on eliminateFrameIndex
5995 // to choose the appropriate frame register if need be.
5996 MIB.addImm(0);
5997 },
5998 [=](MachineInstrBuilder &MIB) { // offset
5999 MIB.addImm(Offset & MaxOffset);
6000 }}};
6001 }
6002
6003 assert(Offset == 0 || Offset == -1);
6004
6005 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6006 // offsets.
6007 std::optional<int> FI;
6008 Register VAddr = Root.getReg();
6009
6010 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6011 Register PtrBase;
6012 int64_t ConstOffset;
6013 std::tie(PtrBase, ConstOffset, std::ignore) =
6014 getPtrBaseWithConstantOffset(VAddr, *MRI);
6015 if (ConstOffset != 0) {
6016 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6017 (!STI.privateMemoryResourceIsRangeChecked() ||
6018 VT->signBitIsZero(PtrBase))) {
6019 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6020 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6021 FI = PtrBaseDef->getOperand(1).getIndex();
6022 else
6023 VAddr = PtrBase;
6024 Offset = ConstOffset;
6025 }
6026 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6027 FI = RootDef->getOperand(1).getIndex();
6028 }
6029
6030 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6031 MIB.addReg(Info->getScratchRSrcReg());
6032 },
6033 [=](MachineInstrBuilder &MIB) { // vaddr
6034 if (FI)
6035 MIB.addFrameIndex(*FI);
6036 else
6037 MIB.addReg(VAddr);
6038 },
6039 [=](MachineInstrBuilder &MIB) { // soffset
6040 // Use constant zero for soffset and rely on eliminateFrameIndex
6041 // to choose the appropriate frame register if need be.
6042 MIB.addImm(0);
6043 },
6044 [=](MachineInstrBuilder &MIB) { // offset
6045 MIB.addImm(Offset);
6046 }}};
6047}
6048
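// DS offsets must fit in an unsigned 16-bit field; on subtargets without a
// usable DS offset the base must also be known to be non-negative.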
6049bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6050 int64_t Offset) const {
6051 if (!isUInt<16>(Offset))
6052 return false;
6053
6054 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6055 return true;
6056
6057   // On Southern Islands, instructions with a negative base value and an offset
6058   // don't seem to work.
6059 return VT->signBitIsZero(Base);
6060}
6061
6062bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6063 int64_t Offset1,
6064 unsigned Size) const {
6065 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6066 return false;
6067 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6068 return false;
6069
6070 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6071 return true;
6072
6073   // On Southern Islands, instructions with a negative base value and an offset
6074   // don't seem to work.
6075 return VT->signBitIsZero(Base);
6076}
6077
6078// Return whether the operation has NoUnsignedWrap property.
6079static bool isNoUnsignedWrap(MachineInstr *Addr) {
6080 return Addr->getOpcode() == TargetOpcode::G_OR ||
6081 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6083}
6084
6085 // Check that the base address of a flat scratch load/store in the form of
6086 // `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6087 // requirement). We always treat the first operand as the base address here.
6088bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6089 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6090
6091 if (isNoUnsignedWrap(AddrMI))
6092 return true;
6093
6094 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6095 // values.
6096 if (STI.hasSignedScratchOffsets())
6097 return true;
6098
6099 Register LHS = AddrMI->getOperand(1).getReg();
6100 Register RHS = AddrMI->getOperand(2).getReg();
6101
6102 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6103 std::optional<ValueAndVReg> RhsValReg =
6104         getIConstantVRegValWithLookThrough(RHS, *MRI);
6105     // If the immediate offset is negative and within a certain range, the base
6106 // address cannot also be negative. If the base is also negative, the sum
6107 // would be either negative or much larger than the valid range of scratch
6108 // memory a thread can access.
6109 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6110 RhsValReg->Value.getSExtValue() > -0x40000000)
6111 return true;
6112 }
6113
6114 return VT->signBitIsZero(LHS);
6115}
6116
6117 // Check that the address values in SGPR/VGPR are legal for flat scratch in
6118 // the form of: SGPR + VGPR.
6119bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6120 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6121
6122 if (isNoUnsignedWrap(AddrMI))
6123 return true;
6124
6125 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6126 // values.
6127 if (STI.hasSignedScratchOffsets())
6128 return true;
6129
6130 Register LHS = AddrMI->getOperand(1).getReg();
6131 Register RHS = AddrMI->getOperand(2).getReg();
6132 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6133}
6134
6135 // Check that the address values in SGPR/VGPR are legal for flat scratch in
6136 // the form of: SGPR + VGPR + Imm.
6137bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6138 Register Addr) const {
6139 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6140 // values.
6141 if (STI.hasSignedScratchOffsets())
6142 return true;
6143
6144 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6145 Register Base = AddrMI->getOperand(1).getReg();
6146 std::optional<DefinitionAndSourceRegister> BaseDef =
6147       getDefSrcRegIgnoringCopies(Base, *MRI);
6148   std::optional<ValueAndVReg> RHSOffset =
6149       getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
6150   assert(RHSOffset);
6151
6152   // If the immediate offset is negative and within a certain range, the base
6153 // address cannot also be negative. If the base is also negative, the sum
6154 // would be either negative or much larger than the valid range of scratch
6155 // memory a thread can access.
6156 if (isNoUnsignedWrap(BaseDef->MI) &&
6157 (isNoUnsignedWrap(AddrMI) ||
6158 (RHSOffset->Value.getSExtValue() < 0 &&
6159 RHSOffset->Value.getSExtValue() > -0x40000000)))
6160 return true;
6161
6162 Register LHS = BaseDef->MI->getOperand(1).getReg();
6163 Register RHS = BaseDef->MI->getOperand(2).getReg();
6164 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6165}
6166
6167bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6168 unsigned ShAmtBits) const {
6169 assert(MI.getOpcode() == TargetOpcode::G_AND);
6170
6171 std::optional<APInt> RHS =
6172 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6173 if (!RHS)
6174 return false;
6175
6176 if (RHS->countr_one() >= ShAmtBits)
6177 return true;
6178
6179 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6180 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6181}
6182
6184AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6185 MachineOperand &Root) const {
6186 Register Reg = Root.getReg();
6187 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6188
6189 std::optional<DefinitionAndSourceRegister> Def =
6190       getDefSrcRegIgnoringCopies(Reg, *MRI);
6191   assert(Def && "this shouldn't be an optional result");
6192 Reg = Def->Reg;
6193
6194 if (Register WaveBase = getWaveAddress(Def->MI)) {
6195 return {{
6196 [=](MachineInstrBuilder &MIB) { // rsrc
6197 MIB.addReg(Info->getScratchRSrcReg());
6198 },
6199 [=](MachineInstrBuilder &MIB) { // soffset
6200 MIB.addReg(WaveBase);
6201 },
6202 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6203 }};
6204 }
6205
6206 int64_t Offset = 0;
6207
6208 // FIXME: Copy check is a hack
6209   Register BasePtr;
6210   if (mi_match(Reg, *MRI,
6211 m_GPtrAdd(m_Reg(BasePtr),
6212                           m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
6213     if (!TII.isLegalMUBUFImmOffset(Offset))
6214 return {};
6215 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6216 Register WaveBase = getWaveAddress(BasePtrDef);
6217 if (!WaveBase)
6218 return {};
6219
6220 return {{
6221 [=](MachineInstrBuilder &MIB) { // rsrc
6222 MIB.addReg(Info->getScratchRSrcReg());
6223 },
6224 [=](MachineInstrBuilder &MIB) { // soffset
6225 MIB.addReg(WaveBase);
6226 },
6227 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6228 }};
6229 }
6230
6231 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6232 !TII.isLegalMUBUFImmOffset(Offset))
6233 return {};
6234
6235 return {{
6236 [=](MachineInstrBuilder &MIB) { // rsrc
6237 MIB.addReg(Info->getScratchRSrcReg());
6238 },
6239 [=](MachineInstrBuilder &MIB) { // soffset
6240 MIB.addImm(0);
6241 },
6242 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6243 }};
6244}
6245
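// Split a DS address into a base register and a legal offset when the
// constant fits; otherwise fall back to the original address with offset 0.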
6246std::pair<Register, unsigned>
6247AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6248 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6249 int64_t ConstAddr = 0;
6250
6251 Register PtrBase;
6252 int64_t Offset;
6253 std::tie(PtrBase, Offset, std::ignore) =
6254 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6255
6256 if (Offset) {
6257 if (isDSOffsetLegal(PtrBase, Offset)) {
6258 // (add n0, c0)
6259 return std::pair(PtrBase, Offset);
6260 }
6261 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6262 // TODO
6263
6264
6265 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6266 // TODO
6267
6268 }
6269
6270 return std::pair(Root.getReg(), 0);
6271}
6272
6274AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6275 Register Reg;
6276 unsigned Offset;
6277 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6278 return {{
6279 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6280 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6281 }};
6282}
6283
6285AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6286 return selectDSReadWrite2(Root, 4);
6287}
6288
6290AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6291 return selectDSReadWrite2(Root, 8);
6292}
6293
6295AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6296 unsigned Size) const {
6297 Register Reg;
6298 unsigned Offset;
6299 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6300 return {{
6301 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6302 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6303 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6304 }};
6305}
6306
6307std::pair<Register, unsigned>
6308AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6309 unsigned Size) const {
6310 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6311 int64_t ConstAddr = 0;
6312
6313 Register PtrBase;
6314 int64_t Offset;
6315 std::tie(PtrBase, Offset, std::ignore) =
6316 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6317
6318 if (Offset) {
6319 int64_t OffsetValue0 = Offset;
6320 int64_t OffsetValue1 = Offset + Size;
6321 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6322 // (add n0, c0)
6323 return std::pair(PtrBase, OffsetValue0 / Size);
6324 }
6325 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6326 // TODO
6327
6328 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6329 // TODO
6330
6331 }
6332
6333 return std::pair(Root.getReg(), 0);
6334}
6335
6336/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6337/// the base value with the constant offset, and if the offset computation is
6338/// known to be inbounds. There may be intervening copies between \p Root and
6339/// the identified constant. Returns \p Root, 0, false if this does not match
6340/// the pattern.
6341std::tuple<Register, int64_t, bool>
6342AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6343 Register Root, const MachineRegisterInfo &MRI) const {
6344 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6345 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6346 return {Root, 0, false};
6347
6348 MachineOperand &RHS = RootI->getOperand(2);
6349 std::optional<ValueAndVReg> MaybeOffset =
6350       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
6351   if (!MaybeOffset)
6352 return {Root, 0, false};
6353 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6354 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6355 IsInBounds};
6356}
6357
6358 static void addZeroImm(MachineInstrBuilder &MIB) {
6359   MIB.addImm(0);
6360}
6361
6362/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6363/// BasePtr is not valid, a null base pointer will be used.
6364 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6365                           uint32_t FormatLo, uint32_t FormatHi,
6366 Register BasePtr) {
6367 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6368 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6369 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6370 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6371
6372 B.buildInstr(AMDGPU::S_MOV_B32)
6373 .addDef(RSrc2)
6374 .addImm(FormatLo);
6375 B.buildInstr(AMDGPU::S_MOV_B32)
6376 .addDef(RSrc3)
6377 .addImm(FormatHi);
6378
6379 // Build the half of the subregister with the constants before building the
6380 // full 128-bit register. If we are building multiple resource descriptors,
6381 // this will allow CSEing of the 2-component register.
6382 B.buildInstr(AMDGPU::REG_SEQUENCE)
6383 .addDef(RSrcHi)
6384 .addReg(RSrc2)
6385 .addImm(AMDGPU::sub0)
6386 .addReg(RSrc3)
6387 .addImm(AMDGPU::sub1);
6388
6389 Register RSrcLo = BasePtr;
6390 if (!BasePtr) {
6391 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6392 B.buildInstr(AMDGPU::S_MOV_B64)
6393 .addDef(RSrcLo)
6394 .addImm(0);
6395 }
6396
6397 B.buildInstr(AMDGPU::REG_SEQUENCE)
6398 .addDef(RSrc)
6399 .addReg(RSrcLo)
6400 .addImm(AMDGPU::sub0_sub1)
6401 .addReg(RSrcHi)
6402 .addImm(AMDGPU::sub2_sub3);
6403
6404 return RSrc;
6405}
6406
6407 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6408                                 const SIInstrInfo &TII, Register BasePtr) {
6409 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6410
6411 // FIXME: Why are half the "default" bits ignored based on the addressing
6412 // mode?
6413 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6414}
6415
6416 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
6417                                const SIInstrInfo &TII, Register BasePtr) {
6418 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6419
6420 // FIXME: Why are half the "default" bits ignored based on the addressing
6421 // mode?
6422 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6423}
6424
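// Decompose a MUBUF address into base registers and a 32-bit constant offset,
// looking through an inner ptr_add to find the addr64 components.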
6425AMDGPUInstructionSelector::MUBUFAddressData
6426AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6427 MUBUFAddressData Data;
6428 Data.N0 = Src;
6429
6430 Register PtrBase;
6431 int64_t Offset;
6432
6433 std::tie(PtrBase, Offset, std::ignore) =
6434 getPtrBaseWithConstantOffset(Src, *MRI);
6435 if (isUInt<32>(Offset)) {
6436 Data.N0 = PtrBase;
6437 Data.Offset = Offset;
6438 }
6439
6440 if (MachineInstr *InputAdd
6441 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6442 Data.N2 = InputAdd->getOperand(1).getReg();
6443 Data.N3 = InputAdd->getOperand(2).getReg();
6444
6445     // FIXME: Need to fix extra SGPR->VGPR copies inserted
6446     // FIXME: Don't know if this was defined by operand 0
6447 //
6448 // TODO: Remove this when we have copy folding optimizations after
6449 // RegBankSelect.
6450 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6451 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6452 }
6453
6454 return Data;
6455}
6456
6457/// Return if the addr64 mubuf mode should be used for the given address.
6458bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6459 // (ptr_add N2, N3) -> addr64, or
6460 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6461 if (Addr.N2)
6462 return true;
6463
6464 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6465 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6466}
6467
6468/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6469/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6470/// component.
6471void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6472 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6473 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6474 return;
6475
6476 // Illegal offset, store it in soffset.
6477 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6478 B.buildInstr(AMDGPU::S_MOV_B32)
6479 .addDef(SOffset)
6480 .addImm(ImmOffset);
6481 ImmOffset = 0;
6482}
6483
6484bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6485 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6486 Register &SOffset, int64_t &Offset) const {
6487 // FIXME: Predicates should stop this from reaching here.
6488 // addr64 bit was removed for volcanic islands.
6489 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6490 return false;
6491
6492 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6493 if (!shouldUseAddr64(AddrData))
6494 return false;
6495
6496 Register N0 = AddrData.N0;
6497 Register N2 = AddrData.N2;
6498 Register N3 = AddrData.N3;
6499 Offset = AddrData.Offset;
6500
6501 // Base pointer for the SRD.
6502 Register SRDPtr;
6503
6504 if (N2) {
6505 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6506 assert(N3);
6507 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6508 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6509 // addr64, and construct the default resource from a 0 address.
6510 VAddr = N0;
6511 } else {
6512 SRDPtr = N3;
6513 VAddr = N2;
6514 }
6515 } else {
6516 // N2 is not divergent.
6517 SRDPtr = N2;
6518 VAddr = N3;
6519 }
6520 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6521 // Use the default null pointer in the resource
6522 VAddr = N0;
6523 } else {
6524 // N0 -> offset, or
6525 // (N0 + C1) -> offset
6526 SRDPtr = N0;
6527 }
6528
6529 MachineIRBuilder B(*Root.getParent());
6530 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6531 splitIllegalMUBUFOffset(B, SOffset, Offset);
6532 return true;
6533}
6534
6535bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6536 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6537 int64_t &Offset) const {
6538
6539 // FIXME: Pattern should not reach here.
6540 if (STI.useFlatForGlobal())
6541 return false;
6542
6543 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6544 if (shouldUseAddr64(AddrData))
6545 return false;
6546
6547 // N0 -> offset, or
6548 // (N0 + C1) -> offset
6549 Register SRDPtr = AddrData.N0;
6550 Offset = AddrData.Offset;
6551
6552 // TODO: Look through extensions for 32-bit soffset.
6553 MachineIRBuilder B(*Root.getParent());
6554
6555 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6556 splitIllegalMUBUFOffset(B, SOffset, Offset);
6557 return true;
6558}
6559
6561AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6562 Register VAddr;
6563 Register RSrcReg;
6564 Register SOffset;
6565 int64_t Offset = 0;
6566
6567 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6568 return {};
6569
6570 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6571 // pattern.
6572 return {{
6573 [=](MachineInstrBuilder &MIB) { // rsrc
6574 MIB.addReg(RSrcReg);
6575 },
6576 [=](MachineInstrBuilder &MIB) { // vaddr
6577 MIB.addReg(VAddr);
6578 },
6579 [=](MachineInstrBuilder &MIB) { // soffset
6580 if (SOffset)
6581 MIB.addReg(SOffset);
6582 else if (STI.hasRestrictedSOffset())
6583 MIB.addReg(AMDGPU::SGPR_NULL);
6584 else
6585 MIB.addImm(0);
6586 },
6587 [=](MachineInstrBuilder &MIB) { // offset
6588 MIB.addImm(Offset);
6589 },
6590 addZeroImm, // cpol
6591 addZeroImm, // tfe
6592 addZeroImm // swz
6593 }};
6594}
6595
6597AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6598 Register RSrcReg;
6599 Register SOffset;
6600 int64_t Offset = 0;
6601
6602 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6603 return {};
6604
6605 return {{
6606 [=](MachineInstrBuilder &MIB) { // rsrc
6607 MIB.addReg(RSrcReg);
6608 },
6609 [=](MachineInstrBuilder &MIB) { // soffset
6610 if (SOffset)
6611 MIB.addReg(SOffset);
6612 else if (STI.hasRestrictedSOffset())
6613 MIB.addReg(AMDGPU::SGPR_NULL);
6614 else
6615 MIB.addImm(0);
6616 },
6617 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6618 addZeroImm, // cpol
6619 addZeroImm, // tfe
6620 addZeroImm, // swz
6621 }};
6622}
6623
6625AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6626
6627 Register SOffset = Root.getReg();
6628
6629 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6630 SOffset = AMDGPU::SGPR_NULL;
6631
6632 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
6633}
6634
6635/// Get an immediate that must be 32-bits, and treated as zero extended.
6636static std::optional<uint64_t>
6637 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
6638   // getIConstantVRegVal sexts any values, so see if that matters.
6639 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
6640 if (!OffsetVal || !isInt<32>(*OffsetVal))
6641 return std::nullopt;
6642 return Lo_32(*OffsetVal);
6643}
6644
6646AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
6647 std::optional<uint64_t> OffsetVal =
6648 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
6649 if (!OffsetVal)
6650 return {};
6651
6652 std::optional<int64_t> EncodedImm =
6653 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
6654 if (!EncodedImm)
6655 return {};
6656
6657 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6658}
6659
6660InstructionSelector::ComplexRendererFns
6661AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
6662 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
6663
6664 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
6665 if (!OffsetVal)
6666 return {};
6667
6668 std::optional<int64_t> EncodedImm =
6669      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
6670  if (!EncodedImm)
6671 return {};
6672
6673 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
6674}
6675
6676InstructionSelector::ComplexRendererFns
6677AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
6678 // Match the (soffset + offset) pair as a 32-bit register base and
6679 // an immediate offset.
6680 Register SOffset;
6681 unsigned Offset;
6682 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
6683 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
6684 if (!SOffset)
6685 return std::nullopt;
6686
6687 std::optional<int64_t> EncodedOffset =
6688 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
6689 if (!EncodedOffset)
6690 return std::nullopt;
6691
6692 assert(MRI->getType(SOffset) == LLT::scalar(32));
6693 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
6694 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
6695}
6696
6697std::pair<Register, unsigned>
6698AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
6699 bool &Matched) const {
6700 Matched = false;
6701
6702 Register Src;
6703 unsigned Mods;
6704 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
6705
6706 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
6707 assert(MRI->getType(Src) == LLT::scalar(16));
6708
6709 // Only change Src if src modifier could be gained. In such cases new Src
6710 // could be sgpr but this does not violate constant bus restriction for
6711 // instruction that is being selected.
6712 Src = stripBitCast(Src, *MRI);
6713
6714 const auto CheckAbsNeg = [&]() {
6715 // Be careful about folding modifiers if we already have an abs. fneg is
6716 // applied last, so we don't want to apply an earlier fneg.
6717 if ((Mods & SISrcMods::ABS) == 0) {
6718 unsigned ModsTmp;
6719 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
6720
6721 if ((ModsTmp & SISrcMods::NEG) != 0)
6722 Mods ^= SISrcMods::NEG;
6723
6724 if ((ModsTmp & SISrcMods::ABS) != 0)
6725 Mods |= SISrcMods::ABS;
6726 }
6727 };
6728
6729 CheckAbsNeg();
6730
6731 // op_sel/op_sel_hi decide the source type and source.
6732 // If the source's op_sel_hi is set, it indicates to do a conversion from
6733  // fp16. If the source's op_sel is set, it picks the high half of the
6734 // source register.
6735
6736 Mods |= SISrcMods::OP_SEL_1;
6737
6738 if (isExtractHiElt(*MRI, Src, Src)) {
6739 Mods |= SISrcMods::OP_SEL_0;
6740 CheckAbsNeg();
6741 }
6742
6743 Matched = true;
6744 }
6745
6746 return {Src, Mods};
6747}
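
// [Illustrative sketch, not part of the original listing; the flag values are
// stand-ins, the real SISrcMods enumerators live in SIDefines.h.] For a
// mad-mix source, OP_SEL_1 marks the operand as f16 data to be converted and
// OP_SEL_0 selects the high half of the 32-bit register, so an fpext fed from
// the high half of a packed v2f16 value ends up with both bits set (plus any
// NEG/ABS folded in by CheckAbsNeg above):
namespace madmix_sketch {
enum : unsigned { OP_SEL_0 = 1u << 2, OP_SEL_1 = 1u << 3 };
constexpr unsigned LoHalfF16Mods = OP_SEL_1;            // convert low half
constexpr unsigned HiHalfF16Mods = OP_SEL_1 | OP_SEL_0; // convert high half
static_assert(HiHalfF16Mods != LoHalfF16Mods, "op_sel distinguishes halves");
} // namespace madmix_sketch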
6748
6749InstructionSelector::ComplexRendererFns
6750AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
6751 MachineOperand &Root) const {
6752 Register Src;
6753 unsigned Mods;
6754 bool Matched;
6755 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6756 if (!Matched)
6757 return {};
6758
6759 return {{
6760 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6761 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6762 }};
6763}
6764
6765InstructionSelector::ComplexRendererFns
6766AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
6767 Register Src;
6768 unsigned Mods;
6769 bool Matched;
6770 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
6771
6772 return {{
6773 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
6774 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
6775 }};
6776}
6777
6778bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
6779 MachineInstr &I, Intrinsic::ID IntrID) const {
6780 MachineBasicBlock *MBB = I.getParent();
6781 const DebugLoc &DL = I.getDebugLoc();
6782 Register CCReg = I.getOperand(0).getReg();
6783
6784 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6785 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
6786
6787 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
6788 .addImm(I.getOperand(2).getImm());
6789
6790 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
6791
6792 I.eraseFromParent();
6793 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
6794 *MRI);
6795}
6796
6797bool AMDGPUInstructionSelector::selectSGetBarrierState(
6798 MachineInstr &I, Intrinsic::ID IntrID) const {
6799 MachineBasicBlock *MBB = I.getParent();
6800 const DebugLoc &DL = I.getDebugLoc();
6801 const MachineOperand &BarOp = I.getOperand(2);
6802 std::optional<int64_t> BarValImm =
6803 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6804
6805 if (!BarValImm) {
6806 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6807 .addReg(BarOp.getReg());
6808 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6809 }
6810 MachineInstrBuilder MIB;
6811 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
6812 : AMDGPU::S_GET_BARRIER_STATE_M0;
6813 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6814
6815 auto DstReg = I.getOperand(0).getReg();
6816 const TargetRegisterClass *DstRC =
6817 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6818 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6819 return false;
6820 MIB.addDef(DstReg);
6821 if (BarValImm) {
6822 MIB.addImm(*BarValImm);
6823 }
6824 I.eraseFromParent();
6825 return true;
6826}
6827
6828unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
6829 if (HasInlineConst) {
6830 switch (IntrID) {
6831 default:
6832 llvm_unreachable("not a named barrier op");
6833 case Intrinsic::amdgcn_s_barrier_join:
6834 return AMDGPU::S_BARRIER_JOIN_IMM;
6835 case Intrinsic::amdgcn_s_get_named_barrier_state:
6836 return AMDGPU::S_GET_BARRIER_STATE_IMM;
6837 };
6838 } else {
6839 switch (IntrID) {
6840 default:
6841 llvm_unreachable("not a named barrier op");
6842 case Intrinsic::amdgcn_s_barrier_join:
6843 return AMDGPU::S_BARRIER_JOIN_M0;
6844 case Intrinsic::amdgcn_s_get_named_barrier_state:
6845 return AMDGPU::S_GET_BARRIER_STATE_M0;
6846 };
6847 }
6848}
6849
6850bool AMDGPUInstructionSelector::selectNamedBarrierInit(
6851 MachineInstr &I, Intrinsic::ID IntrID) const {
6852 MachineBasicBlock *MBB = I.getParent();
6853 const DebugLoc &DL = I.getDebugLoc();
6854 const MachineOperand &BarOp = I.getOperand(1);
6855 const MachineOperand &CntOp = I.getOperand(2);
6856
6857 // BarID = (BarOp >> 4) & 0x3F
6858 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6859 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6860 .add(BarOp)
6861 .addImm(4u)
6862 .setOperandDead(3); // Dead scc
6863
6864 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6865 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6866 .addReg(TmpReg0)
6867 .addImm(0x3F)
6868 .setOperandDead(3); // Dead scc
6869
6870 // MO = ((CntOp & 0x3F) << shAmt) | BarID
6871 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6872 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
6873 .add(CntOp)
6874 .addImm(0x3F)
6875 .setOperandDead(3); // Dead scc
6876
6877 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6878 constexpr unsigned ShAmt = 16;
6879 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
6880 .addReg(TmpReg2)
6881 .addImm(ShAmt)
6882 .setOperandDead(3); // Dead scc
6883
6884 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6885 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
6886 .addReg(TmpReg1)
6887 .addReg(TmpReg3)
6888 .setOperandDead(3); // Dead scc;
6889
6890 auto CopyMIB =
6891 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
6892 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6893
6894 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
6895 ? AMDGPU::S_BARRIER_INIT_M0
6896 : AMDGPU::S_BARRIER_SIGNAL_M0;
6897 MachineInstrBuilder MIB;
6898 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6899
6900 I.eraseFromParent();
6901 return true;
6902}
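
// [Illustrative sketch, not part of the original listing.] The sequence above
// packs M0 as ((Cnt & 0x3F) << 16) | ((Bar >> 4) & 0x3F). For a hypothetical
// barrier operand of 0x25 and a member count of 5:
namespace barrier_pack_sketch {
constexpr unsigned packM0(unsigned Bar, unsigned Cnt) {
  return ((Cnt & 0x3F) << 16) | ((Bar >> 4) & 0x3F);
}
static_assert(((0x25u >> 4) & 0x3F) == 0x2u, "BarID extraction");
static_assert(packM0(0x25u, 5u) == 0x00050002u, "packed M0 value");
} // namespace barrier_pack_sketch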
6903
6904bool AMDGPUInstructionSelector::selectNamedBarrierInst(
6905 MachineInstr &I, Intrinsic::ID IntrID) const {
6906 MachineBasicBlock *MBB = I.getParent();
6907 const DebugLoc &DL = I.getDebugLoc();
6908 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6909 ? I.getOperand(2)
6910 : I.getOperand(1);
6911 std::optional<int64_t> BarValImm =
6912 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6913
6914 if (!BarValImm) {
6915 // BarID = (BarOp >> 4) & 0x3F
6916 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6917 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6918 .addReg(BarOp.getReg())
6919 .addImm(4u)
6920 .setOperandDead(3); // Dead scc;
6921
6922 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6923 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6924 .addReg(TmpReg0)
6925 .addImm(0x3F)
6926 .setOperandDead(3); // Dead scc;
6927
6928 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6929 .addReg(TmpReg1);
6930 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6931 }
6932
6933 MachineInstrBuilder MIB;
6934 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
6935 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
6936
6937 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6938 auto DstReg = I.getOperand(0).getReg();
6939 const TargetRegisterClass *DstRC =
6940 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6941 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6942 return false;
6943 MIB.addDef(DstReg);
6944 }
6945
6946 if (BarValImm) {
6947 auto BarId = ((*BarValImm) >> 4) & 0x3F;
6948 MIB.addImm(BarId);
6949 }
6950
6951 I.eraseFromParent();
6952 return true;
6953}
6954
6955void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6956 const MachineInstr &MI,
6957 int OpIdx) const {
6958 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6959 "Expected G_CONSTANT");
6960 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6961}
6962
6963void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6964 const MachineInstr &MI,
6965 int OpIdx) const {
6966 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6967 "Expected G_CONSTANT");
6968 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6969}
6970
6971void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6972 const MachineInstr &MI,
6973 int OpIdx) const {
6974 const MachineOperand &Op = MI.getOperand(1);
6975 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6976 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6977}
6978
6979void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6980 const MachineInstr &MI,
6981 int OpIdx) const {
6982 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6983 "Expected G_CONSTANT");
6984 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6985}
6986
6987/// This only really exists to satisfy DAG type checking machinery, so is a
6988/// no-op here.
6989void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6990 const MachineInstr &MI,
6991 int OpIdx) const {
6992 const MachineOperand &Op = MI.getOperand(OpIdx);
6993 int64_t Imm;
6994 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
6995 MIB.addImm(Imm);
6996 else
6997 MIB.addImm(Op.getImm());
6998}
6999
7000void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7001 const MachineInstr &MI,
7002 int OpIdx) const {
7003 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7004}
7005
7006void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7007 const MachineInstr &MI,
7008 int OpIdx) const {
7009 assert(OpIdx >= 0 && "expected to match an immediate operand");
7010 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7011}
7012
7013void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7014 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7015 assert(OpIdx >= 0 && "expected to match an immediate operand");
7016 MIB.addImm(
7017 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7018}
7019
7020void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7021 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7022 assert(OpIdx >= 0 && "expected to match an immediate operand");
7023 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7024                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
7025                 : (int64_t)SISrcMods::DST_OP_SEL);
7026}
7027
7028void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7029 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7030 assert(OpIdx >= 0 && "expected to match an immediate operand");
7031 MIB.addImm(
7032 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7033}
7034
7035void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7036 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7037 assert(OpIdx >= 0 && "expected to match an immediate operand");
7038 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7039 ? (int64_t)(SISrcMods::OP_SEL_0)
7040 : 0);
7041}
7042
7043void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7044 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7045 assert(OpIdx >= 0 && "expected to match an immediate operand");
7046 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7047 : 0);
7048}
7049
7050void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7051 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7052 assert(OpIdx >= 0 && "expected to match an immediate operand");
7053 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7054 : 0);
7055}
7056
7057void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7058 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7059 assert(OpIdx >= 0 && "expected to match an immediate operand");
7060 MIB.addImm(
7061 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7062}
7063
7064void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7065 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7066 assert(OpIdx >= 0 && "expected to match an immediate operand");
7067 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7068 ? (int64_t)SISrcMods::DST_OP_SEL
7069 : 0);
7070}
7071
7072void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7073 const MachineInstr &MI,
7074 int OpIdx) const {
7075 assert(OpIdx >= 0 && "expected to match an immediate operand");
7076 MIB.addImm(MI.getOperand(OpIdx).getImm() &
7077             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7078                                       : AMDGPU::CPol::ALL_pregfx12));
7079}
7080
7081void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7082 const MachineInstr &MI,
7083 int OpIdx) const {
7084 assert(OpIdx >= 0 && "expected to match an immediate operand");
7085 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
7086                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
7087                                                 : AMDGPU::CPol::SWZ_pregfx12);
7088  MIB.addImm(Swizzle);
7089}
7090
7091void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7092 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7093 assert(OpIdx >= 0 && "expected to match an immediate operand");
7094 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
7095                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
7096                                                  : AMDGPU::CPol::ALL_pregfx12);
7097  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7098}
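
// [Illustrative note, not part of the original listing; the GLC bit value is a
// stand-in, the real one is defined in SIDefines.h.] renderExtractCpolSetGLC
// masks the cache-policy immediate to the bits valid for the subtarget and
// then ORs in GLC, as used by patterns where the selected instruction must
// return a value (e.g. buffer atomics with a used result):
namespace cpol_sketch {
constexpr unsigned GLC = 1u << 0;
constexpr unsigned withGLC(unsigned Cpol) { return Cpol | GLC; }
static_assert(withGLC(0u) == GLC && withGLC(GLC) == GLC, "GLC is forced on");
} // namespace cpol_sketch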
7099
7100void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7101 const MachineInstr &MI,
7102 int OpIdx) const {
7103 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7104}
7105
7106void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7107 const MachineInstr &MI,
7108 int OpIdx) const {
7109 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7110 int ExpVal = APF.getExactLog2Abs();
7111 assert(ExpVal != INT_MIN);
7112 MIB.addImm(ExpVal);
7113}
7114
7115void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7116 const MachineInstr &MI,
7117 int OpIdx) const {
7118 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7119 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7120 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7121  // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7122 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7123}
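
// [Illustrative check, not part of the original listing.] The (Imm + 3) % 4
// remapping above converts the generic rounding-mode operand
// (0 = towardzero, 1 = tonearest, 2 = upward, 3 = downward) into the hardware
// FP_ROUND encoding listed in the comment:
static_assert((0 + 3) % 4 == 3, "round.towardzero -> FP_ROUND_ROUND_TO_ZERO");
static_assert((1 + 3) % 4 == 0, "round.tonearest  -> FP_ROUND_ROUND_TO_NEAREST");
static_assert((2 + 3) % 4 == 1, "round.upward     -> FP_ROUND_ROUND_TO_INF");
static_assert((3 + 3) % 4 == 2, "round.downward   -> FP_ROUND_ROUND_TO_NEGINF");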
7124
7125void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7126 const MachineInstr &MI,
7127 int OpIdx) const {
7128 unsigned Mods = SISrcMods::OP_SEL_1;
7129 if (MI.getOperand(OpIdx).getImm())
7130 Mods ^= SISrcMods::NEG;
7131 MIB.addImm((int64_t)Mods);
7132}
7133
7134void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7135 const MachineInstr &MI,
7136 int OpIdx) const {
7137 unsigned Mods = SISrcMods::OP_SEL_1;
7138 if (MI.getOperand(OpIdx).getImm())
7139    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
7140  MIB.addImm((int64_t)Mods);
7141}
7142
7143void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7144 const MachineInstr &MI,
7145 int OpIdx) const {
7146 unsigned Val = MI.getOperand(OpIdx).getImm();
7147 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7148 if (Val == 1) // neg
7149 Mods ^= SISrcMods::NEG;
7150 if (Val == 2) // abs
7151 Mods ^= SISrcMods::ABS;
7152 if (Val == 3) // neg and abs
7153 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7154 MIB.addImm((int64_t)Mods);
7155}
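
// [Illustrative sketch, not part of the original listing; NEG/ABS values are
// stand-ins for the real SISrcMods enumerators.] The renderer above maps the
// intrinsic's 2-bit immediate to source modifiers: 0 -> none, 1 -> neg,
// 2 -> abs, 3 -> neg|abs (with OP_SEL_1 always set as the packed-f16
// default). The same mapping written as bit tests gives identical results
// for values 0..3:
namespace negabs_sketch {
enum : unsigned { NEG = 1u << 0, ABS = 1u << 1 };
constexpr unsigned decode(unsigned Val) {
  return ((Val & 1u) ? NEG : 0u) | ((Val & 2u) ? ABS : 0u);
}
static_assert(decode(0) == 0u && decode(1) == NEG && decode(2) == ABS &&
                  decode(3) == (NEG | ABS),
              "2-bit neg/abs decoding");
} // namespace negabs_sketch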
7156
7157void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7158 const MachineInstr &MI,
7159 int OpIdx) const {
7160 uint32_t V = MI.getOperand(2).getImm();
7163 if (!Subtarget->hasSafeCUPrefetch())
7164 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7165 MIB.addImm(V);
7166}
7167
7168/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7169void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7170 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7171 unsigned Val = MI.getOperand(OpIdx).getImm();
7172 unsigned New = 0;
7173 if (Val & 0x1)
7174    New |= SISrcMods::OP_SEL_0;
7175  if (Val & 0x2)
7176    New |= SISrcMods::OP_SEL_1;
7177  MIB.addImm(New);
7178}
7179
7180bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7181 return TII.isInlineConstant(Imm);
7182}
7183
7184bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7185 return TII.isInlineConstant(Imm);
7186}