LLVM 23.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
49#include "AMDGPUGenGlobalISel.inc"
52#include "AMDGPUGenGlobalISel.inc"
54{
55}
56
57const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
58
69
70// Return the wave level SGPR base address if this is a wave address.
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
78 const Function &F = I.getMF()->getFunction();
79 F.getContext().diagnose(DiagnosticInfoUnsupported(
80 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error));
81}
82
83bool AMDGPUInstructionSelector::isVCC(Register Reg,
84 const MachineRegisterInfo &MRI) const {
85 // The verifier is oblivious to s1 being a valid value for wavesize registers.
86 if (Reg.isPhysical())
87 return false;
88
89 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
90 const TargetRegisterClass *RC =
92 if (RC) {
93 const LLT Ty = MRI.getType(Reg);
94 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
95 return false;
96 // G_TRUNC s1 result is never vcc.
97 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
98 RC->hasSuperClassEq(TRI.getBoolRC());
99 }
100
101 const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
102 return RB->getID() == AMDGPU::VCCRegBankID;
103}
104
105bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
106 unsigned NewOpc) const {
107 MI.setDesc(TII.get(NewOpc));
108 MI.removeOperand(1); // Remove intrinsic ID.
109 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
110
111 MachineOperand &Dst = MI.getOperand(0);
112 MachineOperand &Src = MI.getOperand(1);
113
114 // TODO: This should be legalized to s32 if needed
115 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
116 return false;
117
118 const TargetRegisterClass *DstRC
119 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
120 const TargetRegisterClass *SrcRC
121 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
122 if (!DstRC || DstRC != SrcRC)
123 return false;
124
125 if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) ||
126 !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
127 return false;
128 const MCInstrDesc &MCID = MI.getDesc();
129 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
130 MI.getOperand(0).setIsEarlyClobber(true);
131 }
132 return true;
133}
134
135bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
136 const DebugLoc &DL = I.getDebugLoc();
137 MachineBasicBlock *BB = I.getParent();
138 I.setDesc(TII.get(TargetOpcode::COPY));
139
140 const MachineOperand &Src = I.getOperand(1);
141 MachineOperand &Dst = I.getOperand(0);
142 Register DstReg = Dst.getReg();
143 Register SrcReg = Src.getReg();
144
145 if (isVCC(DstReg, *MRI)) {
146 if (SrcReg == AMDGPU::SCC) {
147 const TargetRegisterClass *RC
148 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
149 if (!RC)
150 return true;
151 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
152 }
153
154 if (!isVCC(SrcReg, *MRI)) {
155 // TODO: Should probably leave the copy and let copyPhysReg expand it.
156 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
157 return false;
158
159 const TargetRegisterClass *SrcRC
160 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
161
162 std::optional<ValueAndVReg> ConstVal =
163 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
164 if (ConstVal) {
165 unsigned MovOpc =
166 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
167 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
168 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
169 } else {
170 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
171
172 // We can't trust the high bits at this point, so clear them.
173
174 // TODO: Skip masking high bits if def is known boolean.
175
176 if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
177 assert(Subtarget->useRealTrue16Insts());
178 const int64_t NoMods = 0;
179 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
180 .addImm(NoMods)
181 .addImm(1)
182 .addImm(NoMods)
183 .addReg(SrcReg)
184 .addImm(NoMods);
185 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
186 .addImm(NoMods)
187 .addImm(0)
188 .addImm(NoMods)
189 .addReg(MaskedReg)
190 .addImm(NoMods);
191 } else {
192 bool IsSGPR = TRI.isSGPRClass(SrcRC);
193 unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
194 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
195 .addImm(1)
196 .addReg(SrcReg);
197 if (IsSGPR)
198 And.setOperandDead(3); // Dead scc
199
200 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
201 .addImm(0)
202 .addReg(MaskedReg);
203 }
204 }
205
206 if (!MRI->getRegClassOrNull(SrcReg))
207 MRI->setRegClass(SrcReg, SrcRC);
208 I.eraseFromParent();
209 return true;
210 }
211
212 const TargetRegisterClass *RC =
213 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
214 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
215 return false;
216
217 return true;
218 }
219
220 for (const MachineOperand &MO : I.operands()) {
221 if (MO.getReg().isPhysical())
222 continue;
223
224 const TargetRegisterClass *RC =
225 TRI.getConstrainedRegClassForOperand(MO, *MRI);
226 if (!RC)
227 continue;
228 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
229 }
230 return true;
231}
232
233bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
234 const DebugLoc &DL = I.getDebugLoc();
235 MachineBasicBlock *BB = I.getParent();
236 Register VCCReg = I.getOperand(1).getReg();
237 MachineInstr *Cmp;
238
239 // Set SCC as a side effect with S_CMP or S_OR.
240 if (STI.hasScalarCompareEq64()) {
241 unsigned CmpOpc =
242 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
243 Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)).addReg(VCCReg).addImm(0);
244 } else {
245 Register DeadDst = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
246 Cmp = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_OR_B64), DeadDst)
247 .addReg(VCCReg)
248 .addReg(VCCReg);
249 }
250
251 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
252
253 Register DstReg = I.getOperand(0).getReg();
254 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
255
256 I.eraseFromParent();
257 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
258}
259
260bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
261 const DebugLoc &DL = I.getDebugLoc();
262 MachineBasicBlock *BB = I.getParent();
263
264 Register DstReg = I.getOperand(0).getReg();
265 Register SrcReg = I.getOperand(1).getReg();
266 std::optional<ValueAndVReg> Arg =
267 getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
268
269 if (Arg) {
270 const int64_t Value = Arg->Value.getZExtValue();
271 if (Value == 0) {
272 unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
273 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
274 } else {
275 assert(Value == 1);
276 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
277 }
278 I.eraseFromParent();
279 return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
280 }
281
282 // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
283 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
284
285 unsigned SelectOpcode =
286 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
287 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
288 .addReg(TRI.getExec())
289 .addImm(0);
290
291 I.eraseFromParent();
293 return true;
294}
295
296bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
297 Register DstReg = I.getOperand(0).getReg();
298 Register SrcReg = I.getOperand(1).getReg();
299
300 const DebugLoc &DL = I.getDebugLoc();
301 MachineBasicBlock *BB = I.getParent();
302
303 auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
304 .addReg(SrcReg);
305
306 I.eraseFromParent();
307 constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
308 return true;
309}
310
311bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
312 const Register DefReg = I.getOperand(0).getReg();
313 const LLT DefTy = MRI->getType(DefReg);
314
315 // S1 G_PHIs should not be selected in instruction-select, instead:
316 // - divergent S1 G_PHI should go through lane mask merging algorithm
317 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
318 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
319 if (DefTy == LLT::scalar(1))
320 return false;
321
322 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
323
324 const RegClassOrRegBank &RegClassOrBank =
325 MRI->getRegClassOrRegBank(DefReg);
326
327 const TargetRegisterClass *DefRC =
329 if (!DefRC) {
330 if (!DefTy.isValid()) {
331 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
332 return false;
333 }
334
335 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
336 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
337 if (!DefRC) {
338 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
339 return false;
340 }
341 }
342
343 // If inputs have register bank, assign corresponding reg class.
344 // Note: registers don't need to have the same reg bank.
345 for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
346 const Register SrcReg = I.getOperand(i).getReg();
347
348 const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
349 if (RB) {
350 const LLT SrcTy = MRI->getType(SrcReg);
351 const TargetRegisterClass *SrcRC =
352 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
353 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
354 return false;
355 }
356 }
357
358 I.setDesc(TII.get(TargetOpcode::PHI));
359 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
360}
361
363AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
364 const TargetRegisterClass &SubRC,
365 unsigned SubIdx) const {
366
367 MachineInstr *MI = MO.getParent();
368 MachineBasicBlock *BB = MO.getParent()->getParent();
369 Register DstReg = MRI->createVirtualRegister(&SubRC);
370
371 if (MO.isReg()) {
372 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
373 Register Reg = MO.getReg();
374 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
375 .addReg(Reg, {}, ComposedSubIdx);
376
377 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
378 MO.isKill(), MO.isDead(), MO.isUndef(),
379 MO.isEarlyClobber(), 0, MO.isDebug(),
380 MO.isInternalRead());
381 }
382
383 assert(MO.isImm());
384
385 APInt Imm(64, MO.getImm());
386
387 switch (SubIdx) {
388 default:
389 llvm_unreachable("do not know to split immediate with this sub index.");
390 case AMDGPU::sub0:
391 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
392 case AMDGPU::sub1:
393 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
394 }
395}
396
397static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
398 switch (Opc) {
399 case AMDGPU::G_AND:
400 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
401 case AMDGPU::G_OR:
402 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
403 case AMDGPU::G_XOR:
404 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
405 default:
406 llvm_unreachable("not a bit op");
407 }
408}
409
410bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
411 Register DstReg = I.getOperand(0).getReg();
412 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
413
414 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
415 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
416 DstRB->getID() != AMDGPU::VCCRegBankID)
417 return false;
418
419 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
420 STI.isWave64());
421 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
422
423 // Dead implicit-def of scc
424 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
425 true, // isImp
426 false, // isKill
427 true)); // isDead
428 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
429 return true;
430}
431
432bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
433 MachineBasicBlock *BB = I.getParent();
434 MachineFunction *MF = BB->getParent();
435 Register DstReg = I.getOperand(0).getReg();
436 const DebugLoc &DL = I.getDebugLoc();
437 LLT Ty = MRI->getType(DstReg);
438 if (Ty.isVector())
439 return false;
440
441 unsigned Size = Ty.getSizeInBits();
442 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
443 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
444 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
445
446 if (Size == 32) {
447 if (IsSALU) {
448 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
449 MachineInstr *Add =
450 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
451 .add(I.getOperand(1))
452 .add(I.getOperand(2))
453 .setOperandDead(3); // Dead scc
454 I.eraseFromParent();
455 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
456 return true;
457 }
458
459 if (STI.hasAddNoCarryInsts()) {
460 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
461 I.setDesc(TII.get(Opc));
462 I.addOperand(*MF, MachineOperand::CreateImm(0));
463 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
464 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
465 return true;
466 }
467
468 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
469
470 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
471 MachineInstr *Add
472 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
473 .addDef(UnusedCarry, RegState::Dead)
474 .add(I.getOperand(1))
475 .add(I.getOperand(2))
476 .addImm(0);
477 I.eraseFromParent();
478 constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
479 return true;
480 }
481
482 assert(!Sub && "illegal sub should not reach here");
483
484 const TargetRegisterClass &RC
485 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
486 const TargetRegisterClass &HalfRC
487 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
488
489 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
490 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
491 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
492 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
493
494 Register DstLo = MRI->createVirtualRegister(&HalfRC);
495 Register DstHi = MRI->createVirtualRegister(&HalfRC);
496
497 if (IsSALU) {
498 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
499 .add(Lo1)
500 .add(Lo2);
501 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
502 .add(Hi1)
503 .add(Hi2)
504 .setOperandDead(3); // Dead scc
505 } else {
506 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
507 Register CarryReg = MRI->createVirtualRegister(CarryRC);
508 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
509 .addDef(CarryReg)
510 .add(Lo1)
511 .add(Lo2)
512 .addImm(0);
513 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
514 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
515 .add(Hi1)
516 .add(Hi2)
517 .addReg(CarryReg, RegState::Kill)
518 .addImm(0);
519
520 constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI);
521 }
522
523 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
524 .addReg(DstLo)
525 .addImm(AMDGPU::sub0)
526 .addReg(DstHi)
527 .addImm(AMDGPU::sub1);
528
529
530 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
531 return false;
532
533 I.eraseFromParent();
534 return true;
535}
536
537bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
538 MachineInstr &I) const {
539 MachineBasicBlock *BB = I.getParent();
540 MachineFunction *MF = BB->getParent();
541 const DebugLoc &DL = I.getDebugLoc();
542 Register Dst0Reg = I.getOperand(0).getReg();
543 Register Dst1Reg = I.getOperand(1).getReg();
544 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
545 I.getOpcode() == AMDGPU::G_UADDE;
546 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
547 I.getOpcode() == AMDGPU::G_USUBE;
548
549 if (isVCC(Dst1Reg, *MRI)) {
550 unsigned NoCarryOpc =
551 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
552 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
553 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
554 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
555 I.addOperand(*MF, MachineOperand::CreateImm(0));
556 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
557 return true;
558 }
559
560 Register Src0Reg = I.getOperand(2).getReg();
561 Register Src1Reg = I.getOperand(3).getReg();
562
563 if (HasCarryIn) {
564 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
565 .addReg(I.getOperand(4).getReg());
566 }
567
568 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
569 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
570
571 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
572 .add(I.getOperand(2))
573 .add(I.getOperand(3));
574
575 if (MRI->use_nodbg_empty(Dst1Reg)) {
576 CarryInst.setOperandDead(3); // Dead scc
577 } else {
578 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
579 .addReg(AMDGPU::SCC);
580 if (!MRI->getRegClassOrNull(Dst1Reg))
581 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
582 }
583
584 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
585 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
586 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
587 return false;
588
589 if (HasCarryIn &&
590 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
591 AMDGPU::SReg_32RegClass, *MRI))
592 return false;
593
594 I.eraseFromParent();
595 return true;
596}
597
598bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
599 MachineInstr &I) const {
600 MachineBasicBlock *BB = I.getParent();
601 MachineFunction *MF = BB->getParent();
602 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
603 bool UseNoCarry = Subtarget->hasMadNC64_32Insts() &&
604 MRI->use_nodbg_empty(I.getOperand(1).getReg());
605
606 unsigned Opc;
607 if (Subtarget->hasMADIntraFwdBug())
608 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
609 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
610 else if (UseNoCarry)
611 Opc = IsUnsigned ? AMDGPU::V_MAD_NC_U64_U32_e64
612 : AMDGPU::V_MAD_NC_I64_I32_e64;
613 else
614 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
615
616 if (UseNoCarry)
617 I.removeOperand(1);
618
619 I.setDesc(TII.get(Opc));
620 I.addOperand(*MF, MachineOperand::CreateImm(0));
621 I.addImplicitDefUseOperands(*MF);
622 I.getOperand(0).setIsEarlyClobber(true);
623 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
624 return true;
625}
626
627// TODO: We should probably legalize these to only using 32-bit results.
628bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
629 MachineBasicBlock *BB = I.getParent();
630 Register DstReg = I.getOperand(0).getReg();
631 Register SrcReg = I.getOperand(1).getReg();
632 LLT DstTy = MRI->getType(DstReg);
633 LLT SrcTy = MRI->getType(SrcReg);
634 const unsigned SrcSize = SrcTy.getSizeInBits();
635 unsigned DstSize = DstTy.getSizeInBits();
636
637 // TODO: Should handle any multiple of 32 offset.
638 unsigned Offset = I.getOperand(2).getImm();
639 if (Offset % 32 != 0 || DstSize > 128)
640 return false;
641
642 // 16-bit operations really use 32-bit registers.
643 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
644 if (DstSize == 16)
645 DstSize = 32;
646
647 const TargetRegisterClass *DstRC =
648 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
649 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
650 return false;
651
652 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
653 const TargetRegisterClass *SrcRC =
654 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
655 if (!SrcRC)
656 return false;
657 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
658 DstSize / 32);
659 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
660 if (!SrcRC)
661 return false;
662
663 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
664 *SrcRC, I.getOperand(1));
665 const DebugLoc &DL = I.getDebugLoc();
666 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
667 .addReg(SrcReg, {}, SubReg);
668
669 I.eraseFromParent();
670 return true;
671}
672
673bool AMDGPUInstructionSelector::selectS16MergeToS32(MachineInstr &MI) const {
674 Register Dst = MI.getOperand(0).getReg();
675 Register Src0 = MI.getOperand(1).getReg();
676 Register Src1 = MI.getOperand(2).getReg();
677
678 LLT Src0Ty = MRI->getType(Src0);
679 LLT Src1Ty = MRI->getType(Src1);
680
681 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
682 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, TRI);
683 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, TRI);
684 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
685
686 Register ShiftSrc0;
687 Register ShiftSrc1;
688
689 const DebugLoc &DL = MI.getDebugLoc();
690 MachineBasicBlock *BB = MI.getParent();
691
692 // VGPR case
693 if (IsVector) {
694 // If source are both VGPR16, use REG_SEQUENCE with lo16/hi16 subregisters
695 if (Src0Bank->getID() == AMDGPU::VGPRRegBankID &&
696 Src1Bank->getID() == AMDGPU::VGPRRegBankID &&
697 Src0Ty == LLT::scalar(16) && Src1Ty == LLT::scalar(16)) {
698 BuildMI(*BB, MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), Dst)
699 .addReg(Src0)
700 .addImm(AMDGPU::lo16)
701 .addReg(Src1)
702 .addImm(AMDGPU::hi16);
703
704 if (!RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI))
705 return false;
706
707 MI.eraseFromParent();
708 return true;
709 }
710
711 // Otherwise, use V_LSHL_OR_B32_e64
712 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
713 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
714 .addImm(0xFFFF)
715 .addReg(Src0);
716 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
717
718 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
719 .addReg(Src1)
720 .addImm(16)
721 .addReg(TmpReg);
722 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
723
724 MI.eraseFromParent();
725 return true;
726 }
727
728 // SGPR case -> S_PACK_*_B32_B16
729 // With multiple uses of the shift, this will duplicate the shift and
730 // increase register pressure.
731 //
732 // (merge (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
733 // => (S_PACK_HH_B32_B16 $src0, $src1)
734 // (merge (lshr_oneuse SReg_32:$src0, 16), $src1)
735 // => (S_PACK_HL_B32_B16 $src0, $src1)
736 // (merge $src0, (lshr_oneuse SReg_32:$src1, 16))
737 // => (S_PACK_LH_B32_B16 $src0, $src1)
738 // (merge $src0, $src1)
739 // => (S_PACK_LL_B32_B16 $src0, $src1)
740
741 bool Shift0 = mi_match(
742 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
743
744 bool Shift1 = mi_match(
745 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
746
747 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
748 if (Shift0 && Shift1) {
749 Opc = AMDGPU::S_PACK_HH_B32_B16;
750 MI.getOperand(1).setReg(ShiftSrc0);
751 MI.getOperand(2).setReg(ShiftSrc1);
752 } else if (Shift1) {
753 Opc = AMDGPU::S_PACK_LH_B32_B16;
754 MI.getOperand(2).setReg(ShiftSrc1);
755 } else if (Shift0) {
756 auto ConstSrc1 =
757 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
758 if (ConstSrc1 && ConstSrc1->Value == 0) {
759 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
760 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
761 .addReg(ShiftSrc0)
762 .addImm(16)
763 .setOperandDead(3); // Dead scc
764
765 MI.eraseFromParent();
766 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
767 return true;
768 }
769 if (STI.hasSPackHL()) {
770 Opc = AMDGPU::S_PACK_HL_B32_B16;
771 MI.getOperand(1).setReg(ShiftSrc0);
772 }
773 }
774
775 MI.setDesc(TII.get(Opc));
776 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
777 return true;
778}
779
780bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
781 MachineBasicBlock *BB = MI.getParent();
782 Register DstReg = MI.getOperand(0).getReg();
783 LLT DstTy = MRI->getType(DstReg);
784 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
785
786 const unsigned SrcSize = SrcTy.getSizeInBits();
787 if (SrcSize < 32) {
788 // Handle s32 <- G_MERGE_VALUES s16, s16
789 if (SrcSize == 16 && DstTy.getSizeInBits() == 32 &&
790 MI.getNumOperands() == 3) {
791 return selectS16MergeToS32(MI);
792 }
793 return selectImpl(MI, *CoverageInfo);
794 }
795
796 const DebugLoc &DL = MI.getDebugLoc();
797 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
798 const unsigned DstSize = DstTy.getSizeInBits();
799 const TargetRegisterClass *DstRC =
800 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
801 if (!DstRC)
802 return false;
803
804 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
805 MachineInstrBuilder MIB =
806 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
807 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
808 MachineOperand &Src = MI.getOperand(I + 1);
809 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
810 MIB.addImm(SubRegs[I]);
811
812 const TargetRegisterClass *SrcRC
813 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
814 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
815 return false;
816 }
817
818 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
819 return false;
820
821 MI.eraseFromParent();
822 return true;
823}
824
825bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
826 MachineBasicBlock *BB = MI.getParent();
827 const int NumDst = MI.getNumOperands() - 1;
828
829 MachineOperand &Src = MI.getOperand(NumDst);
830
831 Register SrcReg = Src.getReg();
832 Register DstReg0 = MI.getOperand(0).getReg();
833 LLT DstTy = MRI->getType(DstReg0);
834 LLT SrcTy = MRI->getType(SrcReg);
835
836 const unsigned DstSize = DstTy.getSizeInBits();
837 const unsigned SrcSize = SrcTy.getSizeInBits();
838 const DebugLoc &DL = MI.getDebugLoc();
839 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
840
841 const TargetRegisterClass *SrcRC =
842 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
843 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
844 return false;
845
846 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
847 // source, and this relies on the fact that the same subregister indices are
848 // used for both.
849 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
850 for (int I = 0, E = NumDst; I != E; ++I) {
851 MachineOperand &Dst = MI.getOperand(I);
852 // hi16:sreg_32 is not allowed so explicitly shift upper 16-bits.
853 if (SrcBank->getID() == AMDGPU::SGPRRegBankID &&
854 SubRegs[I] == AMDGPU::hi16) {
855 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst.getReg())
856 .addReg(SrcReg)
857 .addImm(16);
858 } else {
859 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
860 .addReg(SrcReg, {}, SubRegs[I]);
861 }
862
863 // Make sure the subregister index is valid for the source register.
864 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
865 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
866 return false;
867
868 const TargetRegisterClass *DstRC =
869 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
870 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
871 return false;
872 }
873
874 MI.eraseFromParent();
875 return true;
876}
877
878bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
879 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
880 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
881
882 Register Src0 = MI.getOperand(1).getReg();
883 Register Src1 = MI.getOperand(2).getReg();
884 LLT SrcTy = MRI->getType(Src0);
885 const unsigned SrcSize = SrcTy.getSizeInBits();
886
887 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
888 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
889 return selectG_MERGE_VALUES(MI);
890 }
891
892 // Selection logic below is for V2S16 only.
893 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
894 Register Dst = MI.getOperand(0).getReg();
895 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
896 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
897 SrcTy != LLT::scalar(32)))
898 return selectImpl(MI, *CoverageInfo);
899
900 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
901 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
902 return false;
903
904 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
905 DstBank->getID() == AMDGPU::VGPRRegBankID);
906 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
907
908 const DebugLoc &DL = MI.getDebugLoc();
909 MachineBasicBlock *BB = MI.getParent();
910
911 // First, before trying TableGen patterns, check if both sources are
912 // constants. In those cases, we can trivially compute the final constant
913 // and emit a simple move.
914 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
915 if (ConstSrc1) {
916 auto ConstSrc0 =
917 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
918 if (ConstSrc0) {
919 const int64_t K0 = ConstSrc0->Value.getSExtValue();
920 const int64_t K1 = ConstSrc1->Value.getSExtValue();
921 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
922 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
923 uint32_t Imm = Lo16 | (Hi16 << 16);
924
925 // VALU
926 if (IsVector) {
927 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
928 MI.eraseFromParent();
929 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
930 }
931
932 // SALU
933 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
934 MI.eraseFromParent();
935 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
936 }
937 }
938
939 // Now try TableGen patterns.
940 if (selectImpl(MI, *CoverageInfo))
941 return true;
942
943 // TODO: This should probably be a combine somewhere
944 // (build_vector $src0, undef) -> copy $src0
945 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
946 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
947 MI.setDesc(TII.get(AMDGPU::COPY));
948 MI.removeOperand(2);
949 const auto &RC =
950 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
951 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
952 RBI.constrainGenericRegister(Src0, RC, *MRI);
953 }
954
955 return selectS16MergeToS32(MI);
956}
957
958bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
959 const MachineOperand &MO = I.getOperand(0);
960
961 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
962 // regbank check here is to know why getConstrainedRegClassForOperand failed.
963 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
964 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
965 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
966 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
967 return true;
968 }
969
970 return false;
971}
972
973bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
974 MachineBasicBlock *BB = I.getParent();
975
976 Register DstReg = I.getOperand(0).getReg();
977 Register Src0Reg = I.getOperand(1).getReg();
978 Register Src1Reg = I.getOperand(2).getReg();
979 LLT Src1Ty = MRI->getType(Src1Reg);
980
981 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
982 unsigned InsSize = Src1Ty.getSizeInBits();
983
984 int64_t Offset = I.getOperand(3).getImm();
985
986 // FIXME: These cases should have been illegal and unnecessary to check here.
987 if (Offset % 32 != 0 || InsSize % 32 != 0)
988 return false;
989
990 // Currently not handled by getSubRegFromChannel.
991 if (InsSize > 128)
992 return false;
993
994 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
995 if (SubReg == AMDGPU::NoSubRegister)
996 return false;
997
998 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
999 const TargetRegisterClass *DstRC =
1000 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
1001 if (!DstRC)
1002 return false;
1003
1004 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
1005 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
1006 const TargetRegisterClass *Src0RC =
1007 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
1008 const TargetRegisterClass *Src1RC =
1009 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
1010
1011 // Deal with weird cases where the class only partially supports the subreg
1012 // index.
1013 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
1014 if (!Src0RC || !Src1RC)
1015 return false;
1016
1017 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
1018 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
1019 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
1020 return false;
1021
1022 const DebugLoc &DL = I.getDebugLoc();
1023 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
1024 .addReg(Src0Reg)
1025 .addReg(Src1Reg)
1026 .addImm(SubReg);
1027
1028 I.eraseFromParent();
1029 return true;
1030}
1031
1032bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
1033 Register DstReg = MI.getOperand(0).getReg();
1034 Register SrcReg = MI.getOperand(1).getReg();
1035 Register OffsetReg = MI.getOperand(2).getReg();
1036 Register WidthReg = MI.getOperand(3).getReg();
1037
1038 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
1039 "scalar BFX instructions are expanded in regbankselect");
1040 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
1041 "64-bit vector BFX instructions are expanded in regbankselect");
1042
1043 const DebugLoc &DL = MI.getDebugLoc();
1044 MachineBasicBlock *MBB = MI.getParent();
1045
1046 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
1047 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1048 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
1049 .addReg(SrcReg)
1050 .addReg(OffsetReg)
1051 .addReg(WidthReg);
1052 MI.eraseFromParent();
1053 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1054 return true;
1055}
1056
1057bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
1058 if (STI.getLDSBankCount() != 16)
1059 return selectImpl(MI, *CoverageInfo);
1060
1061 Register Dst = MI.getOperand(0).getReg();
1062 Register Src0 = MI.getOperand(2).getReg();
1063 Register M0Val = MI.getOperand(6).getReg();
1064 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
1065 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
1066 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
1067 return false;
1068
1069 // This requires 2 instructions. It is possible to write a pattern to support
1070 // this, but the generated isel emitter doesn't correctly deal with multiple
1071 // output instructions using the same physical register input. The copy to m0
1072 // is incorrectly placed before the second instruction.
1073 //
1074 // TODO: Match source modifiers.
1075
1076 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1077 const DebugLoc &DL = MI.getDebugLoc();
1078 MachineBasicBlock *MBB = MI.getParent();
1079
1080 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1081 .addReg(M0Val);
1082 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
1083 .addImm(2)
1084 .addImm(MI.getOperand(4).getImm()) // $attr
1085 .addImm(MI.getOperand(3).getImm()); // $attrchan
1086
1087 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
1088 .addImm(0) // $src0_modifiers
1089 .addReg(Src0) // $src0
1090 .addImm(MI.getOperand(4).getImm()) // $attr
1091 .addImm(MI.getOperand(3).getImm()) // $attrchan
1092 .addImm(0) // $src2_modifiers
1093 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
1094 .addImm(MI.getOperand(5).getImm()) // $high
1095 .addImm(0) // $clamp
1096 .addImm(0); // $omod
1097
1098 MI.eraseFromParent();
1099 return true;
1100}
1101
1102// Writelane is special in that it can use SGPR and M0 (which would normally
1103// count as using the constant bus twice - but in this case it is allowed since
1104// the lane selector doesn't count as a use of the constant bus). However, it is
1105// still required to abide by the 1 SGPR rule. Fix this up if we might have
1106// multiple SGPRs.
1107bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1108 // With a constant bus limit of at least 2, there's no issue.
1109 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1110 return selectImpl(MI, *CoverageInfo);
1111
1112 MachineBasicBlock *MBB = MI.getParent();
1113 const DebugLoc &DL = MI.getDebugLoc();
1114 Register VDst = MI.getOperand(0).getReg();
1115 Register Val = MI.getOperand(2).getReg();
1116 Register LaneSelect = MI.getOperand(3).getReg();
1117 Register VDstIn = MI.getOperand(4).getReg();
1118
1119 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1120
1121 std::optional<ValueAndVReg> ConstSelect =
1122 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1123 if (ConstSelect) {
1124 // The selector has to be an inline immediate, so we can use whatever for
1125 // the other operands.
1126 MIB.addReg(Val);
1127 MIB.addImm(ConstSelect->Value.getSExtValue() &
1128 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1129 } else {
1130 std::optional<ValueAndVReg> ConstVal =
1132
1133 // If the value written is an inline immediate, we can get away without a
1134 // copy to m0.
1135 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1136 STI.hasInv2PiInlineImm())) {
1137 MIB.addImm(ConstVal->Value.getSExtValue());
1138 MIB.addReg(LaneSelect);
1139 } else {
1140 MIB.addReg(Val);
1141
1142 // If the lane selector was originally in a VGPR and copied with
1143 // readfirstlane, there's a hazard to read the same SGPR from the
1144 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1145 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1146
1147 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1148 .addReg(LaneSelect);
1149 MIB.addReg(AMDGPU::M0);
1150 }
1151 }
1152
1153 MIB.addReg(VDstIn);
1154
1155 MI.eraseFromParent();
1156 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1157 return true;
1158}
1159
1160// We need to handle this here because tablegen doesn't support matching
1161// instructions with multiple outputs.
1162bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1163 Register Dst0 = MI.getOperand(0).getReg();
1164 Register Dst1 = MI.getOperand(1).getReg();
1165
1166 LLT Ty = MRI->getType(Dst0);
1167 unsigned Opc;
1168 if (Ty == LLT::scalar(32))
1169 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1170 else if (Ty == LLT::scalar(64))
1171 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1172 else
1173 return false;
1174
1175 // TODO: Match source modifiers.
1176
1177 const DebugLoc &DL = MI.getDebugLoc();
1178 MachineBasicBlock *MBB = MI.getParent();
1179
1180 Register Numer = MI.getOperand(3).getReg();
1181 Register Denom = MI.getOperand(4).getReg();
1182 unsigned ChooseDenom = MI.getOperand(5).getImm();
1183
1184 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1185
1186 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1187 .addDef(Dst1)
1188 .addImm(0) // $src0_modifiers
1189 .addUse(Src0) // $src0
1190 .addImm(0) // $src1_modifiers
1191 .addUse(Denom) // $src1
1192 .addImm(0) // $src2_modifiers
1193 .addUse(Numer) // $src2
1194 .addImm(0) // $clamp
1195 .addImm(0); // $omod
1196
1197 MI.eraseFromParent();
1198 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1199 return true;
1200}
1201
1202bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1203 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1204 switch (IntrinsicID) {
1205 case Intrinsic::amdgcn_if_break: {
1206 MachineBasicBlock *BB = I.getParent();
1207
1208 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1209 // SelectionDAG uses for wave32 vs wave64.
1210 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1211 .add(I.getOperand(0))
1212 .add(I.getOperand(2))
1213 .add(I.getOperand(3));
1214
1215 Register DstReg = I.getOperand(0).getReg();
1216 Register Src0Reg = I.getOperand(2).getReg();
1217 Register Src1Reg = I.getOperand(3).getReg();
1218
1219 I.eraseFromParent();
1220
1221 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1222 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1223
1224 return true;
1225 }
1226 case Intrinsic::amdgcn_interp_p1_f16:
1227 return selectInterpP1F16(I);
1228 case Intrinsic::amdgcn_wqm:
1229 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1230 case Intrinsic::amdgcn_softwqm:
1231 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1232 case Intrinsic::amdgcn_strict_wwm:
1233 case Intrinsic::amdgcn_wwm:
1234 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1235 case Intrinsic::amdgcn_strict_wqm:
1236 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1237 case Intrinsic::amdgcn_writelane:
1238 return selectWritelane(I);
1239 case Intrinsic::amdgcn_div_scale:
1240 return selectDivScale(I);
1241 case Intrinsic::amdgcn_icmp:
1242 case Intrinsic::amdgcn_fcmp:
1243 if (selectImpl(I, *CoverageInfo))
1244 return true;
1245 return selectIntrinsicCmp(I);
1246 case Intrinsic::amdgcn_ballot:
1247 return selectBallot(I);
1248 case Intrinsic::amdgcn_reloc_constant:
1249 return selectRelocConstant(I);
1250 case Intrinsic::amdgcn_groupstaticsize:
1251 return selectGroupStaticSize(I);
1252 case Intrinsic::returnaddress:
1253 return selectReturnAddress(I);
1254 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1255 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1256 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1257 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1258 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1259 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1260 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1261 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1262 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1263 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1264 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1265 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1266 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1267 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1268 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1269 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1270 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1271 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1272 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1273 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1274 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1275 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1276 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1277 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1278 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1279 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1280 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1281 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1282 return selectSMFMACIntrin(I);
1283 case Intrinsic::amdgcn_permlane16_swap:
1284 case Intrinsic::amdgcn_permlane32_swap:
1285 return selectPermlaneSwapIntrin(I, IntrinsicID);
1286 case Intrinsic::amdgcn_wave_shuffle:
1287 return selectWaveShuffleIntrin(I);
1288 case Intrinsic::amdgcn_fma_legacy:
1289 if (!STI.hasFmaLegacy32Insts()) {
1291 return false;
1292 }
1293 return selectImpl(I, *CoverageInfo);
1294 case Intrinsic::amdgcn_sudot4:
1295 case Intrinsic::amdgcn_sudot8:
1296 if (!STI.hasDot8Insts()) {
1298 return false;
1299 }
1300 return selectImpl(I, *CoverageInfo);
1301 default:
1302 return selectImpl(I, *CoverageInfo);
1303 }
1304}
1305
1307 const GCNSubtarget &ST) {
1308 if (Size != 16 && Size != 32 && Size != 64)
1309 return -1;
1310
1311 if (Size == 16 && !ST.has16BitInsts())
1312 return -1;
1313
1314 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
1315 unsigned FakeS16Opc, unsigned S32Opc,
1316 unsigned S64Opc) {
1317 if (Size == 16)
1318 return ST.hasTrue16BitInsts()
1319 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
1320 : S16Opc;
1321 if (Size == 32)
1322 return S32Opc;
1323 return S64Opc;
1324 };
1325
1326 switch (P) {
1327 default:
1328 llvm_unreachable("Unknown condition code!");
1329 case CmpInst::ICMP_NE:
1330 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1331 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1332 AMDGPU::V_CMP_NE_U64_e64);
1333 case CmpInst::ICMP_EQ:
1334 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1335 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1336 AMDGPU::V_CMP_EQ_U64_e64);
1337 case CmpInst::ICMP_SGT:
1338 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1339 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1340 AMDGPU::V_CMP_GT_I64_e64);
1341 case CmpInst::ICMP_SGE:
1342 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1343 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1344 AMDGPU::V_CMP_GE_I64_e64);
1345 case CmpInst::ICMP_SLT:
1346 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1347 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1348 AMDGPU::V_CMP_LT_I64_e64);
1349 case CmpInst::ICMP_SLE:
1350 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1351 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1352 AMDGPU::V_CMP_LE_I64_e64);
1353 case CmpInst::ICMP_UGT:
1354 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1355 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1356 AMDGPU::V_CMP_GT_U64_e64);
1357 case CmpInst::ICMP_UGE:
1358 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1359 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1360 AMDGPU::V_CMP_GE_U64_e64);
1361 case CmpInst::ICMP_ULT:
1362 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1363 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1364 AMDGPU::V_CMP_LT_U64_e64);
1365 case CmpInst::ICMP_ULE:
1366 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1367 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1368 AMDGPU::V_CMP_LE_U64_e64);
1369
1370 case CmpInst::FCMP_OEQ:
1371 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1372 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1373 AMDGPU::V_CMP_EQ_F64_e64);
1374 case CmpInst::FCMP_OGT:
1375 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1376 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1377 AMDGPU::V_CMP_GT_F64_e64);
1378 case CmpInst::FCMP_OGE:
1379 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1380 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1381 AMDGPU::V_CMP_GE_F64_e64);
1382 case CmpInst::FCMP_OLT:
1383 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1384 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1385 AMDGPU::V_CMP_LT_F64_e64);
1386 case CmpInst::FCMP_OLE:
1387 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1388 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1389 AMDGPU::V_CMP_LE_F64_e64);
1390 case CmpInst::FCMP_ONE:
1391 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1392 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1393 AMDGPU::V_CMP_NEQ_F64_e64);
1394 case CmpInst::FCMP_ORD:
1395 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1396 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1397 AMDGPU::V_CMP_O_F64_e64);
1398 case CmpInst::FCMP_UNO:
1399 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1400 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1401 AMDGPU::V_CMP_U_F64_e64);
1402 case CmpInst::FCMP_UEQ:
1403 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1404 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1405 AMDGPU::V_CMP_NLG_F64_e64);
1406 case CmpInst::FCMP_UGT:
1407 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1408 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1409 AMDGPU::V_CMP_NLE_F64_e64);
1410 case CmpInst::FCMP_UGE:
1411 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1412 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1413 AMDGPU::V_CMP_NLT_F64_e64);
1414 case CmpInst::FCMP_ULT:
1415 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1416 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1417 AMDGPU::V_CMP_NGE_F64_e64);
1418 case CmpInst::FCMP_ULE:
1419 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1420 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1421 AMDGPU::V_CMP_NGT_F64_e64);
1422 case CmpInst::FCMP_UNE:
1423 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1424 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1425 AMDGPU::V_CMP_NEQ_F64_e64);
1426 case CmpInst::FCMP_TRUE:
1427 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1428 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1429 AMDGPU::V_CMP_TRU_F64_e64);
1431 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1432 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1433 AMDGPU::V_CMP_F_F64_e64);
1434 }
1435}
1436
1437int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1438 unsigned Size) const {
1439 if (Size == 64) {
1440 if (!STI.hasScalarCompareEq64())
1441 return -1;
1442
1443 switch (P) {
1444 case CmpInst::ICMP_NE:
1445 return AMDGPU::S_CMP_LG_U64;
1446 case CmpInst::ICMP_EQ:
1447 return AMDGPU::S_CMP_EQ_U64;
1448 default:
1449 return -1;
1450 }
1451 }
1452
1453 if (Size == 32) {
1454 switch (P) {
1455 case CmpInst::ICMP_NE:
1456 return AMDGPU::S_CMP_LG_U32;
1457 case CmpInst::ICMP_EQ:
1458 return AMDGPU::S_CMP_EQ_U32;
1459 case CmpInst::ICMP_SGT:
1460 return AMDGPU::S_CMP_GT_I32;
1461 case CmpInst::ICMP_SGE:
1462 return AMDGPU::S_CMP_GE_I32;
1463 case CmpInst::ICMP_SLT:
1464 return AMDGPU::S_CMP_LT_I32;
1465 case CmpInst::ICMP_SLE:
1466 return AMDGPU::S_CMP_LE_I32;
1467 case CmpInst::ICMP_UGT:
1468 return AMDGPU::S_CMP_GT_U32;
1469 case CmpInst::ICMP_UGE:
1470 return AMDGPU::S_CMP_GE_U32;
1471 case CmpInst::ICMP_ULT:
1472 return AMDGPU::S_CMP_LT_U32;
1473 case CmpInst::ICMP_ULE:
1474 return AMDGPU::S_CMP_LE_U32;
1475 case CmpInst::FCMP_OEQ:
1476 return AMDGPU::S_CMP_EQ_F32;
1477 case CmpInst::FCMP_OGT:
1478 return AMDGPU::S_CMP_GT_F32;
1479 case CmpInst::FCMP_OGE:
1480 return AMDGPU::S_CMP_GE_F32;
1481 case CmpInst::FCMP_OLT:
1482 return AMDGPU::S_CMP_LT_F32;
1483 case CmpInst::FCMP_OLE:
1484 return AMDGPU::S_CMP_LE_F32;
1485 case CmpInst::FCMP_ONE:
1486 return AMDGPU::S_CMP_LG_F32;
1487 case CmpInst::FCMP_ORD:
1488 return AMDGPU::S_CMP_O_F32;
1489 case CmpInst::FCMP_UNO:
1490 return AMDGPU::S_CMP_U_F32;
1491 case CmpInst::FCMP_UEQ:
1492 return AMDGPU::S_CMP_NLG_F32;
1493 case CmpInst::FCMP_UGT:
1494 return AMDGPU::S_CMP_NLE_F32;
1495 case CmpInst::FCMP_UGE:
1496 return AMDGPU::S_CMP_NLT_F32;
1497 case CmpInst::FCMP_ULT:
1498 return AMDGPU::S_CMP_NGE_F32;
1499 case CmpInst::FCMP_ULE:
1500 return AMDGPU::S_CMP_NGT_F32;
1501 case CmpInst::FCMP_UNE:
1502 return AMDGPU::S_CMP_NEQ_F32;
1503 default:
1504 llvm_unreachable("Unknown condition code!");
1505 }
1506 }
1507
1508 if (Size == 16) {
1509 if (!STI.hasSALUFloatInsts())
1510 return -1;
1511
1512 switch (P) {
1513 case CmpInst::FCMP_OEQ:
1514 return AMDGPU::S_CMP_EQ_F16;
1515 case CmpInst::FCMP_OGT:
1516 return AMDGPU::S_CMP_GT_F16;
1517 case CmpInst::FCMP_OGE:
1518 return AMDGPU::S_CMP_GE_F16;
1519 case CmpInst::FCMP_OLT:
1520 return AMDGPU::S_CMP_LT_F16;
1521 case CmpInst::FCMP_OLE:
1522 return AMDGPU::S_CMP_LE_F16;
1523 case CmpInst::FCMP_ONE:
1524 return AMDGPU::S_CMP_LG_F16;
1525 case CmpInst::FCMP_ORD:
1526 return AMDGPU::S_CMP_O_F16;
1527 case CmpInst::FCMP_UNO:
1528 return AMDGPU::S_CMP_U_F16;
1529 case CmpInst::FCMP_UEQ:
1530 return AMDGPU::S_CMP_NLG_F16;
1531 case CmpInst::FCMP_UGT:
1532 return AMDGPU::S_CMP_NLE_F16;
1533 case CmpInst::FCMP_UGE:
1534 return AMDGPU::S_CMP_NLT_F16;
1535 case CmpInst::FCMP_ULT:
1536 return AMDGPU::S_CMP_NGE_F16;
1537 case CmpInst::FCMP_ULE:
1538 return AMDGPU::S_CMP_NGT_F16;
1539 case CmpInst::FCMP_UNE:
1540 return AMDGPU::S_CMP_NEQ_F16;
1541 default:
1542 llvm_unreachable("Unknown condition code!");
1543 }
1544 }
1545
1546 return -1;
1547}
1548
1549bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1550
1551 MachineBasicBlock *BB = I.getParent();
1552 const DebugLoc &DL = I.getDebugLoc();
1553
1554 Register SrcReg = I.getOperand(2).getReg();
1555 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1556
1557 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1558
1559 Register CCReg = I.getOperand(0).getReg();
1560 if (!isVCC(CCReg, *MRI)) {
1561 int Opcode = getS_CMPOpcode(Pred, Size);
1562 if (Opcode == -1)
1563 return false;
1564 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1565 .add(I.getOperand(2))
1566 .add(I.getOperand(3));
1567 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1568 .addReg(AMDGPU::SCC);
1569 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1570 bool Ret =
1571 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1572 I.eraseFromParent();
1573 return Ret;
1574 }
1575
1576 if (I.getOpcode() == AMDGPU::G_FCMP)
1577 return false;
1578
1579 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1580 if (Opcode == -1)
1581 return false;
1582
1583 MachineInstrBuilder ICmp;
1584 // t16 instructions
1585 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
1586 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1587 .addImm(0)
1588 .add(I.getOperand(2))
1589 .addImm(0)
1590 .add(I.getOperand(3))
1591 .addImm(0); // op_sel
1592 } else {
1593 ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
1594 .add(I.getOperand(2))
1595 .add(I.getOperand(3));
1596 }
1597
1598 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1599 *TRI.getBoolRC(), *MRI);
1600 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1601 I.eraseFromParent();
1602 return true;
1603}
1604
1605bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1606 Register Dst = I.getOperand(0).getReg();
1607 if (isVCC(Dst, *MRI))
1608 return false;
1609
1610 LLT DstTy = MRI->getType(Dst);
1611 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1612 return false;
1613
1614 MachineBasicBlock *BB = I.getParent();
1615 const DebugLoc &DL = I.getDebugLoc();
1616 Register SrcReg = I.getOperand(2).getReg();
1617 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1618
1619 // i1 inputs are not supported in GlobalISel.
1620 if (Size == 1)
1621 return false;
1622
1623 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1624 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1625 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1626 I.eraseFromParent();
1627 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1628 }
1629
1630 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1631 if (Opcode == -1)
1632 return false;
1633
1634 MachineInstrBuilder SelectedMI;
1635 MachineOperand &LHS = I.getOperand(2);
1636 MachineOperand &RHS = I.getOperand(3);
1637 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1638 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1639 Register Src0Reg =
1640 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1641 Register Src1Reg =
1642 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1643 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1644 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1645 SelectedMI.addImm(Src0Mods);
1646 SelectedMI.addReg(Src0Reg);
1647 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1648 SelectedMI.addImm(Src1Mods);
1649 SelectedMI.addReg(Src1Reg);
1650 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1651 SelectedMI.addImm(0); // clamp
1652 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1653 SelectedMI.addImm(0); // op_sel
1654
1655 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1656 constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI);
1657
1658 I.eraseFromParent();
1659 return true;
1660}
1661
1662// Ballot has to zero bits in input lane-mask that are zero in current exec,
1663// Done as AND with exec. For inputs that are results of instruction that
1664// implicitly use same exec, for example compares in same basic block or SCC to
1665// VCC copy, use copy.
1668 MachineInstr *MI = MRI.getVRegDef(Reg);
1669 if (MI->getParent() != MBB)
1670 return false;
1671
1672 // Lane mask generated by SCC to VCC copy.
1673 if (MI->getOpcode() == AMDGPU::COPY) {
1674 auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1675 auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1676 if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1677 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1678 return true;
1679 }
1680
1681 // Lane mask generated using compare with same exec.
1682 if (isa<GAnyCmp>(MI))
1683 return true;
1684
1685 Register LHS, RHS;
1686 // Look through AND.
1687 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS))))
1688 return isLaneMaskFromSameBlock(LHS, MRI, MBB) ||
1690
1691 return false;
1692}
1693
1694bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1695 MachineBasicBlock *BB = I.getParent();
1696 const DebugLoc &DL = I.getDebugLoc();
1697 Register DstReg = I.getOperand(0).getReg();
1698 Register SrcReg = I.getOperand(2).getReg();
1699 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits();
1700 const unsigned WaveSize = STI.getWavefrontSize();
1701
1702 // In the common case, the return type matches the wave size.
1703 // However we also support emitting i64 ballots in wave32 mode.
1704 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1705 return false;
1706
1707 std::optional<ValueAndVReg> Arg =
1709
1710 Register Dst = DstReg;
1711 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1712 if (BallotSize != WaveSize) {
1713 Dst = MRI->createVirtualRegister(TRI.getBoolRC());
1714 }
1715
1716 if (Arg) {
1717 const int64_t Value = Arg->Value.getZExtValue();
1718 if (Value == 0) {
1719 // Dst = S_MOV 0
1720 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1721 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0);
1722 } else {
1723 // Dst = COPY EXEC
1724 assert(Value == 1);
1725 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1726 }
1727 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1728 return false;
1729 } else {
1730 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1731 // Dst = COPY SrcReg
1732 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1733 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1734 return false;
1735 } else {
1736 // Dst = S_AND SrcReg, EXEC
1737 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1738 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst)
1739 .addReg(SrcReg)
1740 .addReg(TRI.getExec())
1741 .setOperandDead(3); // Dead scc
1742 constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
1743 }
1744 }
1745
1746 // i64 ballot on Wave32: zero-extend i32 ballot to i64.
1747 if (BallotSize != WaveSize) {
1748 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1749 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1750 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1751 .addReg(Dst)
1752 .addImm(AMDGPU::sub0)
1753 .addReg(HiReg)
1754 .addImm(AMDGPU::sub1);
1755 }
1756
1757 I.eraseFromParent();
1758 return true;
1759}
1760
1761bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1762 Register DstReg = I.getOperand(0).getReg();
1763 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1764 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1765 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1766 return false;
1767
1768 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1769
1770 Module *M = MF->getFunction().getParent();
1771 const MDNode *Metadata = I.getOperand(2).getMetadata();
1772 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1773 auto *RelocSymbol = cast<GlobalVariable>(
1774 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1775
1776 MachineBasicBlock *BB = I.getParent();
1777 BuildMI(*BB, &I, I.getDebugLoc(),
1778 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1780
1781 I.eraseFromParent();
1782 return true;
1783}
1784
1785bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1786 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1787
1788 Register DstReg = I.getOperand(0).getReg();
1789 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1790 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1791 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1792
1793 MachineBasicBlock *MBB = I.getParent();
1794 const DebugLoc &DL = I.getDebugLoc();
1795
1796 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1797
1798 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1799 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1800 MIB.addImm(MFI->getLDSSize());
1801 } else {
1802 Module *M = MF->getFunction().getParent();
1803 const GlobalValue *GV =
1804 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1806 }
1807
1808 I.eraseFromParent();
1809 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1810 return true;
1811}
1812
1813bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1814 MachineBasicBlock *MBB = I.getParent();
1815 MachineFunction &MF = *MBB->getParent();
1816 const DebugLoc &DL = I.getDebugLoc();
1817
1818 MachineOperand &Dst = I.getOperand(0);
1819 Register DstReg = Dst.getReg();
1820 unsigned Depth = I.getOperand(2).getImm();
1821
1822 const TargetRegisterClass *RC
1823 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1824 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1825 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1826 return false;
1827
1828 // Check for kernel and shader functions
1829 if (Depth != 0 ||
1830 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1831 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1832 .addImm(0);
1833 I.eraseFromParent();
1834 return true;
1835 }
1836
1837 MachineFrameInfo &MFI = MF.getFrameInfo();
1838 // There is a call to @llvm.returnaddress in this function
1839 MFI.setReturnAddressIsTaken(true);
1840
1841 // Get the return address reg and mark it as an implicit live-in
1842 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1843 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1844 AMDGPU::SReg_64RegClass, DL);
1845 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1846 .addReg(LiveIn);
1847 I.eraseFromParent();
1848 return true;
1849}
1850
1851bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1852 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1853 // SelectionDAG uses for wave32 vs wave64.
1854 MachineBasicBlock *BB = MI.getParent();
1855 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1856 .add(MI.getOperand(1));
1857
1858 Register Reg = MI.getOperand(1).getReg();
1859 MI.eraseFromParent();
1860
1861 if (!MRI->getRegClassOrNull(Reg))
1862 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1863 return true;
1864}
1865
1866bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1867 MachineInstr &MI, Intrinsic::ID IntrID) const {
1868 MachineBasicBlock *MBB = MI.getParent();
1869 MachineFunction *MF = MBB->getParent();
1870 const DebugLoc &DL = MI.getDebugLoc();
1871
1872 unsigned IndexOperand = MI.getOperand(7).getImm();
1873 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1874 bool WaveDone = MI.getOperand(9).getImm() != 0;
1875
1876 if (WaveDone && !WaveRelease) {
1877 // TODO: Move this to IR verifier
1878 const Function &Fn = MF->getFunction();
1879 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1880 Fn, "ds_ordered_count: wave_done requires wave_release", DL));
1881 }
1882
1883 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1884 IndexOperand &= ~0x3f;
1885 unsigned CountDw = 0;
1886
1887 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1888 CountDw = (IndexOperand >> 24) & 0xf;
1889 IndexOperand &= ~(0xf << 24);
1890
1891 if (CountDw < 1 || CountDw > 4) {
1892 const Function &Fn = MF->getFunction();
1893 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1894 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL));
1895 CountDw = 1;
1896 }
1897 }
1898
1899 if (IndexOperand) {
1900 const Function &Fn = MF->getFunction();
1901 Fn.getContext().diagnose(DiagnosticInfoUnsupported(
1902 Fn, "ds_ordered_count: bad index operand", DL));
1903 }
1904
1905 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1906 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1907
1908 unsigned Offset0 = OrderedCountIndex << 2;
1909 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1910
1911 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1912 Offset1 |= (CountDw - 1) << 6;
1913
1914 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1915 Offset1 |= ShaderType << 2;
1916
1917 unsigned Offset = Offset0 | (Offset1 << 8);
1918
1919 Register M0Val = MI.getOperand(2).getReg();
1920 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1921 .addReg(M0Val);
1922
1923 Register DstReg = MI.getOperand(0).getReg();
1924 Register ValReg = MI.getOperand(3).getReg();
1925 MachineInstrBuilder DS =
1926 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1927 .addReg(ValReg)
1928 .addImm(Offset)
1929 .cloneMemRefs(MI);
1930
1931 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1932 return false;
1933
1934 constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1935 MI.eraseFromParent();
1936 return true;
1937}
1938
1939static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1940 switch (IntrID) {
1941 case Intrinsic::amdgcn_ds_gws_init:
1942 return AMDGPU::DS_GWS_INIT;
1943 case Intrinsic::amdgcn_ds_gws_barrier:
1944 return AMDGPU::DS_GWS_BARRIER;
1945 case Intrinsic::amdgcn_ds_gws_sema_v:
1946 return AMDGPU::DS_GWS_SEMA_V;
1947 case Intrinsic::amdgcn_ds_gws_sema_br:
1948 return AMDGPU::DS_GWS_SEMA_BR;
1949 case Intrinsic::amdgcn_ds_gws_sema_p:
1950 return AMDGPU::DS_GWS_SEMA_P;
1951 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1952 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1953 default:
1954 llvm_unreachable("not a gws intrinsic");
1955 }
1956}
1957
1958bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1959 Intrinsic::ID IID) const {
1960 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1961 !STI.hasGWSSemaReleaseAll()))
1962 return false;
1963
1964 // intrinsic ID, vsrc, offset
1965 const bool HasVSrc = MI.getNumOperands() == 3;
1966 assert(HasVSrc || MI.getNumOperands() == 2);
1967
1968 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1969 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1970 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1971 return false;
1972
1973 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1974 unsigned ImmOffset;
1975
1976 MachineBasicBlock *MBB = MI.getParent();
1977 const DebugLoc &DL = MI.getDebugLoc();
1978
1979 MachineInstr *Readfirstlane = nullptr;
1980
1981 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1982 // incoming offset, in case there's an add of a constant. We'll have to put it
1983 // back later.
1984 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1985 Readfirstlane = OffsetDef;
1986 BaseOffset = OffsetDef->getOperand(1).getReg();
1987 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1988 }
1989
1990 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1991 // If we have a constant offset, try to use the 0 in m0 as the base.
1992 // TODO: Look into changing the default m0 initialization value. If the
1993 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1994 // the immediate offset.
1995
1996 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1997 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1998 .addImm(0);
1999 } else {
2000 std::tie(BaseOffset, ImmOffset) =
2001 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT);
2002
2003 if (Readfirstlane) {
2004 // We have the constant offset now, so put the readfirstlane back on the
2005 // variable component.
2006 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
2007 return false;
2008
2009 Readfirstlane->getOperand(1).setReg(BaseOffset);
2010 BaseOffset = Readfirstlane->getOperand(0).getReg();
2011 } else {
2012 if (!RBI.constrainGenericRegister(BaseOffset,
2013 AMDGPU::SReg_32RegClass, *MRI))
2014 return false;
2015 }
2016
2017 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2018 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
2019 .addReg(BaseOffset)
2020 .addImm(16)
2021 .setOperandDead(3); // Dead scc
2022
2023 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2024 .addReg(M0Base);
2025 }
2026
2027 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2028 // offset field) % 64. Some versions of the programming guide omit the m0
2029 // part, or claim it's from offset 0.
2030
2031 unsigned Opc = gwsIntrinToOpcode(IID);
2032 const MCInstrDesc &InstrDesc = TII.get(Opc);
2033
2034 if (HasVSrc) {
2035 Register VSrc = MI.getOperand(1).getReg();
2036
2037 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
2038 const TargetRegisterClass *DataRC = TII.getRegClass(InstrDesc, Data0Idx);
2039 const TargetRegisterClass *SubRC =
2040 TRI.getSubRegisterClass(DataRC, AMDGPU::sub0);
2041
2042 if (!SubRC) {
2043 // 32-bit normal case.
2044 if (!RBI.constrainGenericRegister(VSrc, *DataRC, *MRI))
2045 return false;
2046
2047 BuildMI(*MBB, &MI, DL, InstrDesc)
2048 .addReg(VSrc)
2049 .addImm(ImmOffset)
2050 .cloneMemRefs(MI);
2051 } else {
2052 // Requires even register alignment, so create 64-bit value and pad the
2053 // top half with undef.
2054 Register DataReg = MRI->createVirtualRegister(DataRC);
2055 if (!RBI.constrainGenericRegister(VSrc, *SubRC, *MRI))
2056 return false;
2057
2058 Register UndefReg = MRI->createVirtualRegister(SubRC);
2059 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2060 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), DataReg)
2061 .addReg(VSrc)
2062 .addImm(AMDGPU::sub0)
2063 .addReg(UndefReg)
2064 .addImm(AMDGPU::sub1);
2065
2066 BuildMI(*MBB, &MI, DL, InstrDesc)
2067 .addReg(DataReg)
2068 .addImm(ImmOffset)
2069 .cloneMemRefs(MI);
2070 }
2071 } else {
2072 BuildMI(*MBB, &MI, DL, InstrDesc)
2073 .addImm(ImmOffset)
2074 .cloneMemRefs(MI);
2075 }
2076
2077 MI.eraseFromParent();
2078 return true;
2079}
2080
2081bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
2082 bool IsAppend) const {
2083 Register PtrBase = MI.getOperand(2).getReg();
2084 LLT PtrTy = MRI->getType(PtrBase);
2085 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2086
2087 unsigned Offset;
2088 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
2089
2090 // TODO: Should this try to look through readfirstlane like GWS?
2091 if (!isDSOffsetLegal(PtrBase, Offset)) {
2092 PtrBase = MI.getOperand(2).getReg();
2093 Offset = 0;
2094 }
2095
2096 MachineBasicBlock *MBB = MI.getParent();
2097 const DebugLoc &DL = MI.getDebugLoc();
2098 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2099
2100 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2101 .addReg(PtrBase);
2102 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
2103 return false;
2104
2105 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
2106 .addImm(Offset)
2107 .addImm(IsGDS ? -1 : 0)
2108 .cloneMemRefs(MI);
2109 MI.eraseFromParent();
2110 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2111 return true;
2112}
2113
2114bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
2115 MachineFunction *MF = MI.getMF();
2116 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
2117
2118 MFInfo->setInitWholeWave();
2119 return selectImpl(MI, *CoverageInfo);
2120}
2121
/// Decode a texture-fail control immediate.
///
/// Bit 0 of \p TexFailCtrl is written to \p TFE and bit 1 to \p LWE.
/// \p IsTexFail is set to true when \p TexFailCtrl is non-zero and is left
/// untouched otherwise, so the caller must initialize it.
///
/// \returns true if no bit other than bits 0 and 1 is set.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  constexpr uint64_t TFEBit = 0x1;
  constexpr uint64_t LWEBit = 0x2;

  if (TexFailCtrl != 0)
    IsTexFail = true;

  TFE = (TexFailCtrl & TFEBit) != 0;
  LWE = (TexFailCtrl & LWEBit) != 0;

  // Any bit beyond the two known ones makes the control value invalid.
  return (TexFailCtrl & ~(TFEBit | LWEBit)) == 0;
}
2134
2135bool AMDGPUInstructionSelector::selectImageIntrinsic(
2136 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
2137 MachineBasicBlock *MBB = MI.getParent();
2138 const DebugLoc &DL = MI.getDebugLoc();
2139 unsigned IntrOpcode = Intr->BaseOpcode;
2140
2141 // For image atomic: use no-return opcode if result is unused.
2142 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode) {
2143 Register ResultDef = MI.getOperand(0).getReg();
2144 if (MRI->use_nodbg_empty(ResultDef))
2145 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
2146 }
2147
2148 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2150
2151 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
2152 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
2153 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
2154 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
2155 const bool IsGFX13Plus = AMDGPU::isGFX13Plus(STI);
2156
2157 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
2158
2159 Register VDataIn = AMDGPU::NoRegister;
2160 Register VDataOut = AMDGPU::NoRegister;
2161 LLT VDataTy;
2162 int NumVDataDwords = -1;
2163 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2164 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2165
2166 bool Unorm;
2167 if (!BaseOpcode->Sampler)
2168 Unorm = true;
2169 else
2170 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
2171
2172 bool TFE;
2173 bool LWE;
2174 bool IsTexFail = false;
2175 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
2176 TFE, LWE, IsTexFail))
2177 return false;
2178
2179 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
2180 const bool IsA16 = (Flags & 1) != 0;
2181 const bool IsG16 = (Flags & 2) != 0;
2182
2183 // A16 implies 16 bit gradients if subtarget doesn't support G16
2184 if (IsA16 && !STI.hasG16() && !IsG16)
2185 return false;
2186
2187 unsigned DMask = 0;
2188 unsigned DMaskLanes = 0;
2189
2190 if (BaseOpcode->Atomic) {
2191 if (!BaseOpcode->NoReturn)
2192 VDataOut = MI.getOperand(0).getReg();
2193 VDataIn = MI.getOperand(2).getReg();
2194 LLT Ty = MRI->getType(VDataIn);
2195
2196 // Be careful to allow atomic swap on 16-bit element vectors.
2197 const bool Is64Bit = BaseOpcode->AtomicX2 ?
2198 Ty.getSizeInBits() == 128 :
2199 Ty.getSizeInBits() == 64;
2200
2201 if (BaseOpcode->AtomicX2) {
2202 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2203
2204 DMask = Is64Bit ? 0xf : 0x3;
2205 NumVDataDwords = Is64Bit ? 4 : 2;
2206 } else {
2207 DMask = Is64Bit ? 0x3 : 0x1;
2208 NumVDataDwords = Is64Bit ? 2 : 1;
2209 }
2210 } else {
2211 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
2212 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
2213
2214 if (BaseOpcode->Store) {
2215 VDataIn = MI.getOperand(1).getReg();
2216 VDataTy = MRI->getType(VDataIn);
2217 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
2218 } else if (BaseOpcode->NoReturn) {
2219 NumVDataDwords = 0;
2220 } else {
2221 VDataOut = MI.getOperand(0).getReg();
2222 VDataTy = MRI->getType(VDataOut);
2223 NumVDataDwords = DMaskLanes;
2224
2225 if (IsD16 && !STI.hasUnpackedD16VMem())
2226 NumVDataDwords = (DMaskLanes + 1) / 2;
2227 }
2228 }
2229
2230 // Set G16 opcode
2231 if (Subtarget->hasG16() && IsG16) {
2232 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
2234 assert(G16MappingInfo);
2235 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
2236 }
2237
2238 // TODO: Check this in verifier.
2239 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
2240
2241 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
2242 // Keep GLC only when the atomic's result is actually used.
2243 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
2245 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
2247 return false;
2248
2249 int NumVAddrRegs = 0;
2250 int NumVAddrDwords = 0;
2251 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
2252 // Skip the $noregs and 0s inserted during legalization.
2253 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
2254 if (!AddrOp.isReg())
2255 continue; // XXX - Break?
2256
2257 Register Addr = AddrOp.getReg();
2258 if (!Addr)
2259 break;
2260
2261 ++NumVAddrRegs;
2262 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
2263 }
2264
2265 // The legalizer preprocessed the intrinsic arguments. If we aren't using
2266 // NSA, these should have been packed into a single value in the first
2267 // address register
2268 const bool UseNSA =
2269 NumVAddrRegs != 1 &&
2270 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
2271 : NumVAddrDwords == NumVAddrRegs);
2272 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2273 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
2274 return false;
2275 }
2276
2277 if (IsTexFail)
2278 ++NumVDataDwords;
2279
2280 int Opcode = -1;
2281 if (IsGFX13Plus) {
2282 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
2283 NumVDataDwords, NumVAddrDwords);
2284 } else if (IsGFX12Plus) {
2285 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
2286 NumVDataDwords, NumVAddrDwords);
2287 } else if (IsGFX11Plus) {
2288 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2289 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2290 : AMDGPU::MIMGEncGfx11Default,
2291 NumVDataDwords, NumVAddrDwords);
2292 } else if (IsGFX10Plus) {
2293 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
2294 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2295 : AMDGPU::MIMGEncGfx10Default,
2296 NumVDataDwords, NumVAddrDwords);
2297 } else {
2298 if (Subtarget->hasGFX90AInsts()) {
2299 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
2300 NumVDataDwords, NumVAddrDwords);
2301 if (Opcode == -1) {
2302 LLVM_DEBUG(
2303 dbgs()
2304 << "requested image instruction is not supported on this GPU\n");
2305 return false;
2306 }
2307 }
2308 if (Opcode == -1 &&
2309 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2310 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
2311 NumVDataDwords, NumVAddrDwords);
2312 if (Opcode == -1)
2313 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
2314 NumVDataDwords, NumVAddrDwords);
2315 }
2316 if (Opcode == -1)
2317 return false;
2318
2319 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
2320 .cloneMemRefs(MI);
2321
2322 if (VDataOut) {
2323 if (BaseOpcode->AtomicX2) {
2324 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
2325
2326 Register TmpReg = MRI->createVirtualRegister(
2327 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2328 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2329
2330 MIB.addDef(TmpReg);
2331 if (!MRI->use_empty(VDataOut)) {
2332 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
2333 .addReg(TmpReg, RegState::Kill, SubReg);
2334 }
2335
2336 } else {
2337 MIB.addDef(VDataOut); // vdata output
2338 }
2339 }
2340
2341 if (VDataIn)
2342 MIB.addReg(VDataIn); // vdata input
2343
2344 for (int I = 0; I != NumVAddrRegs; ++I) {
2345 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2346 if (SrcOp.isReg()) {
2347 assert(SrcOp.getReg() != 0);
2348 MIB.addReg(SrcOp.getReg());
2349 }
2350 }
2351
2352 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2353 if (BaseOpcode->Sampler)
2354 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2355
2356 MIB.addImm(DMask); // dmask
2357
2358 if (IsGFX10Plus)
2359 MIB.addImm(DimInfo->Encoding);
2360 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2361 MIB.addImm(Unorm);
2362
2363 MIB.addImm(CPol);
2364 MIB.addImm(IsA16 && // a16 or r128
2365 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2366 if (IsGFX10Plus)
2367 MIB.addImm(IsA16 ? -1 : 0);
2368
2369 if (!Subtarget->hasGFX90AInsts()) {
2370 MIB.addImm(TFE); // tfe
2371 } else if (TFE) {
2372 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2373 return false;
2374 }
2375
2376 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2377 MIB.addImm(LWE); // lwe
2378 if (!IsGFX10Plus)
2379 MIB.addImm(DimInfo->DA ? -1 : 0);
2380 if (BaseOpcode->HasD16)
2381 MIB.addImm(IsD16 ? -1 : 0);
2382
2383 MI.eraseFromParent();
2384 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2385 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2386 return true;
2387}
2388
2389// We need to handle this here because tablegen doesn't support matching
2390// instructions with multiple outputs.
2391bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2392 MachineInstr &MI) const {
2393 Register Dst0 = MI.getOperand(0).getReg();
2394 Register Dst1 = MI.getOperand(1).getReg();
2395
2396 const DebugLoc &DL = MI.getDebugLoc();
2397 MachineBasicBlock *MBB = MI.getParent();
2398
2399 Register Addr = MI.getOperand(3).getReg();
2400 Register Data0 = MI.getOperand(4).getReg();
2401 Register Data1 = MI.getOperand(5).getReg();
2402 unsigned Offset = MI.getOperand(6).getImm();
2403
2404 unsigned Opc;
2405 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2406 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2407 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2408 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2409 break;
2410 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2411 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2412 break;
2413 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2414 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2415 break;
2416 }
2417
2418 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2419 .addDef(Dst1)
2420 .addUse(Addr)
2421 .addUse(Data0)
2422 .addUse(Data1)
2423 .addImm(Offset)
2424 .cloneMemRefs(MI);
2425
2426 MI.eraseFromParent();
2427 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2428 return true;
2429}
2430
2431bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2432 MachineInstr &I) const {
2433 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2434 switch (IntrinsicID) {
2435 case Intrinsic::amdgcn_end_cf:
2436 return selectEndCfIntrinsic(I);
2437 case Intrinsic::amdgcn_ds_ordered_add:
2438 case Intrinsic::amdgcn_ds_ordered_swap:
2439 return selectDSOrderedIntrinsic(I, IntrinsicID);
2440 case Intrinsic::amdgcn_ds_gws_init:
2441 case Intrinsic::amdgcn_ds_gws_barrier:
2442 case Intrinsic::amdgcn_ds_gws_sema_v:
2443 case Intrinsic::amdgcn_ds_gws_sema_br:
2444 case Intrinsic::amdgcn_ds_gws_sema_p:
2445 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2446 return selectDSGWSIntrinsic(I, IntrinsicID);
2447 case Intrinsic::amdgcn_ds_append:
2448 return selectDSAppendConsume(I, true);
2449 case Intrinsic::amdgcn_ds_consume:
2450 return selectDSAppendConsume(I, false);
2451 case Intrinsic::amdgcn_init_whole_wave:
2452 return selectInitWholeWave(I);
2453 case Intrinsic::amdgcn_raw_buffer_load_lds:
2454 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
2455 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2456 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
2457 case Intrinsic::amdgcn_struct_buffer_load_lds:
2458 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
2459 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2460 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
2461 return selectBufferLoadLds(I);
2462 // Until we can store both the address space of the global and the LDS
2463 // arguments by having tto MachineMemOperands on an intrinsic, we just trust
2464 // that the argument is a global pointer (buffer pointers have been handled by
2465 // a LLVM IR-level lowering).
2466 case Intrinsic::amdgcn_load_to_lds:
2467 case Intrinsic::amdgcn_load_async_to_lds:
2468 case Intrinsic::amdgcn_global_load_lds:
2469 case Intrinsic::amdgcn_global_load_async_lds:
2470 return selectGlobalLoadLds(I);
2471 case Intrinsic::amdgcn_tensor_load_to_lds:
2472 case Intrinsic::amdgcn_tensor_store_from_lds:
2473 return selectTensorLoadStore(I, IntrinsicID);
2474 case Intrinsic::amdgcn_asyncmark:
2475 case Intrinsic::amdgcn_wait_asyncmark:
2476 if (!Subtarget->hasAsyncMark())
2477 return false;
2478 break;
2479 case Intrinsic::amdgcn_exp_compr:
2480 if (!STI.hasCompressedExport()) {
2482 return false;
2483 }
2484 break;
2485 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2486 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2487 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2488 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2489 return selectDSBvhStackIntrinsic(I);
2490 case Intrinsic::amdgcn_s_alloc_vgpr: {
2491 // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets
2492 // SCC. We then need to COPY it into the result vreg.
2493 MachineBasicBlock *MBB = I.getParent();
2494 const DebugLoc &DL = I.getDebugLoc();
2495
2496 Register ResReg = I.getOperand(0).getReg();
2497
2498 MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
2499 .add(I.getOperand(2));
2500 (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
2501 .addReg(AMDGPU::SCC);
2502 I.eraseFromParent();
2503 constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
2504 return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
2505 }
2506 case Intrinsic::amdgcn_s_barrier_init:
2507 case Intrinsic::amdgcn_s_barrier_signal_var:
2508 return selectNamedBarrierInit(I, IntrinsicID);
2509 case Intrinsic::amdgcn_s_wakeup_barrier: {
2510 if (!STI.hasSWakeupBarrier()) {
2512 return false;
2513 }
2514 return selectNamedBarrierInst(I, IntrinsicID);
2515 }
2516 case Intrinsic::amdgcn_s_barrier_join:
2517 case Intrinsic::amdgcn_s_get_named_barrier_state:
2518 return selectNamedBarrierInst(I, IntrinsicID);
2519 case Intrinsic::amdgcn_s_get_barrier_state:
2520 return selectSGetBarrierState(I, IntrinsicID);
2521 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2522 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2523 }
2524 return selectImpl(I, *CoverageInfo);
2525}
2526
2527bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2528 if (selectImpl(I, *CoverageInfo))
2529 return true;
2530
2531 MachineBasicBlock *BB = I.getParent();
2532 const DebugLoc &DL = I.getDebugLoc();
2533
2534 Register DstReg = I.getOperand(0).getReg();
2535 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2536 assert(Size <= 32 || Size == 64);
2537 const MachineOperand &CCOp = I.getOperand(1);
2538 Register CCReg = CCOp.getReg();
2539 if (!isVCC(CCReg, *MRI)) {
2540 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2541 AMDGPU::S_CSELECT_B32;
2542 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2543 .addReg(CCReg);
2544
2545 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2546 // bank, because it does not cover the register class that we used to represent
2547 // for it. So we need to manually set the register class here.
2548 if (!MRI->getRegClassOrNull(CCReg))
2549 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2550 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2551 .add(I.getOperand(2))
2552 .add(I.getOperand(3));
2553
2555 constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2556 I.eraseFromParent();
2557 return true;
2558 }
2559
2560 // Wide VGPR select should have been split in RegBankSelect.
2561 if (Size > 32)
2562 return false;
2563
2564 MachineInstr *Select =
2565 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2566 .addImm(0)
2567 .add(I.getOperand(3))
2568 .addImm(0)
2569 .add(I.getOperand(2))
2570 .add(I.getOperand(1));
2571
2573 I.eraseFromParent();
2574 return true;
2575}
2576
2577bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2578 Register DstReg = I.getOperand(0).getReg();
2579 Register SrcReg = I.getOperand(1).getReg();
2580 const LLT DstTy = MRI->getType(DstReg);
2581 const LLT SrcTy = MRI->getType(SrcReg);
2582 const LLT S1 = LLT::scalar(1);
2583
2584 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2585 const RegisterBank *DstRB;
2586 if (DstTy == S1) {
2587 // This is a special case. We don't treat s1 for legalization artifacts as
2588 // vcc booleans.
2589 DstRB = SrcRB;
2590 } else {
2591 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2592 if (SrcRB != DstRB)
2593 return false;
2594 }
2595
2596 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2597
2598 unsigned DstSize = DstTy.getSizeInBits();
2599 unsigned SrcSize = SrcTy.getSizeInBits();
2600
2601 const TargetRegisterClass *SrcRC =
2602 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2603 const TargetRegisterClass *DstRC =
2604 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2605 if (!SrcRC || !DstRC)
2606 return false;
2607
2608 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2609 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2610 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2611 return false;
2612 }
2613
2614 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2615 assert(STI.useRealTrue16Insts());
2616 const DebugLoc &DL = I.getDebugLoc();
2617 MachineBasicBlock *MBB = I.getParent();
2618 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
2619 .addReg(SrcReg, {}, AMDGPU::lo16);
2620 I.eraseFromParent();
2621 return true;
2622 }
2623
2624 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2625 MachineBasicBlock *MBB = I.getParent();
2626 const DebugLoc &DL = I.getDebugLoc();
2627
2628 Register LoReg = MRI->createVirtualRegister(DstRC);
2629 Register HiReg = MRI->createVirtualRegister(DstRC);
2630 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2631 .addReg(SrcReg, {}, AMDGPU::sub0);
2632 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2633 .addReg(SrcReg, {}, AMDGPU::sub1);
2634
2635 if (IsVALU && STI.hasSDWA()) {
2636 // Write the low 16-bits of the high element into the high 16-bits of the
2637 // low element.
2638 MachineInstr *MovSDWA =
2639 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2640 .addImm(0) // $src0_modifiers
2641 .addReg(HiReg) // $src0
2642 .addImm(0) // $clamp
2643 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2644 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2645 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2646 .addReg(LoReg, RegState::Implicit);
2647 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2648 } else {
2649 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2650 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2651 Register ImmReg = MRI->createVirtualRegister(DstRC);
2652 if (IsVALU) {
2653 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2654 .addImm(16)
2655 .addReg(HiReg);
2656 } else {
2657 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2658 .addReg(HiReg)
2659 .addImm(16)
2660 .setOperandDead(3); // Dead scc
2661 }
2662
2663 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2664 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2665 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2666
2667 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2668 .addImm(0xffff);
2669 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2670 .addReg(LoReg)
2671 .addReg(ImmReg);
2672 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2673 .addReg(TmpReg0)
2674 .addReg(TmpReg1);
2675
2676 if (!IsVALU) {
2677 And.setOperandDead(3); // Dead scc
2678 Or.setOperandDead(3); // Dead scc
2679 }
2680 }
2681
2682 I.eraseFromParent();
2683 return true;
2684 }
2685
2686 if (!DstTy.isScalar())
2687 return false;
2688
2689 if (SrcSize > 32) {
2690 unsigned SubRegIdx = DstSize < 32
2691 ? static_cast<unsigned>(AMDGPU::sub0)
2692 : TRI.getSubRegFromChannel(0, DstSize / 32);
2693 if (SubRegIdx == AMDGPU::NoSubRegister)
2694 return false;
2695
2696 // Deal with weird cases where the class only partially supports the subreg
2697 // index.
2698 const TargetRegisterClass *SrcWithSubRC
2699 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2700 if (!SrcWithSubRC)
2701 return false;
2702
2703 if (SrcWithSubRC != SrcRC) {
2704 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2705 return false;
2706 }
2707
2708 I.getOperand(1).setSubReg(SubRegIdx);
2709 }
2710
2711 I.setDesc(TII.get(TargetOpcode::COPY));
2712 return true;
2713}
2714
2715/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2716static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2718 int SignedMask = static_cast<int>(Mask);
2719 return SignedMask >= -16 && SignedMask <= 64;
2720}
2721
2722// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2723const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2724 Register Reg, const MachineRegisterInfo &MRI,
2725 const TargetRegisterInfo &TRI) const {
2726 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2727 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2728 return RB;
2729
2730 // Ignore the type, since we don't use vcc in artifacts.
2731 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2732 return &RBI.getRegBankFromRegClass(*RC, LLT());
2733 return nullptr;
2734}
2735
2736bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2737 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2738 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2739 const DebugLoc &DL = I.getDebugLoc();
2740 MachineBasicBlock &MBB = *I.getParent();
2741 const Register DstReg = I.getOperand(0).getReg();
2742 const Register SrcReg = I.getOperand(1).getReg();
2743
2744 const LLT DstTy = MRI->getType(DstReg);
2745 const LLT SrcTy = MRI->getType(SrcReg);
2746 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2747 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2748 const unsigned DstSize = DstTy.getSizeInBits();
2749 if (!DstTy.isScalar())
2750 return false;
2751
2752 // Artifact casts should never use vcc.
2753 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2754
2755 // FIXME: This should probably be illegal and split earlier.
2756 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2757 if (DstSize <= 32)
2758 return selectCOPY(I);
2759
2760 const TargetRegisterClass *SrcRC =
2761 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2762 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2763 const TargetRegisterClass *DstRC =
2764 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2765
2766 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2767 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2768 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2769 .addReg(SrcReg)
2770 .addImm(AMDGPU::sub0)
2771 .addReg(UndefReg)
2772 .addImm(AMDGPU::sub1);
2773 I.eraseFromParent();
2774
2775 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2776 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2777 }
2778
2779 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2780 // 64-bit should have been split up in RegBankSelect
2781
2782 // Try to use an and with a mask if it will save code size.
2783 unsigned Mask;
2784 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2785 MachineInstr *ExtI =
2786 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2787 .addImm(Mask)
2788 .addReg(SrcReg);
2789 I.eraseFromParent();
2790 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2791 return true;
2792 }
2793
2794 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2795 MachineInstr *ExtI =
2796 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2797 .addReg(SrcReg)
2798 .addImm(0) // Offset
2799 .addImm(SrcSize); // Width
2800 I.eraseFromParent();
2801 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2802 return true;
2803 }
2804
2805 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2806 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2807 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2808 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2809 return false;
2810
2811 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2812 const unsigned SextOpc = SrcSize == 8 ?
2813 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2814 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2815 .addReg(SrcReg);
2816 I.eraseFromParent();
2817 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2818 }
2819
2820 // Using a single 32-bit SALU to calculate the high half is smaller than
2821 // S_BFE with a literal constant operand.
2822 if (DstSize > 32 && SrcSize == 32) {
2823 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2824 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2825 if (Signed) {
2826 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2827 .addReg(SrcReg, {}, SubReg)
2828 .addImm(31)
2829 .setOperandDead(3); // Dead scc
2830 } else {
2831 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2832 .addImm(0);
2833 }
2834 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2835 .addReg(SrcReg, {}, SubReg)
2836 .addImm(AMDGPU::sub0)
2837 .addReg(HiReg)
2838 .addImm(AMDGPU::sub1);
2839 I.eraseFromParent();
2840 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2841 *MRI);
2842 }
2843
2844 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2845 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2846
2847 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2848 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2849 // We need a 64-bit register source, but the high bits don't matter.
2850 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2851 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2852 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2853
2854 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2855 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2856 .addReg(SrcReg, {}, SubReg)
2857 .addImm(AMDGPU::sub0)
2858 .addReg(UndefReg)
2859 .addImm(AMDGPU::sub1);
2860
2861 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2862 .addReg(ExtReg)
2863 .addImm(SrcSize << 16);
2864
2865 I.eraseFromParent();
2866 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2867 }
2868
2869 unsigned Mask;
2870 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2871 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2872 .addReg(SrcReg)
2873 .addImm(Mask)
2874 .setOperandDead(3); // Dead scc
2875 } else {
2876 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2877 .addReg(SrcReg)
2878 .addImm(SrcSize << 16);
2879 }
2880
2881 I.eraseFromParent();
2882 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2883 }
2884
2885 return false;
2886}
2887
2891
2893 Register BitcastSrc;
2894 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
2895 Reg = BitcastSrc;
2896 return Reg;
2897}
2898
2900 Register &Out) {
2901 // When unmerging a register that is composed of 2 x 16-bit values allow to
2902 // use an extract hi instruction for the upper 16 bits. We only need to check
2903 // the size of `In` as all defs are guaranteed to be the same type for
2904 // GUnmerge.
2905 if (auto *Unmerge = dyn_cast<GUnmerge>(MRI.getVRegDef(In))) {
2906 if (Unmerge->getNumDefs() == 2 && Unmerge->getOperand(1).getReg() == In &&
2907 MRI.getType(In).getSizeInBits() == 16) {
2908 Out = Unmerge->getSourceReg();
2909 return true;
2910 }
2911 }
2912
2913 Register Trunc;
2914 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
2915 return false;
2916
2917 Register LShlSrc;
2918 Register Cst;
2919 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
2920 Cst = stripCopy(Cst, MRI);
2921 if (mi_match(Cst, MRI, m_SpecificICst(16))) {
2922 Out = stripBitCast(LShlSrc, MRI);
2923 return true;
2924 }
2925 }
2926
2927 MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
2928 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2929 return false;
2930
2931 assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
2932 LLT::fixed_vector(2, 16));
2933
2934 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
2935 assert(Mask.size() == 2);
2936
2937 if (Mask[0] == 1 && Mask[1] <= 1) {
2938 Out = Shuffle->getOperand(0).getReg();
2939 return true;
2940 }
2941
2942 return false;
2943}
2944
2945bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2946 if (!Subtarget->hasSALUFloatInsts())
2947 return false;
2948
2949 Register Dst = I.getOperand(0).getReg();
2950 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2951 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2952 return false;
2953
2954 Register Src = I.getOperand(1).getReg();
2955
2956 if (MRI->getType(Dst) == LLT::scalar(32) &&
2957 MRI->getType(Src) == LLT::scalar(16)) {
2958 if (isExtractHiElt(*MRI, Src, Src)) {
2959 MachineBasicBlock *BB = I.getParent();
2960 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2961 .addUse(Src);
2962 I.eraseFromParent();
2963 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2964 }
2965 }
2966
2967 return false;
2968}
2969
2970bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2971 // Only manually handle the f64 SGPR case.
2972 //
2973 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2974 // the bit ops theoretically have a second result due to the implicit def of
2975 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2976 // that is easy by disabling the check. The result works, but uses a
2977 // nonsensical sreg32orlds_and_sreg_1 regclass.
2978 //
2979 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2980 // the variadic REG_SEQUENCE operands.
2981
2982 Register Dst = MI.getOperand(0).getReg();
2983 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2984 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2985 MRI->getType(Dst) != LLT::scalar(64))
2986 return false;
2987
2988 Register Src = MI.getOperand(1).getReg();
2989 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2990 if (Fabs)
2991 Src = Fabs->getOperand(1).getReg();
2992
2993 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2994 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2995 return false;
2996
2997 MachineBasicBlock *BB = MI.getParent();
2998 const DebugLoc &DL = MI.getDebugLoc();
2999 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3000 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3001 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3002 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3003
3004 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3005 .addReg(Src, {}, AMDGPU::sub0);
3006 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3007 .addReg(Src, {}, AMDGPU::sub1);
3008 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3009 .addImm(0x80000000);
3010
3011 // Set or toggle sign bit.
3012 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
3013 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
3014 .addReg(HiReg)
3015 .addReg(ConstReg)
3016 .setOperandDead(3); // Dead scc
3017 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3018 .addReg(LoReg)
3019 .addImm(AMDGPU::sub0)
3020 .addReg(OpReg)
3021 .addImm(AMDGPU::sub1);
3022 MI.eraseFromParent();
3023 return true;
3024}
3025
3026// FIXME: This is a workaround for the same tablegen problems as G_FNEG
3027bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
3028 Register Dst = MI.getOperand(0).getReg();
3029 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
3030 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
3031 MRI->getType(Dst) != LLT::scalar(64))
3032 return false;
3033
3034 Register Src = MI.getOperand(1).getReg();
3035 MachineBasicBlock *BB = MI.getParent();
3036 const DebugLoc &DL = MI.getDebugLoc();
3037 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3038 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3039 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3040 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3041
3042 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
3043 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
3044 return false;
3045
3046 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
3047 .addReg(Src, {}, AMDGPU::sub0);
3048 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
3049 .addReg(Src, {}, AMDGPU::sub1);
3050 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
3051 .addImm(0x7fffffff);
3052
3053 // Clear sign bit.
3054 // TODO: Should this used S_BITSET0_*?
3055 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
3056 .addReg(HiReg)
3057 .addReg(ConstReg)
3058 .setOperandDead(3); // Dead scc
3059 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
3060 .addReg(LoReg)
3061 .addImm(AMDGPU::sub0)
3062 .addReg(OpReg)
3063 .addImm(AMDGPU::sub1);
3064
3065 MI.eraseFromParent();
3066 return true;
3067}
3068
3069static bool isConstant(const MachineInstr &MI) {
3070 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
3071}
3072
3073void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
3074 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
3075
3076 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
3077 const MachineInstr *PtrMI =
3078 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
3079
3080 assert(PtrMI);
3081
3082 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
3083 return;
3084
3085 GEPInfo GEPInfo;
3086
3087 for (unsigned i = 1; i != 3; ++i) {
3088 const MachineOperand &GEPOp = PtrMI->getOperand(i);
3089 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
3090 assert(OpDef);
3091 if (i == 2 && isConstant(*OpDef)) {
3092 // TODO: Could handle constant base + variable offset, but a combine
3093 // probably should have commuted it.
3094 assert(GEPInfo.Imm == 0);
3095 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
3096 continue;
3097 }
3098 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
3099 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
3100 GEPInfo.SgprParts.push_back(GEPOp.getReg());
3101 else
3102 GEPInfo.VgprParts.push_back(GEPOp.getReg());
3103 }
3104
3105 AddrInfo.push_back(GEPInfo);
3106 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
3107}
3108
3109bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
3110 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
3111}
3112
// Decide whether the memory access performed by MI may be treated as uniform.
// Instructions that do not have exactly one memory operand are never uniform.
// The remaining checks look at the IR pointer attached to that memory operand.
//
// NOTE(review): this listing is missing original lines 3124 and 3127, which
// held the `if` conditions for the two early `return true` statements below.
// As shown, the first `return true` would be unconditional. Restore both
// conditions from the upstream file before relying on this text.
3113bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
3114 if (!MI.hasOneMemOperand())
3115 return false;
3116
3117 const MachineMemOperand *MMO = *MI.memoperands_begin();
3118 const Value *Ptr = MMO->getValue();
3119
3120 // UndefValue means this is a load of a kernel input. These are uniform.
3121 // Sometimes LDS instructions have constant pointers.
3122 // If Ptr is null, then that means this mem operand contains a
3123 // PseudoSourceValue like GOT.
// NOTE(review): guarding condition (original line 3124) missing here; going by
// the comment above it presumably tests Ptr for null/undef/constant - confirm.
3125 return true;
3126
// NOTE(review): guarding condition (original line 3127) missing here.
3128 return true;
3129
// For G_PREFETCH the answer comes from the register bank of operand 0 rather
// than from IR metadata.
3130 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
3131 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
3132 AMDGPU::SGPRRegBankID;
3133
// Otherwise uniform only if the pointer is an IR instruction carrying
// "amdgpu.uniform" metadata.
3134 const Instruction *I = dyn_cast<Instruction>(Ptr);
3135 return I && I->getMetadata("amdgpu.uniform");
3136}
3137
3138bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
3139 for (const GEPInfo &GEPInfo : AddrInfo) {
3140 if (!GEPInfo.VgprParts.empty())
3141 return true;
3142 }
3143 return false;
3144}
3145
3146void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
3147 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
3148 unsigned AS = PtrTy.getAddressSpace();
3150 STI.ldsRequiresM0Init()) {
3151 MachineBasicBlock *BB = I.getParent();
3152
3153 // If DS instructions require M0 initialization, insert it before selecting.
3154 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3155 .addImm(-1);
3156 }
3157}
3158
3159bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
3160 MachineInstr &I) const {
3161 initM0(I);
3162 return selectImpl(I, *CoverageInfo);
3163}
3164
3166 if (Reg.isPhysical())
3167 return false;
3168
3170 const unsigned Opcode = MI.getOpcode();
3171
3172 if (Opcode == AMDGPU::COPY)
3173 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
3174
3175 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
3176 Opcode == AMDGPU::G_XOR)
3177 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
3178 isVCmpResult(MI.getOperand(2).getReg(), MRI);
3179
3180 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
3181 return GI->is(Intrinsic::amdgcn_class);
3182
3183 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
3184}
3185
3186bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
3187 MachineBasicBlock *BB = I.getParent();
3188 MachineOperand &CondOp = I.getOperand(0);
3189 Register CondReg = CondOp.getReg();
3190 const DebugLoc &DL = I.getDebugLoc();
3191
3192 unsigned BrOpcode;
3193 Register CondPhysReg;
3194 const TargetRegisterClass *ConstrainRC;
3195
3196 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
3197 // whether the branch is uniform when selecting the instruction. In
3198 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
3199 // RegBankSelect knows what it's doing if the branch condition is scc, even
3200 // though it currently does not.
3201 if (!isVCC(CondReg, *MRI)) {
3202 if (MRI->getType(CondReg) != LLT::scalar(32))
3203 return false;
3204
3205 CondPhysReg = AMDGPU::SCC;
3206 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3207 ConstrainRC = &AMDGPU::SReg_32RegClass;
3208 } else {
3209 // FIXME: Should scc->vcc copies and with exec?
3210
3211 // Unless the value of CondReg is a result of a V_CMP* instruction then we
3212 // need to insert an and with exec.
3213 if (!isVCmpResult(CondReg, *MRI)) {
3214 const bool Is64 = STI.isWave64();
3215 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3216 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3217
3218 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
3219 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
3220 .addReg(CondReg)
3221 .addReg(Exec)
3222 .setOperandDead(3); // Dead scc
3223 CondReg = TmpReg;
3224 }
3225
3226 CondPhysReg = TRI.getVCC();
3227 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3228 ConstrainRC = TRI.getBoolRC();
3229 }
3230
3231 if (!MRI->getRegClassOrNull(CondReg))
3232 MRI->setRegClass(CondReg, ConstrainRC);
3233
3234 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
3235 .addReg(CondReg);
3236 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
3237 .addMBB(I.getOperand(1).getMBB());
3238
3239 I.eraseFromParent();
3240 return true;
3241}
3242
3243bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3244 MachineInstr &I) const {
3245 Register DstReg = I.getOperand(0).getReg();
3246 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3247 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3248 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3249 if (IsVGPR)
3250 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
3251
3252 return RBI.constrainGenericRegister(
3253 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3254}
3255
3256bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
3257 Register DstReg = I.getOperand(0).getReg();
3258 Register SrcReg = I.getOperand(1).getReg();
3259 Register MaskReg = I.getOperand(2).getReg();
3260 LLT Ty = MRI->getType(DstReg);
3261 LLT MaskTy = MRI->getType(MaskReg);
3262 MachineBasicBlock *BB = I.getParent();
3263 const DebugLoc &DL = I.getDebugLoc();
3264
3265 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3266 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3267 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
3268 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
3269 if (DstRB != SrcRB) // Should only happen for hand written MIR.
3270 return false;
3271
3272 // Try to avoid emitting a bit operation when we only need to touch half of
3273 // the 64-bit pointer.
3274 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64);
3275 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
3276 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
3277
3278 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3279 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
3280
3281 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
3282 !CanCopyLow32 && !CanCopyHi32) {
3283 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3284 .addReg(SrcReg)
3285 .addReg(MaskReg)
3286 .setOperandDead(3); // Dead scc
3287 I.eraseFromParent();
3288 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3289 return true;
3290 }
3291
3292 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3293 const TargetRegisterClass &RegRC
3294 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3295
3296 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
3297 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
3298 const TargetRegisterClass *MaskRC =
3299 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3300
3301 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3302 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3303 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
3304 return false;
3305
3306 if (Ty.getSizeInBits() == 32) {
3307 assert(MaskTy.getSizeInBits() == 32 &&
3308 "ptrmask should have been narrowed during legalize");
3309
3310 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
3311 .addReg(SrcReg)
3312 .addReg(MaskReg);
3313
3314 if (!IsVGPR)
3315 NewOp.setOperandDead(3); // Dead scc
3316 I.eraseFromParent();
3317 return true;
3318 }
3319
3320 Register HiReg = MRI->createVirtualRegister(&RegRC);
3321 Register LoReg = MRI->createVirtualRegister(&RegRC);
3322
3323 // Extract the subregisters from the source pointer.
3324 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3325 .addReg(SrcReg, {}, AMDGPU::sub0);
3326 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3327 .addReg(SrcReg, {}, AMDGPU::sub1);
3328
3329 Register MaskedLo, MaskedHi;
3330
3331 if (CanCopyLow32) {
3332 // If all the bits in the low half are 1, we only need a copy for it.
3333 MaskedLo = LoReg;
3334 } else {
3335 // Extract the mask subregister and apply the and.
3336 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3337 MaskedLo = MRI->createVirtualRegister(&RegRC);
3338
3339 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3340 .addReg(MaskReg, {}, AMDGPU::sub0);
3341 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3342 .addReg(LoReg)
3343 .addReg(MaskLo);
3344 }
3345
3346 if (CanCopyHi32) {
3347 // If all the bits in the high half are 1, we only need a copy for it.
3348 MaskedHi = HiReg;
3349 } else {
3350 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3351 MaskedHi = MRI->createVirtualRegister(&RegRC);
3352
3353 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3354 .addReg(MaskReg, {}, AMDGPU::sub1);
3355 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3356 .addReg(HiReg)
3357 .addReg(MaskHi);
3358 }
3359
3360 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3361 .addReg(MaskedLo)
3362 .addImm(AMDGPU::sub0)
3363 .addReg(MaskedHi)
3364 .addImm(AMDGPU::sub1);
3365 I.eraseFromParent();
3366 return true;
3367}
3368
3369/// Return the register to use for the index value, and the subregister to use
3370/// for the indirectly accessed register.
3371static std::pair<Register, unsigned>
3373 const TargetRegisterClass *SuperRC, Register IdxReg,
3374 unsigned EltSize, GISelValueTracking &ValueTracking) {
3375 Register IdxBaseReg;
3376 int Offset;
3377
3378 std::tie(IdxBaseReg, Offset) =
3379 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking);
3380 if (IdxBaseReg == AMDGPU::NoRegister) {
3381 // This will happen if the index is a known constant. This should ordinarily
3382 // be legalized out, but handle it as a register just in case.
3383 assert(Offset == 0);
3384 IdxBaseReg = IdxReg;
3385 }
3386
3387 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3388
3389 // Skip out of bounds offsets, or else we would end up using an undefined
3390 // register.
3391 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3392 return std::pair(IdxReg, SubRegs[0]);
3393 return std::pair(IdxBaseReg, SubRegs[Offset]);
3394}
3395
3396bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3397 MachineInstr &MI) const {
3398 Register DstReg = MI.getOperand(0).getReg();
3399 Register SrcReg = MI.getOperand(1).getReg();
3400 Register IdxReg = MI.getOperand(2).getReg();
3401
3402 LLT DstTy = MRI->getType(DstReg);
3403 LLT SrcTy = MRI->getType(SrcReg);
3404
3405 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3406 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3407 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3408
3409 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3410 // into a waterfall loop.
3411 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3412 return false;
3413
3414 const TargetRegisterClass *SrcRC =
3415 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3416 const TargetRegisterClass *DstRC =
3417 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3418 if (!SrcRC || !DstRC)
3419 return false;
3420 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3421 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3422 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3423 return false;
3424
3425 MachineBasicBlock *BB = MI.getParent();
3426 const DebugLoc &DL = MI.getDebugLoc();
3427 const bool Is64 = DstTy.getSizeInBits() == 64;
3428
3429 unsigned SubReg;
3430 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3431 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT);
3432
3433 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3434 if (DstTy.getSizeInBits() != 32 && !Is64)
3435 return false;
3436
3437 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3438 .addReg(IdxReg);
3439
3440 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3441 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3442 .addReg(SrcReg, {}, SubReg)
3443 .addReg(SrcReg, RegState::Implicit);
3444 MI.eraseFromParent();
3445 return true;
3446 }
3447
3448 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3449 return false;
3450
3451 if (!STI.useVGPRIndexMode()) {
3452 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3453 .addReg(IdxReg);
3454 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3455 .addReg(SrcReg, {}, SubReg)
3456 .addReg(SrcReg, RegState::Implicit);
3457 MI.eraseFromParent();
3458 return true;
3459 }
3460
3461 const MCInstrDesc &GPRIDXDesc =
3462 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3463 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3464 .addReg(SrcReg)
3465 .addReg(IdxReg)
3466 .addImm(SubReg);
3467
3468 MI.eraseFromParent();
3469 return true;
3470}
3471
3472// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3473bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3474 MachineInstr &MI) const {
3475 Register DstReg = MI.getOperand(0).getReg();
3476 Register VecReg = MI.getOperand(1).getReg();
3477 Register ValReg = MI.getOperand(2).getReg();
3478 Register IdxReg = MI.getOperand(3).getReg();
3479
3480 LLT VecTy = MRI->getType(DstReg);
3481 LLT ValTy = MRI->getType(ValReg);
3482 unsigned VecSize = VecTy.getSizeInBits();
3483 unsigned ValSize = ValTy.getSizeInBits();
3484
3485 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3486 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3487 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3488
3489 assert(VecTy.getElementType() == ValTy);
3490
3491 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3492 // into a waterfall loop.
3493 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3494 return false;
3495
3496 const TargetRegisterClass *VecRC =
3497 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3498 const TargetRegisterClass *ValRC =
3499 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3500
3501 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3502 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3503 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3504 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3505 return false;
3506
3507 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3508 return false;
3509
3510 unsigned SubReg;
3511 std::tie(IdxReg, SubReg) =
3512 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT);
3513
3514 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3515 STI.useVGPRIndexMode();
3516
3517 MachineBasicBlock *BB = MI.getParent();
3518 const DebugLoc &DL = MI.getDebugLoc();
3519
3520 if (!IndexMode) {
3521 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3522 .addReg(IdxReg);
3523
3524 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3525 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3526 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3527 .addReg(VecReg)
3528 .addReg(ValReg)
3529 .addImm(SubReg);
3530 MI.eraseFromParent();
3531 return true;
3532 }
3533
3534 const MCInstrDesc &GPRIDXDesc =
3535 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3536 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3537 .addReg(VecReg)
3538 .addReg(ValReg)
3539 .addReg(IdxReg)
3540 .addImm(SubReg);
3541
3542 MI.eraseFromParent();
3543 return true;
3544}
3545
3546static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
3547 switch (Intr) {
3548 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
3549 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
3550 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
3551 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
3552 case Intrinsic::amdgcn_load_async_to_lds:
3553 case Intrinsic::amdgcn_global_load_async_lds:
3554 return true;
3555 }
3556 return false;
3557}
3558
// Select a buffer-load-to-LDS intrinsic into one of the BUFFER_LOAD_*_LDS_*
// instructions. The opcode is chosen from the transfer size (operand 3) and
// from whether a vindex and/or a non-zero voffset is present. Operand 2 is
// copied into M0 ahead of the load. Returns false if the subtarget lacks
// VMEM-to-LDS loads or the size is not supported.
//
// NOTE(review): this listing is missing original lines 3577, 3663-3665 and
// 3668 (see the notes at each gap). The text below is not compilable as shown;
// restore those lines from the upstream file.
3559bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3560 if (!Subtarget->hasVMemToLDSLoad())
3561 return false;
3562 unsigned Opc;
3563 unsigned Size = MI.getOperand(3).getImm();
3564 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3565
3566 // The struct intrinsic variants add one additional operand over raw.
3567 const bool HasVIndex = MI.getNumOperands() == 9;
3568 Register VIndex;
// OpOffset shifts every later operand index by one when a vindex is present.
3569 int OpOffset = 0;
3570 if (HasVIndex) {
3571 VIndex = MI.getOperand(4).getReg();
3572 OpOffset = 1;
3573 }
3574
3575 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3576 std::optional<ValueAndVReg> MaybeVOffset =
// NOTE(review): the initializer expression for MaybeVOffset (original line
// 3577) is missing from this listing.
// A voffset operand is emitted unless it is a known constant zero.
3578 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3579
// Pick the opcode: BOTHEN = vindex + voffset, IDXEN = vindex only,
// OFFEN = voffset only, OFFSET = neither.
3580 switch (Size) {
3581 default:
3582 return false;
3583 case 1:
3584 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3585 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3586 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3587 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3588 break;
3589 case 2:
3590 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3591 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3592 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3593 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3594 break;
3595 case 4:
3596 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3597 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3598 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3599 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3600 break;
3601 case 12:
// 12- and 16-byte transfers need explicit subtarget support.
3602 if (!Subtarget->hasLDSLoadB96_B128())
3603 return false;
3604
3605 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3606 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3607 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3608 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3609 break;
3610 case 16:
3611 if (!Subtarget->hasLDSLoadB96_B128())
3612 return false;
3613
3614 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3615 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3616 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3617 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3618 break;
3619 }
3620
3621 MachineBasicBlock *MBB = MI.getParent();
3622 const DebugLoc &DL = MI.getDebugLoc();
// Operand 2 of the intrinsic is copied into M0 before the load.
3623 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3624 .add(MI.getOperand(2));
3625
3626 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3627
// With both vindex and voffset, the two are packed into one 64-bit VGPR
// operand (vindex in sub0, voffset in sub1), built just before the load.
3628 if (HasVIndex && HasVOffset) {
3629 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3630 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3631 .addReg(VIndex)
3632 .addImm(AMDGPU::sub0)
3633 .addReg(VOffset)
3634 .addImm(AMDGPU::sub1);
3635
3636 MIB.addReg(IdxReg);
3637 } else if (HasVIndex) {
3638 MIB.addReg(VIndex);
3639 } else if (HasVOffset) {
3640 MIB.addReg(VOffset);
3641 }
3642
3643 MIB.add(MI.getOperand(1)); // rsrc
3644 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3645 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3646 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
// The aux immediate is split into the cpol and swz operands, using the
// GFX12+ or pre-GFX12 bit masks as appropriate.
3647 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3648 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3649 : AMDGPU::CPol::ALL_pregfx12)); // cpol
3650 MIB.addImm(
3651 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3652 ? 1
3653 : 0); // swz
// Last immediate: 1 for the asynchronous intrinsic variants, 0 otherwise.
3654 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3655
// The intrinsic's single memory operand is replaced by a separate load
// operand and store operand on the selected instruction.
3656 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3657 // Don't set the offset value here because the pointer points to the base of
3658 // the buffer.
3659 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3660
3661 MachinePointerInfo StorePtrI = LoadPtrI;
3662 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
// NOTE(review): original lines 3663-3665 are missing from this listing: the
// rest of the PointerType::get(...) call and whatever statements followed it.
3666
3667 auto F = LoadMMO->getFlags() &
// NOTE(review): the right-hand side of this `&` (original line 3668) is
// missing from this listing.
3669 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3670 Size, LoadMMO->getBaseAlign());
3671
// Note that the store operand is created with a fixed sizeof(int32_t) size,
// while the load operand uses Size.
3672 MachineMemOperand *StoreMMO =
3673 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3674 sizeof(int32_t), LoadMMO->getBaseAlign());
3675
3676 MIB.setMemRefs({LoadMMO, StoreMMO});
3677
3678 MI.eraseFromParent();
3679 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3680 return true;
3681}
3682
3683/// Match a zero extend from a 32-bit value to 64-bits.
3684Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
3685 Register ZExtSrc;
3686 if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
3687 return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3688
3689 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3690 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3691 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3692 return Register();
3693
3694 assert(Def->getNumOperands() == 3 &&
3695 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3696 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
3697 return Def->getOperand(1).getReg();
3698 }
3699
3700 return Register();
3701}
3702
3703/// Match a sign extend from a 32-bit value to 64-bits.
3704Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
3705 Register SExtSrc;
3706 if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
3707 return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
3708
3709 // Match legalized form %sext = G_MERGE_VALUES (s32 %x), G_ASHR((S32 %x, 31))
3710 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3711 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3712 return Register();
3713
3714 assert(Def->getNumOperands() == 3 &&
3715 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3716 if (mi_match(Def->getOperand(2).getReg(), *MRI,
3717 m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
3718 m_SpecificICst(31))))
3719 return Def->getOperand(1).getReg();
3720
3721 if (VT->signBitIsZero(Reg))
3722 return matchZeroExtendFromS32(Reg);
3723
3724 return Register();
3725}
3726
3727/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
3728/// is 32-bit.
3730AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
3731 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3732 : matchZeroExtendFromS32(Reg);
3733}
3734
3735/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
3736/// is 32-bit.
3738AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
3739 return MRI->getType(Reg) == LLT::scalar(32) ? Reg
3740 : matchSignExtendFromS32(Reg);
3741}
3742
3744AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
3745 bool IsSigned) const {
3746 if (IsSigned)
3747 return matchSignExtendFromS32OrS32(Reg);
3748
3749 return matchZeroExtendFromS32OrS32(Reg);
3750}
3751
3752Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
3753 Register AnyExtSrc;
3754 if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
3755 return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register();
3756
3757 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF)
3758 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3759 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3760 return Register();
3761
3762 assert(Def->getNumOperands() == 3 &&
3763 MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3764
3765 if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef()))
3766 return Def->getOperand(1).getReg();
3767
3768 return Register();
3769}
3770
/// Select a global-memory-to-LDS load intrinsic into a GLOBAL_LOAD_LDS_*
/// instruction.
///
/// Operands read from \p MI: 1 = address register, 2 = value copied into M0,
/// 3 = transfer size in bytes (immediate), 4 = immediate offset,
/// 5 = auxiliary/cache-policy immediate.
///
/// \returns false when the subtarget lacks hasVMemToLDSLoad(), when the size
/// is not one of 1/2/4/12/16, or when a 12/16-byte size is requested without
/// hasLDSLoadB96_B128(); true after \p MI has been replaced.
///
/// NOTE(review): this listing skips some source lines (the embedded numbering
/// jumps 3829->3831, 3854->3858 and 3858->3860), so a few statements below
/// appear truncated or absent. Consult the full file before editing them.
3771bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3772 if (!Subtarget->hasVMemToLDSLoad())
3773 return false;
3774
3775 unsigned Opc;
3776 unsigned Size = MI.getOperand(3).getImm();
3777 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
3778
 // Pick the opcode from the transfer size; 12- and 16-byte transfers are
 // additionally gated on hasLDSLoadB96_B128().
3779 switch (Size) {
3780 default:
3781 return false;
3782 case 1:
3783 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3784 break;
3785 case 2:
3786 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3787 break;
3788 case 4:
3789 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3790 break;
3791 case 12:
3792 if (!Subtarget->hasLDSLoadB96_B128())
3793 return false;
3794 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3795 break;
3796 case 16:
3797 if (!Subtarget->hasLDSLoadB96_B128())
3798 return false;
3799 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3800 break;
3801 }
3802
 // Operand 2 is copied into M0 ahead of the load.
3803 MachineBasicBlock *MBB = MI.getParent();
3804 const DebugLoc &DL = MI.getDebugLoc();
3805 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3806 .add(MI.getOperand(2));
3807
3808 Register Addr = MI.getOperand(1).getReg();
3809 Register VOffset;
3810 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3811 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3812 if (!isSGPR(Addr)) {
3813 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3814 if (isSGPR(AddrDef->Reg)) {
 // The address is just a copy of an SGPR: use the SGPR directly.
3815 Addr = AddrDef->Reg;
3816 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
 // SGPR base + zero-extended 32-bit offset: split into SAddr and VOffset.
3817 Register SAddr =
3818 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3819 if (isSGPR(SAddr)) {
3820 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3821 if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
3822 Addr = SAddr;
3823 VOffset = Off;
3824 }
3825 }
3826 }
3827 }
3828
3829 if (isSGPR(Addr)) {
 // NOTE(review): listing line 3830 is not present here.
 // With an SGPR address a VOffset operand is always supplied; materialize a
 // zero in a VGPR when no offset was split out above.
3831 if (!VOffset) {
3832 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3833 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3834 .addImm(0);
3835 }
3836 }
3837
3838 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3839 .addReg(Addr);
3840
3841 if (isSGPR(Addr))
3842 MIB.addReg(VOffset);
3843
3844 MIB.add(MI.getOperand(4)); // offset
3845
 // Strip the bits in CPol::VIRTUAL_BITS from the aux immediate before using
 // it as the cache policy operand.
3846 unsigned Aux = MI.getOperand(5).getImm();
3847 MIB.addImm(Aux & ~AMDGPU::CPol::VIRTUAL_BITS); // cpol
3848 MIB.addImm(isAsyncLDSDMA(IntrinsicID));
3849
 // Build separate load and store memory operands from the intrinsic's first
 // memory operand. Both start from the same pointer info, with the offset set
 // to the immediate offset operand.
3850 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3851 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3852 LoadPtrI.Offset = MI.getOperand(4).getImm();
3853 MachinePointerInfo StorePtrI = LoadPtrI;
 // NOTE(review): the next two statements are truncated in this listing
 // (lines 3855-3857 and 3859 are absent).
3854 LoadPtrI.V = PoisonValue::get(PointerType::get(MF->getFunction().getContext(),
3858 auto F = LoadMMO->getFlags() &
 // The load MMO covers Size bytes at the original base alignment; the store
 // MMO is sizeof(int32_t) bytes with 4-byte alignment.
3860 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3861 Size, LoadMMO->getBaseAlign());
3862 MachineMemOperand *StoreMMO =
3863 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3864 sizeof(int32_t), Align(4));
3865
3866 MIB.setMemRefs({LoadMMO, StoreMMO});
3867
3868 MI.eraseFromParent();
3869 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3870 return true;
3871}
3872
3873bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
3874 Intrinsic::ID IID) const {
3875 bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
3876 unsigned Opc =
3877 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
3878 int NumGroups = 4;
3879
3880 // A lamda function to check whether an operand is a vector of all 0s.
3881 const auto isAllZeros = [&](MachineOperand &Opnd) {
3882 const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
3883 if (!DefMI)
3884 return false;
3885 return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
3886 };
3887
3888 // Use _D2 version if both group 2 and 3 are zero-initialized.
3889 if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
3890 NumGroups = 2;
3891 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
3892 : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
3893 }
3894
3895 // TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
3896 // for now because all existing targets only support up to 4 groups.
3897 MachineBasicBlock *MBB = MI.getParent();
3898 auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
3899 .add(MI.getOperand(1)) // D# group 0
3900 .add(MI.getOperand(2)); // D# group 1
3901
3902 if (NumGroups >= 4) { // Has at least 4 groups
3903 MIB.add(MI.getOperand(3)) // D# group 2
3904 .add(MI.getOperand(4)); // D# group 3
3905 }
3906
3907 MIB.addImm(0) // r128
3908 .add(MI.getOperand(6)); // cpol
3909
3910 MI.eraseFromParent();
3911 return true;
3912}
3913
3914bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
3915 MachineInstr &MI) const {
3916 unsigned OpcodeOpIdx =
3917 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3;
3918 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm()));
3919 MI.removeOperand(OpcodeOpIdx);
3920 MI.addImplicitDefUseOperands(*MI.getMF());
3921 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3922 return true;
3923}
3924
3925// FIXME: This should be removed and let the patterns select. We just need the
3926// AGPR/VGPR combination versions.
3927bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3928 unsigned Opc;
3929 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3930 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3931 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3932 break;
3933 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3934 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3935 break;
3936 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3937 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3938 break;
3939 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3940 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3941 break;
3942 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3943 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3944 break;
3945 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3946 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3947 break;
3948 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3949 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3950 break;
3951 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3952 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3953 break;
3954 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3955 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3956 break;
3957 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3958 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3959 break;
3960 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3961 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3962 break;
3963 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3964 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3965 break;
3966 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3967 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3968 break;
3969 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3970 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3971 break;
3972 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3973 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3974 break;
3975 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3976 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3977 break;
3978 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3979 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3980 break;
3981 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3982 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3983 break;
3984 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3985 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3986 break;
3987 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3988 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3989 break;
3990 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3991 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3992 break;
3993 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3994 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3995 break;
3996 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3997 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3998 break;
3999 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
4000 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
4001 break;
4002 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
4003 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
4004 break;
4005 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
4006 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
4007 break;
4008 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
4009 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
4010 break;
4011 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
4012 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
4013 break;
4014 default:
4015 llvm_unreachable("unhandled smfmac intrinsic");
4016 }
4017
4018 auto VDst_In = MI.getOperand(4);
4019
4020 MI.setDesc(TII.get(Opc));
4021 MI.removeOperand(4); // VDst_In
4022 MI.removeOperand(1); // Intrinsic ID
4023 MI.addOperand(VDst_In); // Readd VDst_In to the end
4024 MI.addImplicitDefUseOperands(*MI.getMF());
4025 const MCInstrDesc &MCID = MI.getDesc();
4026 if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
4027 MI.getOperand(0).setIsEarlyClobber(true);
4028 }
4029 return true;
4030}
4031
/// Rewrite a permlane16/permlane32 swap intrinsic in place into
/// V_PERMLANE16_SWAP_B32_e64 or V_PERMLANE32_SWAP_B32_e64.
///
/// \param IntrID amdgcn_permlane16_swap selects the 16 variant; any other
/// value selects the 32 variant.
/// \returns false when the subtarget lacks the matching
/// hasPermlane16Swap()/hasPermlane32Swap() feature, true otherwise.
///
/// NOTE(review): this listing skips source line 4050, which follows the
/// declaration of FI; as shown here FI is never used.
4032bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
4033 MachineInstr &MI, Intrinsic::ID IntrID) const {
4034 if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
4035 !Subtarget->hasPermlane16Swap())
4036 return false;
4037 if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
4038 !Subtarget->hasPermlane32Swap())
4039 return false;
4040
4041 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
4042 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
4043 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
4044
 // Drop operand 2 (presumably the intrinsic ID, TODO confirm), switch to the
 // machine opcode, and append EXEC as a non-def implicit register operand.
4045 MI.removeOperand(2);
4046 MI.setDesc(TII.get(Opcode));
4047 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
4048
 // Operand index 4 is taken after the removal above.
4049 MachineOperand &FI = MI.getOperand(4);
4051
4052 constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
4053 return true;
4054}
4055
4056bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
4057 Register DstReg = MI.getOperand(0).getReg();
4058 Register SrcReg = MI.getOperand(1).getReg();
4059 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4060 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4061 MachineBasicBlock *MBB = MI.getParent();
4062 const DebugLoc &DL = MI.getDebugLoc();
4063
4064 if (IsVALU) {
4065 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
4066 .addImm(Subtarget->getWavefrontSizeLog2())
4067 .addReg(SrcReg);
4068 } else {
4069 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
4070 .addReg(SrcReg)
4071 .addImm(Subtarget->getWavefrontSizeLog2())
4072 .setOperandDead(3); // Dead scc
4073 }
4074
4075 const TargetRegisterClass &RC =
4076 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
4077 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
4078 return false;
4079
4080 MI.eraseFromParent();
4081 return true;
4082}
4083
4084bool AMDGPUInstructionSelector::selectWaveShuffleIntrin(
4085 MachineInstr &MI) const {
4086 assert(MI.getNumOperands() == 4);
4087 MachineBasicBlock *MBB = MI.getParent();
4088 const DebugLoc &DL = MI.getDebugLoc();
4089
4090 Register DstReg = MI.getOperand(0).getReg();
4091 Register ValReg = MI.getOperand(2).getReg();
4092 Register IdxReg = MI.getOperand(3).getReg();
4093
4094 const LLT DstTy = MRI->getType(DstReg);
4095 unsigned DstSize = DstTy.getSizeInBits();
4096 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4097 const TargetRegisterClass *DstRC =
4098 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
4099
4100 if (DstTy != LLT::scalar(32))
4101 return false;
4102
4103 if (!Subtarget->supportsBPermute())
4104 return false;
4105
4106 // If we can bpermute across the whole wave, then just do that
4107 if (Subtarget->supportsWaveWideBPermute()) {
4108 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4109 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4110 .addImm(2)
4111 .addReg(IdxReg);
4112
4113 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), DstReg)
4114 .addReg(ShiftIdxReg)
4115 .addReg(ValReg)
4116 .addImm(0);
4117 } else {
4118 // Otherwise, we need to make use of whole wave mode
4119 assert(Subtarget->isWave64());
4120
4121 // Set inactive lanes to poison
4122 Register UndefValReg =
4123 MRI->createVirtualRegister(TRI.getRegClass(AMDGPU::SReg_32RegClassID));
4124 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefValReg);
4125
4126 Register UndefExecReg = MRI->createVirtualRegister(
4127 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4128 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefExecReg);
4129
4130 Register PoisonValReg = MRI->createVirtualRegister(DstRC);
4131 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonValReg)
4132 .addImm(0)
4133 .addReg(ValReg)
4134 .addImm(0)
4135 .addReg(UndefValReg)
4136 .addReg(UndefExecReg);
4137
4138 // ds_bpermute requires index to be multiplied by 4
4139 Register ShiftIdxReg = MRI->createVirtualRegister(DstRC);
4140 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), ShiftIdxReg)
4141 .addImm(2)
4142 .addReg(IdxReg);
4143
4144 Register PoisonIdxReg = MRI->createVirtualRegister(DstRC);
4145 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SET_INACTIVE_B32), PoisonIdxReg)
4146 .addImm(0)
4147 .addReg(ShiftIdxReg)
4148 .addImm(0)
4149 .addReg(UndefValReg)
4150 .addReg(UndefExecReg);
4151
4152 // Get permutation of each half, then we'll select which one to use
4153 Register SameSidePermReg = MRI->createVirtualRegister(DstRC);
4154 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), SameSidePermReg)
4155 .addReg(PoisonIdxReg)
4156 .addReg(PoisonValReg)
4157 .addImm(0);
4158
4159 Register SwappedValReg = MRI->createVirtualRegister(DstRC);
4160 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_PERMLANE64_B32), SwappedValReg)
4161 .addReg(PoisonValReg);
4162
4163 Register OppSidePermReg = MRI->createVirtualRegister(DstRC);
4164 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::DS_BPERMUTE_B32), OppSidePermReg)
4165 .addReg(PoisonIdxReg)
4166 .addReg(SwappedValReg)
4167 .addImm(0);
4168
4169 Register WWMSwapPermReg = MRI->createVirtualRegister(DstRC);
4170 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::STRICT_WWM), WWMSwapPermReg)
4171 .addReg(OppSidePermReg);
4172
4173 // Select which side to take the permute from
4174 // We can get away with only using mbcnt_lo here since we're only
4175 // trying to detect which side of 32 each lane is on, and mbcnt_lo
4176 // returns 32 for lanes 32-63.
4177 Register ThreadIDReg = MRI->createVirtualRegister(DstRC);
4178 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MBCNT_LO_U32_B32_e64), ThreadIDReg)
4179 .addImm(-1)
4180 .addImm(0);
4181
4182 Register XORReg = MRI->createVirtualRegister(DstRC);
4183 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_XOR_B32_e64), XORReg)
4184 .addReg(ThreadIDReg)
4185 .addReg(PoisonIdxReg);
4186
4187 Register ANDReg = MRI->createVirtualRegister(DstRC);
4188 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_AND_B32_e64), ANDReg)
4189 .addReg(XORReg)
4190 .addImm(32);
4191
4192 Register CompareReg = MRI->createVirtualRegister(
4193 TRI.getRegClass(AMDGPU::SReg_64_XEXECRegClassID));
4194 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CompareReg)
4195 .addReg(ANDReg)
4196 .addImm(0);
4197
4198 // Finally do the selection
4199 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
4200 .addImm(0)
4201 .addReg(WWMSwapPermReg)
4202 .addImm(0)
4203 .addReg(SameSidePermReg)
4204 .addReg(CompareReg);
4205 }
4206
4207 MI.eraseFromParent();
4208 return true;
4209}
4210
4211// Match BITOP3 operation and return a number of matched instructions plus
4212// truth table.
//
// Walks the G_AND / G_OR / G_XOR tree rooted at R, collecting up to three
// distinct leaf registers in Src and computing the 8-bit truth table of the
// tree over those leaves. Returns {0, 0} when R's defining instruction is not
// one of those three opcodes, or when an operand cannot be given truth-table
// bits (Src is then restored to its state on entry).
//
// NOTE(review): this listing skips source lines 4214 and 4260. Src is used
// throughout but its declaration is on the missing line 4214 (the recursive
// calls pass it as the second argument).
4213static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
4215 const MachineRegisterInfo &MRI) {
4216 unsigned NumOpcodes = 0;
4217 uint8_t LHSBits, RHSBits;
4218
 // Assign truth-table bits to operand Op. Returns false if Op is a new leaf
 // and Src already holds three entries (and Op is not a 'not' of one of them).
4219 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
4220 // Define truth table given Src0, Src1, Src2 bits permutations:
4221 // 0 0 0
4222 // 0 0 1
4223 // 0 1 0
4224 // 0 1 1
4225 // 1 0 0
4226 // 1 0 1
4227 // 1 1 0
4228 // 1 1 1
4229 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4230
 // Constants need no source slot: all-ones is always 1, zero is always 0.
4231 if (mi_match(Op, MRI, m_AllOnesInt())) {
4232 Bits = 0xff;
4233 return true;
4234 }
4235 if (mi_match(Op, MRI, m_ZeroInt())) {
4236 Bits = 0;
4237 return true;
4238 }
4239
4240 for (unsigned I = 0; I < Src.size(); ++I) {
4241 // Try to find existing reused operand
4242 if (Src[I] == Op) {
4243 Bits = SrcBits[I];
4244 return true;
4245 }
4246 // Try to replace parent operator
4247 if (Src[I] == R) {
4248 Bits = SrcBits[I];
4249 Src[I] = Op;
4250 return true;
4251 }
4252 }
4253
4254 if (Src.size() == 3) {
4255 // No room left for operands. Try one last time, there can be a 'not' of
4256 // one of our source operands. In this case we can compute the bits
4257 // without growing Src vector.
4258 Register LHS;
4259 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
4261 for (unsigned I = 0; I < Src.size(); ++I) {
4262 if (Src[I] == LHS) {
4263 Bits = ~SrcBits[I];
4264 return true;
4265 }
4266 }
4267 }
4268
4269 return false;
4270 }
4271
 // Op is a new leaf: it takes the next free column of the truth table.
4272 Bits = SrcBits[Src.size()];
4273 Src.push_back(Op);
4274 return true;
4275 };
4276
4277 MachineInstr *MI = MRI.getVRegDef(R);
4278 switch (MI->getOpcode()) {
4279 case TargetOpcode::G_AND:
4280 case TargetOpcode::G_OR:
4281 case TargetOpcode::G_XOR: {
4282 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4283 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
4284
 // getOperandBits may modify Src before failing, so keep a copy to roll
 // back to.
4285 SmallVector<Register, 3> Backup(Src.begin(), Src.end());
4286 if (!getOperandBits(LHS, LHSBits) ||
4287 !getOperandBits(RHS, RHSBits)) {
4288 Src = std::move(Backup);
4289 return std::make_pair(0, 0);
4290 }
4291
4292 // Recursion is naturally limited by the size of the operand vector.
 // If an operand is itself a matched bit operation, use its truth table in
 // place of the leaf bits assigned above and count its opcodes.
4293 auto Op = BitOp3_Op(LHS, Src, MRI);
4294 if (Op.first) {
4295 NumOpcodes += Op.first;
4296 LHSBits = Op.second;
4297 }
4298
4299 Op = BitOp3_Op(RHS, Src, MRI);
4300 if (Op.first) {
4301 NumOpcodes += Op.first;
4302 RHSBits = Op.second;
4303 }
4304 break;
4305 }
4306 default:
4307 return std::make_pair(0, 0);
4308 }
4309
 // Combine the operand tables with this instruction's own operation. Only the
 // three opcodes handled above can reach this point.
4310 uint8_t TTbl;
4311 switch (MI->getOpcode()) {
4312 case TargetOpcode::G_AND:
4313 TTbl = LHSBits & RHSBits;
4314 break;
4315 case TargetOpcode::G_OR:
4316 TTbl = LHSBits | RHSBits;
4317 break;
4318 case TargetOpcode::G_XOR:
4319 TTbl = LHSBits ^ RHSBits;
4320 break;
4321 default:
4322 break;
4323 }
4324
 // +1 accounts for the instruction defining R itself.
4325 return std::make_pair(NumOpcodes + 1, TTbl);
4326}
4327
/// Try to select the G_AND/G_OR/G_XOR tree rooted at \p MI as a single
/// V_BITOP3 instruction, using BitOp3_Op() to collect the sources and the
/// truth table.
///
/// \returns false (leaving \p MI untouched) when the subtarget has no BITOP3
/// instructions, the destination is not on the VGPR bank, or the matched tree
/// does not meet the thresholds below; true once \p MI has been replaced.
///
/// NOTE(review): this listing skips source line 4338, where Src is declared.
4328bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
4329 if (!Subtarget->hasBitOp3Insts())
4330 return false;
4331
4332 Register DstReg = MI.getOperand(0).getReg();
4333 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
4334 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
4335 if (!IsVALU)
4336 return false;
4337
4339 uint8_t TTbl;
4340 unsigned NumOpcodes;
4341
4342 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
4343
4344 // Src.empty() case can happen if all operands are all zero or all ones.
4345 // Normally it shall be optimized out before reaching this.
4346 if (NumOpcodes < 2 || Src.empty())
4347 return false;
4348
 // Anything that is not s32 is selected with a B16 opcode below.
4349 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
4350 if (NumOpcodes == 2 && IsB32) {
4351 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4352 // asm more readable. This cannot be modeled with AddedComplexity because
4353 // selector does not know how many operations did we match.
4354 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
4355 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
4356 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
4357 return false;
4358 } else if (NumOpcodes < 4) {
4359 // For a uniform case threshold should be higher to account for moves
4360 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
4361 // in SGPRs and a readtfirstlane after.
4362 return false;
4363 }
4364
 // Non-32-bit results use a B16 opcode; with true16 instructions the
 // gfx1250 t16 or fake16 variant is chosen.
4365 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
4366 if (!IsB32 && STI.hasTrue16BitInsts())
4367 Opc = STI.useRealTrue16Insts() ? AMDGPU::V_BITOP3_B16_gfx1250_t16_e64
4368 : AMDGPU::V_BITOP3_B16_gfx1250_fake16_e64;
4369 unsigned CBL = STI.getConstantBusLimit(Opc);
4370 MachineBasicBlock *MBB = MI.getParent();
4371 const DebugLoc &DL = MI.getDebugLoc();
4372
 // The first CBL sources on the SGPR bank are used as they are; each further
 // SGPR-bank source is copied into a fresh VGPR_32 register.
4373 for (unsigned I = 0; I < Src.size(); ++I) {
4374 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
4375 if (RB->getID() != AMDGPU::SGPRRegBankID)
4376 continue;
4377 if (CBL > 0) {
4378 --CBL;
4379 continue;
4380 }
4381 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4382 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
4383 .addReg(Src[I]);
4384 Src[I] = NewReg;
4385 }
4386
4387 // Last operand can be ignored, turning a ternary operation into a binary.
4388 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4389 // 'c' with 'a' here without changing the answer. In some pathological
4390 // cases it should be possible to get an operation with a single operand
4391 // too if optimizer would not catch it.
4392 while (Src.size() < 3)
4393 Src.push_back(Src[0]);
4394
 // Operand order: [src_mod0] src0 [src_mod1] src1 [src_mod2] src2 ttbl
 // [op_sel]; the bracketed immediates are only added for the non-B32 forms.
4395 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
4396 if (!IsB32)
4397 MIB.addImm(0); // src_mod0
4398 MIB.addReg(Src[0]);
4399 if (!IsB32)
4400 MIB.addImm(0); // src_mod1
4401 MIB.addReg(Src[1]);
4402 if (!IsB32)
4403 MIB.addImm(0); // src_mod2
4404 MIB.addReg(Src[2])
4405 .addImm(TTbl);
4406 if (!IsB32)
4407 MIB.addImm(0); // op_sel
4408
4409 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
4410 MI.eraseFromParent();
4411
4412 return true;
4413}
4414
4415bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
4416 Register SrcReg = MI.getOperand(0).getReg();
4417 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
4418 return false;
4419
4420 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
4421 Register SP =
4422 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
4423 Register WaveAddr = getWaveAddress(DefMI);
4424 MachineBasicBlock *MBB = MI.getParent();
4425 const DebugLoc &DL = MI.getDebugLoc();
4426
4427 if (!WaveAddr) {
4428 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4429 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
4430 .addReg(SrcReg)
4431 .addImm(Subtarget->getWavefrontSizeLog2())
4432 .setOperandDead(3); // Dead scc
4433 }
4434
4435 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
4436 .addReg(WaveAddr);
4437
4438 MI.eraseFromParent();
4439 return true;
4440}
4441
// Top-level selection dispatch for a single instruction I.
//
// NOTE(review): the line carrying this function's signature (listing line
// 4442) and listing line 4569 are absent from this listing; the body below is
// otherwise complete.
//
// Instructions that are not pre-isel opcodes are treated as already selected,
// except COPY, which goes through selectCOPY(). Everything else is dispatched
// on its opcode: some cases go straight to a hand-written selector, some try
// the generated selectImpl() first and fall back to a hand-written selector,
// and some do the reverse.
4443
4444 if (!I.isPreISelOpcode()) {
4445 if (I.isCopy())
4446 return selectCOPY(I);
4447 return true;
4448 }
4449
4450 switch (I.getOpcode()) {
 // Bit operations: BITOP3 first, then generated patterns, then manual.
4451 case TargetOpcode::G_AND:
4452 case TargetOpcode::G_OR:
4453 case TargetOpcode::G_XOR:
4454 if (selectBITOP3(I))
4455 return true;
4456 if (selectImpl(I, *CoverageInfo))
4457 return true;
4458 return selectG_AND_OR_XOR(I);
4459 case TargetOpcode::G_ADD:
4460 case TargetOpcode::G_SUB:
4461 case TargetOpcode::G_PTR_ADD:
4462 if (selectImpl(I, *CoverageInfo))
4463 return true;
4464 return selectG_ADD_SUB(I);
4465 case TargetOpcode::G_UADDO:
4466 case TargetOpcode::G_USUBO:
4467 case TargetOpcode::G_UADDE:
4468 case TargetOpcode::G_USUBE:
4469 return selectG_UADDO_USUBO_UADDE_USUBE(I);
4470 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4471 case AMDGPU::G_AMDGPU_MAD_I64_I32:
4472 return selectG_AMDGPU_MAD_64_32(I);
 // These opcodes are selected the same way as a COPY.
4473 case TargetOpcode::G_INTTOPTR:
4474 case TargetOpcode::G_BITCAST:
4475 case TargetOpcode::G_PTRTOINT:
4476 case TargetOpcode::G_FREEZE:
4477 return selectCOPY(I);
4478 case TargetOpcode::G_FNEG:
4479 if (selectImpl(I, *CoverageInfo))
4480 return true;
4481 return selectG_FNEG(I);
4482 case TargetOpcode::G_FABS:
4483 if (selectImpl(I, *CoverageInfo))
4484 return true;
4485 return selectG_FABS(I);
4486 case TargetOpcode::G_EXTRACT:
4487 return selectG_EXTRACT(I);
4488 case TargetOpcode::G_MERGE_VALUES:
4489 case TargetOpcode::G_CONCAT_VECTORS:
4490 return selectG_MERGE_VALUES(I);
4491 case TargetOpcode::G_UNMERGE_VALUES:
4492 return selectG_UNMERGE_VALUES(I);
4493 case TargetOpcode::G_BUILD_VECTOR:
4494 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4495 return selectG_BUILD_VECTOR(I);
4496 case TargetOpcode::G_IMPLICIT_DEF:
4497 return selectG_IMPLICIT_DEF(I);
4498 case TargetOpcode::G_INSERT:
4499 return selectG_INSERT(I);
4500 case TargetOpcode::G_INTRINSIC:
4501 case TargetOpcode::G_INTRINSIC_CONVERGENT:
4502 return selectG_INTRINSIC(I);
4503 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4504 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4505 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
 // Compares: the manual selector is tried before the generated patterns.
4506 case TargetOpcode::G_ICMP:
4507 case TargetOpcode::G_FCMP:
4508 if (selectG_ICMP_or_FCMP(I))
4509 return true;
4510 return selectImpl(I, *CoverageInfo);
 // All loads, stores and atomics share one selector.
4511 case TargetOpcode::G_LOAD:
4512 case TargetOpcode::G_ZEXTLOAD:
4513 case TargetOpcode::G_SEXTLOAD:
4514 case TargetOpcode::G_STORE:
4515 case TargetOpcode::G_ATOMIC_CMPXCHG:
4516 case TargetOpcode::G_ATOMICRMW_XCHG:
4517 case TargetOpcode::G_ATOMICRMW_ADD:
4518 case TargetOpcode::G_ATOMICRMW_SUB:
4519 case TargetOpcode::G_ATOMICRMW_AND:
4520 case TargetOpcode::G_ATOMICRMW_OR:
4521 case TargetOpcode::G_ATOMICRMW_XOR:
4522 case TargetOpcode::G_ATOMICRMW_MIN:
4523 case TargetOpcode::G_ATOMICRMW_MAX:
4524 case TargetOpcode::G_ATOMICRMW_UMIN:
4525 case TargetOpcode::G_ATOMICRMW_UMAX:
4526 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4527 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4528 case TargetOpcode::G_ATOMICRMW_USUB_COND:
4529 case TargetOpcode::G_ATOMICRMW_USUB_SAT:
4530 case TargetOpcode::G_ATOMICRMW_FADD:
4531 case TargetOpcode::G_ATOMICRMW_FMIN:
4532 case TargetOpcode::G_ATOMICRMW_FMAX:
4533 return selectG_LOAD_STORE_ATOMICRMW(I);
4534 case TargetOpcode::G_SELECT:
4535 return selectG_SELECT(I);
4536 case TargetOpcode::G_TRUNC:
4537 return selectG_TRUNC(I);
4538 case TargetOpcode::G_SEXT:
4539 case TargetOpcode::G_ZEXT:
4540 case TargetOpcode::G_ANYEXT:
4541 case TargetOpcode::G_SEXT_INREG:
4542 // This is a workaround. For extension from type i1, `selectImpl()` uses
4543 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
4544 // i1 can only be hold in a SGPR class.
4545 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
4546 selectImpl(I, *CoverageInfo))
4547 return true;
4548 return selectG_SZA_EXT(I);
4549 case TargetOpcode::G_FPEXT:
4550 if (selectG_FPEXT(I))
4551 return true;
4552 return selectImpl(I, *CoverageInfo);
4553 case TargetOpcode::G_BRCOND:
4554 return selectG_BRCOND(I);
4555 case TargetOpcode::G_GLOBAL_VALUE:
4556 return selectG_GLOBAL_VALUE(I);
4557 case TargetOpcode::G_PTRMASK:
4558 return selectG_PTRMASK(I);
4559 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4560 return selectG_EXTRACT_VECTOR_ELT(I);
4561 case TargetOpcode::G_INSERT_VECTOR_ELT:
4562 return selectG_INSERT_VECTOR_ELT(I);
4563 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4564 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4565 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4566 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4567 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
 // NOTE(review): the initializer of Intr (listing line 4569) is not shown.
4568 const AMDGPU::ImageDimIntrinsicInfo *Intr =
4570 assert(Intr && "not an image intrinsic with image pseudo");
4571 return selectImageIntrinsic(I, Intr);
4572 }
4573 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY:
4574 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY:
4575 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY:
4576 return selectBVHIntersectRayIntrinsic(I);
4577 case AMDGPU::G_SBFX:
4578 case AMDGPU::G_UBFX:
4579 return selectG_SBFX_UBFX(I);
 // Only the instruction descriptor is swapped; operands are left as they are.
4580 case AMDGPU::G_SI_CALL:
4581 I.setDesc(TII.get(AMDGPU::SI_CALL));
4582 return true;
4583 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4584 return selectWaveAddress(I);
4585 case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
4586 I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
4587 return true;
4588 }
4589 case AMDGPU::G_STACKRESTORE:
4590 return selectStackRestore(I);
4591 case AMDGPU::G_PHI:
4592 return selectPHI(I);
4593 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4594 return selectCOPY_SCC_VCC(I);
4595 case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4596 return selectCOPY_VCC_SCC(I);
4597 case AMDGPU::G_AMDGPU_READANYLANE:
4598 return selectReadAnyLane(I);
 // Constants and every opcode not listed above rely on generated patterns.
4599 case TargetOpcode::G_CONSTANT:
4600 case TargetOpcode::G_FCONSTANT:
4601 default:
4602 return selectImpl(I, *CoverageInfo);
4603 }
 // Not reached: every switch case above returns.
4604 return false;
4605}
4606
4608AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
4609 return {{
4610 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4611 }};
4612
4613}
4614
4615std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4616 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
4617 unsigned Mods = 0;
4618 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
4619
4620 if (MI->getOpcode() == AMDGPU::G_FNEG) {
4621 Src = MI->getOperand(1).getReg();
4622 Mods |= SISrcMods::NEG;
4623 MI = getDefIgnoringCopies(Src, *MRI);
4624 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4625 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
4626 // denormal mode, but we're implicitly canonicalizing in a source operand.
4627 const ConstantFP *LHS =
4628 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4629 if (LHS && LHS->isZero()) {
4630 Mods |= SISrcMods::NEG;
4631 Src = MI->getOperand(2).getReg();
4632 }
4633 }
4634
4635 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
4636 Src = MI->getOperand(1).getReg();
4637 Mods |= SISrcMods::ABS;
4638 }
4639
4640 if (OpSel)
4641 Mods |= SISrcMods::OP_SEL_0;
4642
4643 return std::pair(Src, Mods);
4644}
4645
4646std::pair<Register, unsigned>
4647AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
4648 unsigned Mods;
4649 std::tie(Src, Mods) = selectVOP3ModsImpl(Src);
4650 Mods |= SISrcMods::OP_SEL_1;
4651 return std::pair(Src, Mods);
4652}
4653
4654Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4655 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
4656 bool ForceVGPR) const {
4657 if ((Mods != 0 || ForceVGPR) &&
4658 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4659
4660 // If we looked through copies to find source modifiers on an SGPR operand,
4661 // we now have an SGPR register source. To avoid potentially violating the
4662 // constant bus restriction, we need to insert a copy to a VGPR.
4663 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
4664 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4665 TII.get(AMDGPU::COPY), VGPRSrc)
4666 .addReg(Src);
4667 Src = VGPRSrc;
4668 }
4669
4670 return Src;
4671}
4672
4673///
4674/// This will select either an SGPR or VGPR operand and will save us from
4675/// having to write an extra tablegen pattern.
4677AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
4678 return {{
4679 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4680 }};
4681}
4682
4684AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
4685 Register Src;
4686 unsigned Mods;
4687 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4688
4689 return {{
4690 [=](MachineInstrBuilder &MIB) {
4691 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4692 },
4693 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4694 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4695 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4696 }};
4697}
4698
4700AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
4701 Register Src;
4702 unsigned Mods;
4703 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4704 /*IsCanonicalizing=*/true,
4705 /*AllowAbs=*/false);
4706
4707 return {{
4708 [=](MachineInstrBuilder &MIB) {
4709 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4710 },
4711 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4712 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4713 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4714 }};
4715}
4716
4718AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
4719 return {{
4720 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4721 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
4722 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
4723 }};
4724}
4725
4727AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
4728 Register Src;
4729 unsigned Mods;
4730 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4731
4732 return {{
4733 [=](MachineInstrBuilder &MIB) {
4734 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4735 },
4736 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4737 }};
4738}
4739
4741AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4742 MachineOperand &Root) const {
4743 Register Src;
4744 unsigned Mods;
4745 std::tie(Src, Mods) =
4746 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
4747
4748 return {{
4749 [=](MachineInstrBuilder &MIB) {
4750 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4751 },
4752 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4753 }};
4754}
4755
4757AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
4758 Register Src;
4759 unsigned Mods;
4760 std::tie(Src, Mods) =
4761 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
4762 /*AllowAbs=*/false);
4763
4764 return {{
4765 [=](MachineInstrBuilder &MIB) {
4766 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4767 },
4768 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4769 }};
4770}
4771
4773AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
4774 Register Reg = Root.getReg();
4775 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
4776 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
4777 return {};
4778 return {{
4779 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4780 }};
4781}
4782
4783enum class SrcStatus {
4788 // This means current op = [op_upper, op_lower] and src = -op_lower.
4791 // This means current op = [op_upper, op_lower] and src = [op_upper,
4792 // -op_lower].
4800};
4801/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n`
4802static bool isTruncHalf(const MachineInstr *MI,
4803 const MachineRegisterInfo &MRI) {
4804 if (MI->getOpcode() != AMDGPU::G_TRUNC)
4805 return false;
4806
4807 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
4808 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4809 return DstSize * 2 == SrcSize;
4810}
4811
4812/// Test if the MI is logic shift right with half bits,
4813/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)`
4814static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4815 if (MI->getOpcode() != AMDGPU::G_LSHR)
4816 return false;
4817
4818 Register ShiftSrc;
4819 std::optional<ValueAndVReg> ShiftAmt;
4820 if (mi_match(MI->getOperand(0).getReg(), MRI,
4821 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4822 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4823 unsigned Shift = ShiftAmt->Value.getZExtValue();
4824 return Shift * 2 == SrcSize;
4825 }
4826 return false;
4827}
4828
4829/// Test if the MI is shift left with half bits,
4830/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)`
4831static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) {
4832 if (MI->getOpcode() != AMDGPU::G_SHL)
4833 return false;
4834
4835 Register ShiftSrc;
4836 std::optional<ValueAndVReg> ShiftAmt;
4837 if (mi_match(MI->getOperand(0).getReg(), MRI,
4838 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
4839 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
4840 unsigned Shift = ShiftAmt->Value.getZExtValue();
4841 return Shift * 2 == SrcSize;
4842 }
4843 return false;
4844}
4845
4846/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n`
4847static bool isUnmergeHalf(const MachineInstr *MI,
4848 const MachineRegisterInfo &MRI) {
4849 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES)
4850 return false;
4851 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() &&
4852 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef();
4853}
4854
4856
4858 const MachineRegisterInfo &MRI) {
4859 LLT OpTy = MRI.getType(Reg);
4860 if (OpTy.isScalar())
4861 return TypeClass::SCALAR;
4862 if (OpTy.isVector() && OpTy.getNumElements() == 2)
4865}
4866
4868 const MachineRegisterInfo &MRI) {
4869 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI);
4870 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR)
4871 return SrcStatus::INVALID;
4872
4873 switch (S) {
4874 case SrcStatus::IS_SAME:
4875 if (NegType == TypeClass::VECTOR_OF_TWO) {
4876 // Vector of 2:
4877 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4878 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4879 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4880 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4882 }
4883 if (NegType == TypeClass::SCALAR) {
4884 // Scalar:
4885 // [SrcHi, SrcLo] = [CurrHi, CurrLo]
4886 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4887 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4888 // [SrcHi, SrcLo] = [-OpHi, OpLo]
4889 return SrcStatus::IS_HI_NEG;
4890 }
4891 break;
4893 if (NegType == TypeClass::VECTOR_OF_TWO) {
4894 // Vector of 2:
4895 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4896 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type)
4897 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4898 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo]
4899 return SrcStatus::IS_LO_NEG;
4900 }
4901 if (NegType == TypeClass::SCALAR) {
4902 // Scalar:
4903 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
4904 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type)
4905 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4906 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo]
4907 return SrcStatus::IS_SAME;
4908 }
4909 break;
4911 if (NegType == TypeClass::VECTOR_OF_TWO) {
4912 // Vector of 2:
4913 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4914 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4915 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4916 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo]
4917 return SrcStatus::IS_HI_NEG;
4918 }
4919 if (NegType == TypeClass::SCALAR) {
4920 // Scalar:
4921 // [SrcHi, SrcLo] = [CurrHi, -CurrLo]
4922 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4923 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4924 // [SrcHi, SrcLo] = [-OpHi, -OpLo]
4926 }
4927 break;
4929 if (NegType == TypeClass::VECTOR_OF_TWO) {
4930 // Vector of 2:
4931 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4932 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type)
4933 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
4934 // [SrcHi, SrcLo] = [OpHi, OpLo]
4935 return SrcStatus::IS_SAME;
4936 }
4937 if (NegType == TypeClass::SCALAR) {
4938 // Scalar:
4939 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo]
4940 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type)
4941 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type)
4942 // [SrcHi, SrcLo] = [OpHi, -OpLo]
4943 return SrcStatus::IS_LO_NEG;
4944 }
4945 break;
4947 // Vector of 2:
4948 // Src = CurrUpper
4949 // Curr = [CurrUpper, CurrLower]
4950 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4951 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4952 // Src = -OpUpper
4953 //
4954 // Scalar:
4955 // Src = CurrUpper
4956 // Curr = [CurrUpper, CurrLower]
4957 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4958 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4959 // Src = -OpUpper
4962 if (NegType == TypeClass::VECTOR_OF_TWO) {
4963 // Vector of 2:
4964 // Src = CurrLower
4965 // Curr = [CurrUpper, CurrLower]
4966 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4967 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4968 // Src = -OpLower
4970 }
4971 if (NegType == TypeClass::SCALAR) {
4972 // Scalar:
4973 // Src = CurrLower
4974 // Curr = [CurrUpper, CurrLower]
4975 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4976 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4977 // Src = OpLower
4979 }
4980 break;
4982 // Vector of 2:
4983 // Src = -CurrUpper
4984 // Curr = [CurrUpper, CurrLower]
4985 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
4986 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
4987 // Src = -(-OpUpper) = OpUpper
4988 //
4989 // Scalar:
4990 // Src = -CurrUpper
4991 // Curr = [CurrUpper, CurrLower]
4992 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
4993 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
4994 // Src = -(-OpUpper) = OpUpper
4997 if (NegType == TypeClass::VECTOR_OF_TWO) {
4998 // Vector of 2:
4999 // Src = -CurrLower
5000 // Curr = [CurrUpper, CurrLower]
5001 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type)
5002 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type)
5003 // Src = -(-OpLower) = OpLower
5005 }
5006 if (NegType == TypeClass::SCALAR) {
5007 // Scalar:
5008 // Src = -CurrLower
5009 // Curr = [CurrUpper, CurrLower]
5010 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type)
5011 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type)
5012 // Src = -OpLower
5014 }
5015 break;
5016 default:
5017 break;
5018 }
5019 llvm_unreachable("unexpected SrcStatus & NegType combination");
5020}
5021
5022static std::optional<std::pair<Register, SrcStatus>>
5023calcNextStatus(std::pair<Register, SrcStatus> Curr,
5024 const MachineRegisterInfo &MRI) {
5025 const MachineInstr *MI = MRI.getVRegDef(Curr.first);
5026
5027 unsigned Opc = MI->getOpcode();
5028
5029 // Handle general Opc cases.
5030 switch (Opc) {
5031 case AMDGPU::G_BITCAST:
5032 return std::optional<std::pair<Register, SrcStatus>>(
5033 {MI->getOperand(1).getReg(), Curr.second});
5034 case AMDGPU::COPY:
5035 if (MI->getOperand(1).getReg().isPhysical())
5036 return std::nullopt;
5037 return std::optional<std::pair<Register, SrcStatus>>(
5038 {MI->getOperand(1).getReg(), Curr.second});
5039 case AMDGPU::G_FNEG: {
5040 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI);
5041 if (Stat == SrcStatus::INVALID)
5042 return std::nullopt;
5043 return std::optional<std::pair<Register, SrcStatus>>(
5044 {MI->getOperand(1).getReg(), Stat});
5045 }
5046 default:
5047 break;
5048 }
5049
5050 // Calc next Stat from current Stat.
5051 switch (Curr.second) {
5052 case SrcStatus::IS_SAME:
5053 if (isTruncHalf(MI, MRI))
5054 return std::optional<std::pair<Register, SrcStatus>>(
5055 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5056 else if (isUnmergeHalf(MI, MRI)) {
5057 if (Curr.first == MI->getOperand(0).getReg())
5058 return std::optional<std::pair<Register, SrcStatus>>(
5059 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF});
5060 return std::optional<std::pair<Register, SrcStatus>>(
5061 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF});
5062 }
5063 break;
5065 if (isTruncHalf(MI, MRI)) {
5066 // [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5067 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower
5068 // = [OpLowerHi, OpLowerLo]
5069 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo]
5070 // = [-OpLowerHi, OpLowerLo]
5071 // = -OpLower
5072 return std::optional<std::pair<Register, SrcStatus>>(
5073 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5074 }
5075 if (isUnmergeHalf(MI, MRI)) {
5076 if (Curr.first == MI->getOperand(0).getReg())
5077 return std::optional<std::pair<Register, SrcStatus>>(
5078 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5079 return std::optional<std::pair<Register, SrcStatus>>(
5080 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5081 }
5082 break;
5084 if (isShlHalf(MI, MRI))
5085 return std::optional<std::pair<Register, SrcStatus>>(
5086 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF});
5087 break;
5089 if (isLshrHalf(MI, MRI))
5090 return std::optional<std::pair<Register, SrcStatus>>(
5091 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF});
5092 break;
5094 if (isShlHalf(MI, MRI))
5095 return std::optional<std::pair<Register, SrcStatus>>(
5096 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG});
5097 break;
5099 if (isLshrHalf(MI, MRI))
5100 return std::optional<std::pair<Register, SrcStatus>>(
5101 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG});
5102 break;
5103 default:
5104 break;
5105 }
5106 return std::nullopt;
5107}
5108
5109/// This is used to control valid status that current MI supports. For example,
5110/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG
5111/// bit on VOP3P.
5112/// The class can be further extended to recognize support on SEL, NEG, ABS bit
5113/// for different MI on different arch
5115private:
5116 bool HasNeg = false;
5117 // Assume all complex pattern of VOP3P have opsel.
5118 bool HasOpsel = true;
5119
5120public:
5122 const MachineInstr *MI = MRI.getVRegDef(Reg);
5123 unsigned Opc = MI->getOpcode();
5124
5125 if (Opc == TargetOpcode::G_INTRINSIC) {
5126 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID();
5127 // Only float point intrinsic has neg & neg_hi bits.
5128 if (IntrinsicID == Intrinsic::amdgcn_fdot2)
5129 HasNeg = true;
5131 // Keep same for generic op.
5132 HasNeg = true;
5133 }
5134 }
5135 bool checkOptions(SrcStatus Stat) const {
5136 if (!HasNeg &&
5137 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) {
5138 return false;
5139 }
5140 if (!HasOpsel &&
5141 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) {
5142 return false;
5143 }
5144 return true;
5145 }
5146};
5147
5150 int MaxDepth = 3) {
5151 int Depth = 0;
5152 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI);
5154
5155 while (Depth <= MaxDepth && Curr.has_value()) {
5156 Depth++;
5157 if (SO.checkOptions(Curr.value().second))
5158 Statlist.push_back(Curr.value());
5159 Curr = calcNextStatus(Curr.value(), MRI);
5160 }
5161
5162 return Statlist;
5163}
5164
5165static std::pair<Register, SrcStatus>
5167 int MaxDepth = 3) {
5168 int Depth = 0;
5169 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME};
5170 auto Curr = calcNextStatus(LastSameOrNeg, MRI);
5171
5172 while (Depth <= MaxDepth && Curr.has_value()) {
5173 Depth++;
5174 SrcStatus Stat = Curr.value().second;
5175 if (SO.checkOptions(Stat)) {
5176 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG ||
5178 LastSameOrNeg = Curr.value();
5179 }
5180 Curr = calcNextStatus(Curr.value(), MRI);
5181 }
5182
5183 return LastSameOrNeg;
5184}
5185
5186static bool isSameBitWidth(Register Reg1, Register Reg2,
5187 const MachineRegisterInfo &MRI) {
5188 unsigned Width1 = MRI.getType(Reg1).getSizeInBits();
5189 unsigned Width2 = MRI.getType(Reg2).getSizeInBits();
5190 return Width1 == Width2;
5191}
5192
5193static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) {
5194 // SrcStatus::IS_LOWER_HALF remain 0.
5195 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) {
5196 Mods ^= SISrcMods::NEG_HI;
5197 Mods |= SISrcMods::OP_SEL_1;
5198 } else if (HiStat == SrcStatus::IS_UPPER_HALF)
5199 Mods |= SISrcMods::OP_SEL_1;
5200 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG)
5201 Mods ^= SISrcMods::NEG_HI;
5202 else if (HiStat == SrcStatus::IS_HI_NEG)
5203 Mods ^= SISrcMods::NEG_HI;
5204
5205 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) {
5206 Mods ^= SISrcMods::NEG;
5207 Mods |= SISrcMods::OP_SEL_0;
5208 } else if (LoStat == SrcStatus::IS_UPPER_HALF)
5209 Mods |= SISrcMods::OP_SEL_0;
5210 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG)
5211 Mods |= SISrcMods::NEG;
5212 else if (LoStat == SrcStatus::IS_HI_NEG)
5213 Mods ^= SISrcMods::NEG;
5214
5215 return Mods;
5216}
5217
5218static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg,
5219 Register RootReg, const SIInstrInfo &TII,
5220 const MachineRegisterInfo &MRI) {
5221 auto IsHalfState = [](SrcStatus S) {
5224 };
5225 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) &&
5226 IsHalfState(HiStat);
5227}
5228
5229std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl(
5230 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const {
5231 unsigned Mods = 0;
5232 // No modification if Root type is not form of <2 x Type>.
5233 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) {
5234 Mods |= SISrcMods::OP_SEL_1;
5235 return {RootReg, Mods};
5236 }
5237
5238 SearchOptions SO(RootReg, MRI);
5239
5240 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO);
5241
5242 if (Stat.second == SrcStatus::IS_BOTH_NEG)
5244 else if (Stat.second == SrcStatus::IS_HI_NEG)
5245 Mods ^= SISrcMods::NEG_HI;
5246 else if (Stat.second == SrcStatus::IS_LO_NEG)
5247 Mods ^= SISrcMods::NEG;
5248
5249 MachineInstr *MI = MRI.getVRegDef(Stat.first);
5250
5251 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 ||
5252 (IsDOT && Subtarget->hasDOTOpSelHazard())) {
5253 Mods |= SISrcMods::OP_SEL_1;
5254 return {Stat.first, Mods};
5255 }
5256
5258 getSrcStats(MI->getOperand(2).getReg(), MRI, SO);
5259
5260 if (StatlistHi.empty()) {
5261 Mods |= SISrcMods::OP_SEL_1;
5262 return {Stat.first, Mods};
5263 }
5264
5266 getSrcStats(MI->getOperand(1).getReg(), MRI, SO);
5267
5268 if (StatlistLo.empty()) {
5269 Mods |= SISrcMods::OP_SEL_1;
5270 return {Stat.first, Mods};
5271 }
5272
5273 for (int I = StatlistHi.size() - 1; I >= 0; I--) {
5274 for (int J = StatlistLo.size() - 1; J >= 0; J--) {
5275 if (StatlistHi[I].first == StatlistLo[J].first &&
5276 isValidToPack(StatlistHi[I].second, StatlistLo[J].second,
5277 StatlistHi[I].first, RootReg, TII, MRI))
5278 return {StatlistHi[I].first,
5279 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)};
5280 }
5281 }
5282 // Packed instructions do not have abs modifiers.
5283 Mods |= SISrcMods::OP_SEL_1;
5284
5285 return {Stat.first, Mods};
5286}
5287
5288// Removed unused function `getAllKindImm` to eliminate dead code.
5289
5290static bool checkRB(Register Reg, unsigned int RBNo,
5291 const AMDGPURegisterBankInfo &RBI,
5292 const MachineRegisterInfo &MRI,
5293 const TargetRegisterInfo &TRI) {
5294 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
5295 return RB->getID() == RBNo;
5296}
5297
5298// This function is used to get the correct register bank for returned reg.
5299// Assume:
5300// 1. VOP3P is always legal for VGPR.
5301// 2. RootOp's regbank is legal.
5302// Thus
5303// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR.
5304// 2. If RootOp is VGPR, then NewOp must be VGPR.
5306 const AMDGPURegisterBankInfo &RBI,
5308 const TargetRegisterInfo &TRI,
5309 const SIInstrInfo &TII) {
5310 // RootOp can only be VGPR or SGPR (some hand written cases such as.
5311 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs).
5312 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
5313 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI))
5314 return NewReg;
5315
5316 MachineInstr *MI = MRI.getVRegDef(RootReg);
5317 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) {
5318 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp.
5319 return RootReg;
5320 }
5321
5322 MachineBasicBlock *BB = MI->getParent();
5323 Register DstReg = MRI.cloneVirtualRegister(RootReg);
5324
5326 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
5327 .addReg(NewReg);
5328
5329 // Only accept VGPR.
5330 return MIB->getOperand(0).getReg();
5331}
5332
5334AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root,
5335 bool IsDOT) const {
5336 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5337 Register Reg;
5338 unsigned Mods;
5339 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT);
5340
5341 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
5342 return {{
5343 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5344 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5345 }};
5346}
5347
5349AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
5350
5351 return selectVOP3PRetHelper(Root);
5352}
5353
5355AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
5356
5357 return selectVOP3PRetHelper(Root, true);
5358}
5359
5361AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
5362 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5363 Register Src;
5364 unsigned Mods;
5365 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
5366 if (Mods != SISrcMods::OP_SEL_1)
5367 return {};
5368
5369 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5370}
5371
5373AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
5374 Register Src;
5375 unsigned Mods;
5376 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5377
5378 return {{
5379 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5380 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5381 }};
5382}
5383
5385AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
5386 Register Src;
5387 unsigned Mods;
5388 std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
5389 if (Mods != SISrcMods::OP_SEL_1)
5390 return {};
5391
5392 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
5393}
5394
5396AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
5397 MachineOperand &Root) const {
5398 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
5399 "expected i1 value");
5400 unsigned Mods = SISrcMods::OP_SEL_1;
5401 if (Root.getImm() != 0)
5402 Mods |= SISrcMods::OP_SEL_0;
5403
5404 return {{
5405 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5406 }};
5407}
5408
5410 MachineInstr *InsertPt,
5411 MachineRegisterInfo &MRI) {
5412 const TargetRegisterClass *DstRegClass;
5413 switch (Elts.size()) {
5414 case 8:
5415 DstRegClass = &AMDGPU::VReg_256RegClass;
5416 break;
5417 case 4:
5418 DstRegClass = &AMDGPU::VReg_128RegClass;
5419 break;
5420 case 2:
5421 DstRegClass = &AMDGPU::VReg_64RegClass;
5422 break;
5423 default:
5424 llvm_unreachable("unhandled Reg sequence size");
5425 }
5426
5427 MachineIRBuilder B(*InsertPt);
5428 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
5429 .addDef(MRI.createVirtualRegister(DstRegClass));
5430 for (unsigned i = 0; i < Elts.size(); ++i) {
5431 MIB.addReg(Elts[i]);
5433 }
5434 return MIB->getOperand(0).getReg();
5435}
5436
5437static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
5439 MachineInstr *InsertPt,
5440 MachineRegisterInfo &MRI) {
5441 if (ModOpcode == TargetOpcode::G_FNEG) {
5442 Mods |= SISrcMods::NEG;
5443 // Check if all elements also have abs modifier
5444 SmallVector<Register, 8> NegAbsElts;
5445 for (auto El : Elts) {
5446 Register FabsSrc;
5447 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
5448 break;
5449 NegAbsElts.push_back(FabsSrc);
5450 }
5451 if (Elts.size() != NegAbsElts.size()) {
5452 // Neg
5453 Src = buildRegSequence(Elts, InsertPt, MRI);
5454 } else {
5455 // Neg and Abs
5456 Mods |= SISrcMods::NEG_HI;
5457 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
5458 }
5459 } else {
5460 assert(ModOpcode == TargetOpcode::G_FABS);
5461 // Abs
5462 Mods |= SISrcMods::NEG_HI;
5463 Src = buildRegSequence(Elts, InsertPt, MRI);
5464 }
5465}
5466
5468AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
5469 Register Src = Root.getReg();
5470 unsigned Mods = SISrcMods::OP_SEL_1;
5472
5473 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
5474 assert(BV->getNumSources() > 0);
5475 // Based on first element decide which mod we match, neg or abs
5476 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
5477 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
5478 ? AMDGPU::G_FNEG
5479 : AMDGPU::G_FABS;
5480 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
5481 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
5482 if (ElF32->getOpcode() != ModOpcode)
5483 break;
5484 EltsF32.push_back(ElF32->getOperand(1).getReg());
5485 }
5486
5487 // All elements had ModOpcode modifier
5488 if (BV->getNumSources() == EltsF32.size()) {
5489 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
5490 *MRI);
5491 }
5492 }
5493
5494 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5495 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5496}
5497
5499AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
5500 Register Src = Root.getReg();
5501 unsigned Mods = SISrcMods::OP_SEL_1;
5502 SmallVector<Register, 8> EltsV2F16;
5503
5504 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5505 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5506 Register FNegSrc;
5507 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
5508 break;
5509 EltsV2F16.push_back(FNegSrc);
5510 }
5511
5512 // All elements had ModOpcode modifier
5513 if (CV->getNumSources() == EltsV2F16.size()) {
5514 Mods |= SISrcMods::NEG;
5515 Mods |= SISrcMods::NEG_HI;
5516 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
5517 }
5518 }
5519
5520 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5521 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5522}
5523
5525AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
5526 Register Src = Root.getReg();
5527 unsigned Mods = SISrcMods::OP_SEL_1;
5528 SmallVector<Register, 8> EltsV2F16;
5529
5530 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
5531 assert(CV->getNumSources() > 0);
5532 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
5533 // Based on first element decide which mod we match, neg or abs
5534 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
5535 ? AMDGPU::G_FNEG
5536 : AMDGPU::G_FABS;
5537
5538 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
5539 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
5540 if (ElV2F16->getOpcode() != ModOpcode)
5541 break;
5542 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
5543 }
5544
5545 // All elements had ModOpcode modifier
5546 if (CV->getNumSources() == EltsV2F16.size()) {
5547 MachineIRBuilder B(*Root.getParent());
5548 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
5549 *MRI);
5550 }
5551 }
5552
5553 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5554 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
5555}
5556
// Complex-pattern renderer that accepts only inline-constant sources.
//
// A floating-point constant (or splat) is tried first: if it is an inline
// constant its bit pattern is rendered as a sign-extended immediate, otherwise
// the match fails. An integer constant (or splat) is tried next and rendered
// as a sign-extended immediate when it is an inline constant. Anything else
// yields an empty result.
5558AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
5559 std::optional<FPValueAndVReg> FPValReg;
5560 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
5561 if (TII.isInlineConstant(FPValReg->Value)) {
5562 return {{[=](MachineInstrBuilder &MIB) {
5563 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
5564 }}};
5565 }
5566 // Non-inlineable splat floats should not fall-through for integer immediate
5567 // checks.
5568 return {};
5569 }
5570
5571 APInt ICst;
5572 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
5573 if (TII.isInlineConstant(ICst)) {
5574 return {
5575 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
5576 }
5577 }
5578
5579 return {};
5580}
5581
// Complex-pattern renderer producing a source register and an index_key
// immediate.
//
// The source defaults to operand 0 of the instruction defining Root (ignoring
// copies) with key 0. If that value is a G_LSHR of a 32-bit register by a
// constant that is a multiple of 8, the unshifted register is used instead
// and the key becomes the shift amount divided by 8.
5583AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
5584 Register Src =
5585 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5586 unsigned Key = 0;
5587
5588 Register ShiftSrc;
5589 std::optional<ValueAndVReg> ShiftAmt;
5590 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5591 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5592 ShiftAmt->Value.getZExtValue() % 8 == 0) {
5593 Key = ShiftAmt->Value.getZExtValue() / 8;
5594 Src = ShiftSrc;
5595 }
5596
5597 return {{
5598 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5599 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5600 }};
5601}
5602
// Complex-pattern renderer producing a source register and an index_key
// immediate.
//
// The source defaults to operand 0 of the instruction defining Root (ignoring
// copies) with key 0. If that value is a G_LSHR of a 32-bit register by the
// constant 16, the unshifted register is used instead and the key becomes 1.
5604AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
5605
5606 Register Src =
5607 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5608 unsigned Key = 0;
5609
5610 Register ShiftSrc;
5611 std::optional<ValueAndVReg> ShiftAmt;
5612 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
5613 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
5614 ShiftAmt->Value.getZExtValue() == 16) {
5615 Src = ShiftSrc;
5616 Key = 1;
5617 }
5618
5619 return {{
5620 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5621 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5622 }};
5623}
5624
// Complex-pattern renderer producing a source register and an index_key
// immediate.
//
// The source defaults to operand 0 of the instruction defining Root (ignoring
// copies) with key 0. If that value is a zero- or any-extend of a 32-bit
// register, and that register is operand 1 of a two-result G_UNMERGE_VALUES
// (either directly or through a single COPY), the unmerge's source operand is
// used instead and the key becomes 1.
5626AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
5627 Register Src =
5628 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
5629 unsigned Key = 0;
5630
5631 Register S32 = matchZeroExtendFromS32(Src);
5632 if (!S32)
5633 S32 = matchAnyExtendFromS32(Src);
5634
5635 if (S32) {
5636 const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI);
5637 if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
5638 assert(Def->getNumOperands() == 3);
5639 Register DstReg1 = Def->getOperand(1).getReg();
// Both alternatives must compare against DstReg1. m_Reg(DstReg1) would bind
// the register instead and accept a COPY of any value, including the other
// result of the unmerge.
5640 if (mi_match(S32, *MRI,
5641 m_any_of(m_SpecificReg(DstReg1), m_Copy(m_SpecificReg(DstReg1))))) {
5642 Src = Def->getOperand(2).getReg();
5643 Key = 1;
5644 }
5645 }
5646 }
5647
5648 return {{
5649 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5650 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
5651 }};
5652}
5653
// Complex-pattern renderer that forwards to selectVOP3ModsImpl on the root
// register and renders the resulting source register followed by the source
// modifier immediate. op_sel is not handled yet (see FIXME below).
5655AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
5656 Register Src;
5657 unsigned Mods;
5658 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5659
5660 // FIXME: Handle op_sel
5661 return {{
5662 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5663 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5664 }};
5665}
5666
5667// FIXME-TRUE16 remove when fake16 is removed
// Complex-pattern renderer for a VINTERP source with modifiers.
//
// Modifiers are matched by selectVOP3ModsImpl with canonicalizing enabled,
// abs disallowed and OpSel=false. The register operand is produced by
// copyToVGPRIfSrcFolded with ForceVGPR set; the modifier immediate follows.
5669AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
5670 Register Src;
5671 unsigned Mods;
5672 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5673 /*IsCanonicalizing=*/true,
5674 /*AllowAbs=*/false,
5675 /*OpSel=*/false);
5676
5677 return {{
5678 [=](MachineInstrBuilder &MIB) {
5679 MIB.addReg(
5680 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5681 },
5682 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5683 }};
5684}
5685
// Complex-pattern renderer for a VINTERP source with modifiers, high-half
// variant.
//
// Same as selectVINTERPMods except that selectVOP3ModsImpl is called with
// OpSel=true. The register operand is produced by copyToVGPRIfSrcFolded with
// ForceVGPR set; the modifier immediate follows.
5687AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
5688 Register Src;
5689 unsigned Mods;
5690 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
5691 /*IsCanonicalizing=*/true,
5692 /*AllowAbs=*/false,
5693 /*OpSel=*/true);
5694
5695 return {{
5696 [=](MachineInstrBuilder &MIB) {
5697 MIB.addReg(
5698 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
5699 },
5700 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
5701 }};
5702}
5703
5704// Given \p Offset and the load specified by the \p Root operand, check if
5705// \p Offset is a multiple of the load byte size. If it is, update \p Offset to
5706// a pre-scaled value and return true.
//
// Returns false, leaving \p Offset untouched, when the subtarget has no
// scale-offset support or the first memory operand has no known size.
// \p IsSigned selects between the signed and unsigned extend / multiply forms
// that are matched.
// NOTE(review): several lines of this definition (one parameter and parts of
// the match expression) are absent from this listing; the comments here cover
// only the visible code.
5707bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
5709 bool IsSigned) const {
5710 if (!Subtarget->hasScaleOffset())
5711 return false;
5712
5713 const MachineInstr &MI = *Root.getParent();
5714 MachineMemOperand *MMO = *MI.memoperands_begin();
5715
5716 if (!MMO->getSize().hasValue())
5717 return false;
5718
5719 uint64_t Size = MMO->getSize().getValue();
5720
// Look through an extend from 32 bits (if any) and through copies so the
// multiply/shift that produced the offset can be matched directly.
5721 Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
5722 if (!OffsetReg)
5723 OffsetReg = Offset;
5724
5725 if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
5726 OffsetReg = Def->Reg;
5727
// Op0 receives the unscaled offset operand when one of the patterns below
// matches.
5728 Register Op0;
5729 MachineInstr *Mul;
5730 bool ScaleOffset =
5731 (isPowerOf2_64(Size) &&
5732 mi_match(OffsetReg, *MRI,
5733 m_GShl(m_Reg(Op0),
5736 mi_match(OffsetReg, *MRI,
5738 m_Copy(m_SpecificICst(Size))))) ||
5739 mi_match(
5740 OffsetReg, *MRI,
5741 m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
5742 m_Reg(Op0), m_SpecificICst(Size))) ||
5743 // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
5744 (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
5745 (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
5746 : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
5747 (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
5748 VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
5749 mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
5750 mi_match(Mul->getOperand(3).getReg(), *MRI,
5752 m_Copy(m_SpecificICst(Size))))) &&
5753 mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
5754
5755 if (ScaleOffset)
5756 Offset = Op0;
5757
5758 return ScaleOffset;
5759}
5760
// Split the address of the SMRD access \p Root into the pieces requested by
// the caller. Which out-pointers are non-null selects the form:
//  - \p SOffset and \p Offset: base + SGPR offset + encoded immediate. The
//    immediate comes from AddrInfo[0]; the base/SGPR-offset pair comes from
//    AddrInfo[1].
//  - \p Offset only: base + encoded immediate.
//  - \p SOffset only: base + SGPR offset. A non-zero immediate that fits in
//    32 unsigned bits is materialized into a new SGPR with S_MOV_B32.
// \p ScaleOffset, when non-null, is reset to false and then set to the result
// of selectScaleOffset for the register offset.
// Returns true and fills \p Base plus the requested outputs on success;
// returns false when no form applies.
5761bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
5762 Register &Base,
5763 Register *SOffset,
5764 int64_t *Offset,
5765 bool *ScaleOffset) const {
5766 MachineInstr *MI = Root.getParent();
5767 MachineBasicBlock *MBB = MI->getParent();
5768
5769 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
5770 // then we can select all ptr + 32-bit offsets.
5771 SmallVector<GEPInfo, 4> AddrInfo;
5772 getAddrModeInfo(*MI, *MRI, AddrInfo);
5773
5774 if (AddrInfo.empty())
5775 return false;
5776
5777 const GEPInfo &GEPI = AddrInfo[0];
5778 std::optional<int64_t> EncodedImm;
5779
5780 if (ScaleOffset)
5781 *ScaleOffset = false;
5782
// Form 1: base + SGPR offset + immediate.
5783 if (SOffset && Offset) {
5784 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5785 /*HasSOffset=*/true);
5786 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
5787 AddrInfo.size() > 1) {
5788 const GEPInfo &GEPI2 = AddrInfo[1];
5789 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
5790 Register OffsetReg = GEPI2.SgprParts[1];
5791 if (ScaleOffset)
5792 *ScaleOffset =
5793 selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5794 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5795 if (OffsetReg) {
5796 Base = GEPI2.SgprParts[0];
5797 *SOffset = OffsetReg;
5798 *Offset = *EncodedImm;
5799 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
5800 return true;
5801
5802 // For unbuffered smem loads, it is illegal for the Immediate Offset
5803 // to be negative if the resulting (Offset + (M0 or SOffset or zero)
5804 // is negative. Handle the case where the Immediate Offset + SOffset
5805 // is negative.
5806 auto SKnown = VT->getKnownBits(*SOffset);
5807 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
5808 return false;
5809
5810 return true;
5811 }
5812 }
5813 }
5814 return false;
5815 }
5816
// Form 2: base + immediate.
5817 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
5818 /*HasSOffset=*/false);
5819 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
5820 Base = GEPI.SgprParts[0];
5821 *Offset = *EncodedImm;
5822 return true;
5823 }
5824
5825 // SGPR offset is unsigned.
5826 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
5827 GEPI.Imm != 0) {
5828 // If we make it this far we have a load with an 32-bit immediate offset.
5829 // It is OK to select this using a sgpr offset, because we have already
5830 // failed trying to select this load into one of the _IMM variants since
5831 // the _IMM Patterns are considered before the _SGPR patterns.
5832 Base = GEPI.SgprParts[0];
5833 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5834 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
5835 .addImm(GEPI.Imm);
5836 return true;
5837 }
5838
// Form 3: base + SGPR offset. Exactly two SGPR parts are required because
// SgprParts[1] is read below; a single part must not reach this path.
5839 if (SOffset && GEPI.SgprParts.size() == 2 && GEPI.Imm == 0) {
5840 Register OffsetReg = GEPI.SgprParts[1];
5841 if (ScaleOffset)
5842 *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
5843 OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
5844 if (OffsetReg) {
5845 Base = GEPI.SgprParts[0];
5846 *SOffset = OffsetReg;
5847 return true;
5848 }
5849 }
5850
5851 return false;
5852}
5853
// Complex-pattern renderer for the base + immediate SMRD form.
// Asks selectSmrdOffset for an immediate offset only (no SGPR offset, no
// scale flag) and renders the base register followed by the offset immediate.
// Returns std::nullopt when selectSmrdOffset fails.
5855AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
5856 Register Base;
5857 int64_t Offset;
5858 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
5859 /* ScaleOffset */ nullptr))
5860 return std::nullopt;
5861
5862 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5863 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
5864}
5865
// Complex-pattern renderer for the base + 32-bit literal offset SMRD form.
// Requires the first address-info entry to have exactly one SGPR part and an
// immediate that getSMRDEncodedLiteralOffset32 can encode; otherwise returns
// std::nullopt. Renders the pointer register followed by the encoded offset.
5867AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
5868 SmallVector<GEPInfo, 4> AddrInfo;
5869 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
5870
5871 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
5872 return std::nullopt;
5873
5874 const GEPInfo &GEPInfo = AddrInfo[0];
5875 Register PtrReg = GEPInfo.SgprParts[0];
5876 std::optional<int64_t> EncodedImm =
5877 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
5878 if (!EncodedImm)
5879 return std::nullopt;
5880
5881 return {{
5882 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
5883 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
5884 }};
5885}
5886
// Complex-pattern renderer for the base + SGPR offset SMRD form.
// Asks selectSmrdOffset for an SGPR offset only and renders the base
// register, the SGPR offset register, and a cache-policy immediate that is
// CPol::SCAL when the offset was matched as pre-scaled and 0 otherwise.
// Returns std::nullopt when selectSmrdOffset fails.
5888AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
5889 Register Base, SOffset;
5890 bool ScaleOffset;
5891 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
5892 &ScaleOffset))
5893 return std::nullopt;
5894
5895 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5896 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5897 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5898 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5899}
5900
// Complex-pattern renderer for the base + SGPR offset + immediate SMRD form.
// Asks selectSmrdOffset for both offsets and renders the base register, the
// SGPR offset register, the immediate offset, and a cache-policy immediate
// that is CPol::SCAL when the offset was matched as pre-scaled and 0
// otherwise. Returns std::nullopt when selectSmrdOffset fails.
5902AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
5903 Register Base, SOffset;
5904 int64_t Offset;
5905 bool ScaleOffset;
5906 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
5907 return std::nullopt;
5908
5909 unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
5910 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
5911 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5912 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5913 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
5914}
5915
// Try to fold a constant offset of the address \p Root into the immediate
// offset field of a FLAT-family instruction of kind \p FlatVariant.
//
// Returns {base pointer, constant offset} when the fold is allowed, otherwise
// {Root register, 0}. The fold is rejected when the subtarget has no FLAT
// instruction offsets, the offset is zero, the scratch base is not legal (for
// FlatScratch), the address is not inbounds (for FLAT), or the offset is not
// a legal FLAT offset for the access's address space.
5916std::pair<Register, int>
5917AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
5918 uint64_t FlatVariant) const {
5919 MachineInstr *MI = Root.getParent();
5920
5921 auto Default = std::pair(Root.getReg(), 0);
5922
5923 if (!STI.hasFlatInstOffsets())
5924 return Default;
5925
5926 Register PtrBase;
5927 int64_t ConstOffset;
5928 bool IsInBounds;
5929 std::tie(PtrBase, ConstOffset, IsInBounds) =
5930 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5931
5932 // Adding the offset to the base address with an immediate in a FLAT
5933 // instruction must not change the memory aperture in which the address falls.
5934 // Therefore we can only fold offsets from inbounds GEPs into FLAT
5935 // instructions.
5936 if (ConstOffset == 0 ||
5937 (FlatVariant == SIInstrFlags::FlatScratch &&
5938 !isFlatScratchBaseLegal(Root.getReg())) ||
5939 (FlatVariant == SIInstrFlags::FLAT && !IsInBounds))
5940 return Default;
5941
// The address space is taken from the instruction's first memory operand.
5942 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
5943 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
5944 return Default;
5945
5946 return std::pair(PtrBase, ConstOffset);
5947}
5948
// Complex-pattern renderer for a FLAT address: renders the pointer register
// and immediate offset chosen by selectFlatOffsetImpl for the FLAT variant.
5950AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
5951 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
5952
5953 return {{
5954 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5955 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5956 }};
5957}
5958
// Complex-pattern renderer for a global address: renders the pointer register
// and immediate offset chosen by selectFlatOffsetImpl for the FlatGlobal
// variant.
5960AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
5961 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
5962
5963 return {{
5964 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5965 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5966 }};
5967}
5968
// Complex-pattern renderer for a scratch address: renders the pointer
// register and immediate offset chosen by selectFlatOffsetImpl for the
// FlatScratch variant.
5970AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
5971 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
5972
5973 return {{
5974 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
5975 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
5976 }};
5977}
5978
5979// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
//
// \p CPolBits is OR-ed into (or used as) the rendered cache-policy immediate.
// \p NeedIOffset selects whether an immediate-offset operand is rendered and
// whether a constant offset may be folded into it.
// Rendered operands are saddr, voffset, [offset,] cpol. Returns std::nullopt
// when no SGPR base can be used or when VALU adds are judged cheaper.
// NOTE(review): two argument lines of this definition are absent from this
// listing; the comments here cover only the visible code.
5981AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
5982 unsigned CPolBits,
5983 bool NeedIOffset) const {
5984 Register Addr = Root.getReg();
5985 Register PtrBase;
5986 int64_t ConstOffset;
5987 int64_t ImmOffset = 0;
5988
5989 // Match the immediate offset first, which canonically is moved as low as
5990 // possible.
5991 std::tie(PtrBase, ConstOffset, std::ignore) =
5992 getPtrBaseWithConstantOffset(Addr, *MRI);
5993
5994 if (ConstOffset != 0) {
5995 if (NeedIOffset &&
5996 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
5998 Addr = PtrBase;
5999 ImmOffset = ConstOffset;
6000 } else {
6001 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
6002 if (isSGPR(PtrBaseDef->Reg)) {
6003 if (ConstOffset > 0) {
6004 // Offset is too large.
6005 //
6006 // saddr + large_offset -> saddr +
6007 // (voffset = large_offset & ~MaxOffset) +
6008 // (large_offset & MaxOffset);
6009 int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
6010 if (NeedIOffset) {
6011 std::tie(SplitImmOffset, RemainderOffset) =
6012 TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
6014 }
6015
// The remainder goes into a 32-bit voffset register, so it must fit the
// signed or unsigned 32-bit range depending on the subtarget.
6016 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
6017 : isUInt<32>(RemainderOffset)) {
6018 MachineInstr *MI = Root.getParent();
6019 MachineBasicBlock *MBB = MI->getParent();
6020 Register HighBits =
6021 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6022
6023 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6024 HighBits)
6025 .addImm(RemainderOffset);
6026
6027 if (NeedIOffset)
6028 return {{
6029 [=](MachineInstrBuilder &MIB) {
6030 MIB.addReg(PtrBase);
6031 }, // saddr
6032 [=](MachineInstrBuilder &MIB) {
6033 MIB.addReg(HighBits);
6034 }, // voffset
6035 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
6036 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6037 }};
6038 return {{
6039 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
6040 [=](MachineInstrBuilder &MIB) {
6041 MIB.addReg(HighBits);
6042 }, // voffset
6043 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
6044 }};
6045 }
6046 }
6047
6048 // We are adding a 64 bit SGPR and a constant. If constant bus limit
6049 // is 1 we would need to perform 1 or 2 extra moves for each half of
6050 // the constant and it is better to do a scalar add and then issue a
6051 // single VALU instruction to materialize zero. Otherwise it is less
6052 // instructions to perform VALU adds with immediates or inline literals.
6053 unsigned NumLiterals =
6054 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) +
6055 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset)));
6056 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
6057 return std::nullopt;
6058 }
6059 }
6060 }
6061
6062 // Match the variable offset.
6063 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6064 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6065 // Look through the SGPR->VGPR copy.
6066 Register SAddr =
6067 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
6068
6069 if (isSGPR(SAddr)) {
6070 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
6071
6072 // It's possible voffset is an SGPR here, but the copy to VGPR will be
6073 // inserted later.
6074 bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
6075 Subtarget->hasSignedGVSOffset());
6076 if (Register VOffset = matchExtendFromS32OrS32(
6077 PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
6078 if (NeedIOffset)
6079 return {{[=](MachineInstrBuilder &MIB) { // saddr
6080 MIB.addReg(SAddr);
6081 },
6082 [=](MachineInstrBuilder &MIB) { // voffset
6083 MIB.addReg(VOffset);
6084 },
6085 [=](MachineInstrBuilder &MIB) { // offset
6086 MIB.addImm(ImmOffset);
6087 },
6088 [=](MachineInstrBuilder &MIB) { // cpol
6089 MIB.addImm(CPolBits |
6090 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6091 }}};
6092 return {{[=](MachineInstrBuilder &MIB) { // saddr
6093 MIB.addReg(SAddr);
6094 },
6095 [=](MachineInstrBuilder &MIB) { // voffset
6096 MIB.addReg(VOffset);
6097 },
6098 [=](MachineInstrBuilder &MIB) { // cpol
6099 MIB.addImm(CPolBits |
6100 (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
6101 }}};
6102 }
6103 }
6104 }
6105
6106 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
6107 // drop this.
6108 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
6109 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
6110 return std::nullopt;
6111
6112 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
6113 // moves required to copy a 64-bit SGPR to VGPR.
6114 MachineInstr *MI = Root.getParent();
6115 MachineBasicBlock *MBB = MI->getParent();
6116 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6117
6118 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
6119 .addImm(0);
6120
6121 if (NeedIOffset)
6122 return {{
6123 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6124 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6125 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6126 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6127 }};
6128 return {{
6129 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
6130 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
6131 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
6132 }};
6133}
6134
// Convenience form of selectGlobalSAddr with no extra cache-policy bits; the
// remaining argument is left to its default.
6136AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
6137 return selectGlobalSAddr(Root, 0);
6138}
6139
// selectGlobalSAddr variant that forwards the cache-policy immediate found in
// the last operand of the parent instruction, with the SCAL bit cleared.
6141AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
6142 const MachineInstr &I = *Root.getParent();
6143
6144 // We are assuming CPol is always the last operand of the intrinsic.
6145 auto PassedCPol =
6146 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6147 return selectGlobalSAddr(Root, PassedCPol);
6148}
6149
// selectGlobalSAddr variant that forwards the cache-policy immediate found in
// the second-to-last operand of the parent instruction, with the SCAL bit
// cleared.
6151AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
6152 const MachineInstr &I = *Root.getParent();
6153
6154 // We are assuming CPol is second from last operand of the intrinsic.
6155 auto PassedCPol =
6156 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6157 return selectGlobalSAddr(Root, PassedCPol);
6158}
6159
// selectGlobalSAddr variant that passes CPol::GLC as the cache-policy bits.
6161AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
6162 return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
6163}
6164
// selectGlobalSAddr variant without an immediate-offset operand
// (NeedIOffset=false). The cache-policy immediate is taken from the last
// operand of the parent instruction, with the SCAL bit cleared.
6166AMDGPUInstructionSelector::selectGlobalSAddrNoIOffset(
6167 MachineOperand &Root) const {
6168 const MachineInstr &I = *Root.getParent();
6169
6170 // We are assuming CPol is always the last operand of the intrinsic.
6171 auto PassedCPol =
6172 I.getOperand(I.getNumOperands() - 1).getImm() & ~AMDGPU::CPol::SCAL;
6173 return selectGlobalSAddr(Root, PassedCPol, false);
6174}
6175
// selectGlobalSAddr variant without an immediate-offset operand
// (NeedIOffset=false). The cache-policy immediate is taken from the
// second-to-last operand of the parent instruction, with the SCAL bit cleared.
6177AMDGPUInstructionSelector::selectGlobalSAddrNoIOffsetM0(
6178 MachineOperand &Root) const {
6179 const MachineInstr &I = *Root.getParent();
6180
6181 // We are assuming CPol is second from last operand of the intrinsic.
6182 auto PassedCPol =
6183 I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
6184 return selectGlobalSAddr(Root, PassedCPol, false);
6185}
6186
// Complex-pattern renderer for a scratch access addressed by an SGPR (or
// frame index) plus an immediate offset.
//
// A legal constant offset is peeled off first. The remaining address is then
// rendered as a frame index when it is a G_FRAME_INDEX, as a freshly built
// S_ADD_I32 of a frame index and an SGPR when it is such a G_PTR_ADD, or as
// the address register itself. Returns std::nullopt when the resulting saddr
// is not an SGPR.
// NOTE(review): one argument line of the isLegalFLATOffset call is absent
// from this listing.
6188AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
6189 Register Addr = Root.getReg();
6190 Register PtrBase;
6191 int64_t ConstOffset;
6192 int64_t ImmOffset = 0;
6193
6194 // Match the immediate offset first, which canonically is moved as low as
6195 // possible.
6196 std::tie(PtrBase, ConstOffset, std::ignore) =
6197 getPtrBaseWithConstantOffset(Addr, *MRI);
6198
6199 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
6200 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6202 Addr = PtrBase;
6203 ImmOffset = ConstOffset;
6204 }
6205
6206 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6207 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6208 int FI = AddrDef->MI->getOperand(1).getIndex();
6209 return {{
6210 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6211 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6212 }};
6213 }
6214
6215 Register SAddr = AddrDef->Reg;
6216
6217 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
6218 Register LHS = AddrDef->MI->getOperand(1).getReg();
6219 Register RHS = AddrDef->MI->getOperand(2).getReg();
6220 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6221 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
6222
// frame index + SGPR: compute the sum with a scalar add and use that as
// saddr.
6223 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
6224 isSGPR(RHSDef->Reg)) {
6225 int FI = LHSDef->MI->getOperand(1).getIndex();
6226 MachineInstr &I = *Root.getParent();
6227 MachineBasicBlock *BB = I.getParent();
6228 const DebugLoc &DL = I.getDebugLoc();
6229 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6230
6231 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
6232 .addFrameIndex(FI)
6233 .addReg(RHSDef->Reg)
6234 .setOperandDead(3); // Dead scc
6235 }
6236 }
6237
6238 if (!isSGPR(SAddr))
6239 return std::nullopt;
6240
6241 return {{
6242 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
6243 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
6244 }};
6245}
6246
6247// Check whether the flat scratch SVS swizzle bug affects this access.
// Returns false on subtargets without the bug. Otherwise returns true when,
// based on the known-bits maxima of \p VAddr and of \p SAddr + \p ImmOffset,
// the sum of their two low-order bits could carry into bit 2.
6248bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
6249 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
6250 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
6251 return false;
6252
6253 // The bug affects the swizzling of SVS accesses if there is any carry out
6254 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
6255 // voffset to (soffset + inst_offset).
6256 auto VKnown = VT->getKnownBits(VAddr);
6257 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr),
6258 KnownBits::makeConstant(APInt(32, ImmOffset)));
6259 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
6260 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
6261 return (VMax & 3) + (SMax & 3) >= 4;
6262}
6263
// Complex-pattern renderer for a scratch access addressed by
// VGPR + SGPR (or frame index) + immediate.
//
// A legal constant offset is peeled off first. The remaining address must be
// a G_PTR_ADD whose second operand is in the VGPR bank; the first operand
// must be a frame index or an SGPR (possibly behind copies). The match is
// rejected when the base-legality check fails or the SVS swizzle bug could
// affect the access. Rendered operands are vaddr, saddr, offset, cpol.
// NOTE(review): two lines of this definition are absent from this listing
// (an isLegalFLATOffset argument and one arm of the CPol expression).
6265AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
6266 Register Addr = Root.getReg();
6267 Register PtrBase;
6268 int64_t ConstOffset;
6269 int64_t ImmOffset = 0;
6270
6271 // Match the immediate offset first, which canonically is moved as low as
6272 // possible.
6273 std::tie(PtrBase, ConstOffset, std::ignore) =
6274 getPtrBaseWithConstantOffset(Addr, *MRI);
6275
6276 Register OrigAddr = Addr;
6277 if (ConstOffset != 0 &&
6278 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
6280 Addr = PtrBase;
6281 ImmOffset = ConstOffset;
6282 }
6283
6284 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
6285 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
6286 return std::nullopt;
6287
6288 Register RHS = AddrDef->MI->getOperand(2).getReg();
6289 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
6290 return std::nullopt;
6291
6292 Register LHS = AddrDef->MI->getOperand(1).getReg();
6293 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
6294
// OrigAddr != Addr means an immediate was folded above, so the
// SGPR + VGPR + Imm legality check applies instead of the SGPR + VGPR one.
6295 if (OrigAddr != Addr) {
6296 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
6297 return std::nullopt;
6298 } else {
6299 if (!isFlatScratchBaseLegalSV(OrigAddr))
6300 return std::nullopt;
6301 }
6302
6303 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
6304 return std::nullopt;
6305
6306 unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
6308 : 0;
6309
6310 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6311 int FI = LHSDef->MI->getOperand(1).getIndex();
6312 return {{
6313 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6314 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
6315 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6316 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6317 }};
6318 }
6319
// If LHS itself is not an SGPR, retry with the register found behind any
// copies.
6320 if (!isSGPR(LHS))
6321 if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
6322 LHS = Def->Reg;
6323
6324 if (!isSGPR(LHS))
6325 return std::nullopt;
6326
6327 return {{
6328 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
6329 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
6330 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
6331 [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
6332 }};
6333}
6334
// Complex-pattern renderer for a MUBUF scratch access in offen form.
// Rendered operands are rsrc (the scratch resource register), vaddr, soffset
// (always the constant 0) and the immediate offset.
//
// A constant address is split into a V_MOV_B32 of the bits above the maximum
// MUBUF immediate plus the low bits as the immediate. Otherwise a legal
// constant offset is folded into the immediate, and a G_FRAME_INDEX base is
// rendered as a frame index in vaddr.
// NOTE(review): the second line of the first `if` condition is absent from
// this listing.
6336AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
6337 MachineInstr *MI = Root.getParent();
6338 MachineBasicBlock *MBB = MI->getParent();
6339 MachineFunction *MF = MBB->getParent();
6340 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6341
6342 int64_t Offset = 0;
6343 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
6345 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6346
6347 // TODO: Should this be inside the render function? The iterator seems to
6348 // move.
6349 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
6350 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
6351 HighBits)
6352 .addImm(Offset & ~MaxOffset);
6353
6354 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6355 MIB.addReg(Info->getScratchRSrcReg());
6356 },
6357 [=](MachineInstrBuilder &MIB) { // vaddr
6358 MIB.addReg(HighBits);
6359 },
6360 [=](MachineInstrBuilder &MIB) { // soffset
6361 // Use constant zero for soffset and rely on eliminateFrameIndex
6362 // to choose the appropriate frame register if need be.
6363 MIB.addImm(0);
6364 },
6365 [=](MachineInstrBuilder &MIB) { // offset
6366 MIB.addImm(Offset & MaxOffset);
6367 }}};
6368 }
6369
6370 assert(Offset == 0 || Offset == -1);
6371
6372 // Try to fold a frame index directly into the MUBUF vaddr field, and any
6373 // offsets.
6374 std::optional<int> FI;
6375 Register VAddr = Root.getReg();
6376
6377 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6378 Register PtrBase;
6379 int64_t ConstOffset;
6380 std::tie(PtrBase, ConstOffset, std::ignore) =
6381 getPtrBaseWithConstantOffset(VAddr, *MRI);
6382 if (ConstOffset != 0) {
// Fold only when the offset is a legal MUBUF immediate and, on subtargets
// that range-check private memory, the base is known non-negative.
6383 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
6384 (!STI.privateMemoryResourceIsRangeChecked() ||
6385 VT->signBitIsZero(PtrBase))) {
6386 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
6387 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
6388 FI = PtrBaseDef->getOperand(1).getIndex();
6389 else
6390 VAddr = PtrBase;
6391 Offset = ConstOffset;
6392 }
6393 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
6394 FI = RootDef->getOperand(1).getIndex();
6395 }
6396
6397 return {{[=](MachineInstrBuilder &MIB) { // rsrc
6398 MIB.addReg(Info->getScratchRSrcReg());
6399 },
6400 [=](MachineInstrBuilder &MIB) { // vaddr
6401 if (FI)
6402 MIB.addFrameIndex(*FI);
6403 else
6404 MIB.addReg(VAddr);
6405 },
6406 [=](MachineInstrBuilder &MIB) { // soffset
6407 // Use constant zero for soffset and rely on eliminateFrameIndex
6408 // to choose the appropriate frame register if need be.
6409 MIB.addImm(0);
6410 },
6411 [=](MachineInstrBuilder &MIB) { // offset
6412 MIB.addImm(Offset);
6413 }}};
6414}
6415
// Return true if \p Offset may be used as the immediate offset of a DS
// instruction whose base address is \p Base. The offset must fit in 16
// unsigned bits; unless the subtarget has usable DS offsets or unsafe folding
// is enabled, the base must additionally be known non-negative.
6416bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
6417 int64_t Offset) const {
6418 if (!isUInt<16>(Offset))
6419 return false;
6420
6421 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6422 return true;
6423
6424 // On Southern Islands instruction with a negative base value and an offset
6425 // don't seem to work.
6426 return VT->signBitIsZero(Base);
6427}
6428
// Return true if \p Offset0 and \p Offset1 may be used as the two immediate
// offsets of a two-address DS instruction with element size \p Size and base
// \p Base. Both offsets must be multiples of \p Size and, once divided by it,
// fit in 8 unsigned bits; unless the subtarget has usable DS offsets or unsafe
// folding is enabled, the base must additionally be known non-negative.
6429bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
6430 int64_t Offset1,
6431 unsigned Size) const {
6432 if (Offset0 % Size != 0 || Offset1 % Size != 0)
6433 return false;
6434 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
6435 return false;
6436
6437 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
6438 return true;
6439
6440 // On Southern Islands instruction with a negative base value and an offset
6441 // don't seem to work.
6442 return VT->signBitIsZero(Base);
6443}
6444
6445// Return whether the operation has NoUnsignedWrap property.
// A G_OR always qualifies; a G_PTR_ADD qualifies subject to a further
// condition.
// NOTE(review): the line holding that G_PTR_ADD condition is absent from this
// listing.
6446static bool isNoUnsignedWrap(MachineInstr *Addr) {
6447 return Addr->getOpcode() == TargetOpcode::G_OR ||
6448 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
6450}
6451
6452// Check that the base address of flat scratch load/store in the form of `base +
6453// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
6454// requirement). We always treat the first operand as the base address here.
// Returns true when the defining operation cannot wrap, when the subtarget
// accepts signed scratch offsets, when the offset is a suitably small negative
// constant, or when the base is known non-negative.
// NOTE(review): the initializer line of RhsValReg is absent from this listing.
6455bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
6456 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6457
6458 if (isNoUnsignedWrap(AddrMI))
6459 return true;
6460
6461 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6462 // values.
6463 if (STI.hasSignedScratchOffsets())
6464 return true;
6465
6466 Register LHS = AddrMI->getOperand(1).getReg();
6467 Register RHS = AddrMI->getOperand(2).getReg();
6468
6469 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6470 std::optional<ValueAndVReg> RhsValReg =
6472 // If the immediate offset is negative and within certain range, the base
6473 // address cannot also be negative. If the base is also negative, the sum
6474 // would be either negative or much larger than the valid range of scratch
6475 // memory a thread can access.
6476 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
6477 RhsValReg->Value.getSExtValue() > -0x40000000)
6478 return true;
6479 }
6480
6481 return VT->signBitIsZero(LHS);
6482}
6483
6484// Check address value in SGPR/VGPR are legal for flat scratch in the form
6485// of: SGPR + VGPR.
// Returns true when the defining operation cannot wrap, when the subtarget
// accepts signed scratch offsets, or when both operands of the defining
// instruction are known non-negative.
6486bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
6487 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6488
6489 if (isNoUnsignedWrap(AddrMI))
6490 return true;
6491
6492 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6493 // values.
6494 if (STI.hasSignedScratchOffsets())
6495 return true;
6496
6497 Register LHS = AddrMI->getOperand(1).getReg();
6498 Register RHS = AddrMI->getOperand(2).getReg();
6499 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6500}
6501
6502// Check address value in SGPR/VGPR are legal for flat scratch in the form
6503// of: SGPR + VGPR + Imm.
// \p Addr is the outer (base + Imm) address; its first operand is the inner
// SGPR + VGPR sum. Returns true when the subtarget accepts signed scratch
// offsets, when neither addition can wrap (or the immediate is a suitably
// small negative value), or when both operands of the inner sum are known
// non-negative.
// NOTE(review): the initializer lines of BaseDef and RHSOffset are absent
// from this listing.
6504bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
6505 Register Addr) const {
6506 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
6507 // values.
6508 if (STI.hasSignedScratchOffsets())
6509 return true;
6510
6511 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
6512 Register Base = AddrMI->getOperand(1).getReg();
6513 std::optional<DefinitionAndSourceRegister> BaseDef =
6515 std::optional<ValueAndVReg> RHSOffset =
6517 assert(RHSOffset);
6518
6519 // If the immediate offset is negative and within certain range, the base
6520 // address cannot also be negative. If the base is also negative, the sum
6521 // would be either negative or much larger than the valid range of scratch
6522 // memory a thread can access.
6523 if (isNoUnsignedWrap(BaseDef->MI) &&
6524 (isNoUnsignedWrap(AddrMI) ||
6525 (RHSOffset->Value.getSExtValue() < 0 &&
6526 RHSOffset->Value.getSExtValue() > -0x40000000)))
6527 return true;
6528
6529 Register LHS = BaseDef->MI->getOperand(1).getReg();
6530 Register RHS = BaseDef->MI->getOperand(2).getReg();
6531 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS);
6532}
6533
6534bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
6535 unsigned ShAmtBits) const {
6536 assert(MI.getOpcode() == TargetOpcode::G_AND);
6537
6538 std::optional<APInt> RHS =
6539 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
6540 if (!RHS)
6541 return false;
6542
6543 if (RHS->countr_one() >= ShAmtBits)
6544 return true;
6545
6546 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg());
6547 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
6548}
6549
// Produce the {rsrc, soffset, offset} renderers for a MUBUF scratch access
// whose address is \p Root. Three shapes are handled below: a wave address, a
// G_PTR_ADD of a wave address and a legal immediate, and a bare legal
// immediate. An empty result is returned for anything else.
// NOTE(review): the return-type line (listing line 6550) is absent from this
// copy of the file.
6551AMDGPUInstructionSelector::selectMUBUFScratchOffset(
6552 MachineOperand &Root) const {
6553 Register Reg = Root.getReg();
6554 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6555
6556 std::optional<DefinitionAndSourceRegister> Def =
// NOTE(review): the initializer of Def (listing line 6557) is absent from this
// copy of the file.
6558 assert(Def && "this shouldn't be an optional result");
6559 Reg = Def->Reg;
6560
// Case 1: the address itself is a wave address; use its base as soffset with a
// zero immediate offset.
6561 if (Register WaveBase = getWaveAddress(Def->MI)) {
6562 return {{
6563 [=](MachineInstrBuilder &MIB) { // rsrc
6564 MIB.addReg(Info->getScratchRSrcReg());
6565 },
6566 [=](MachineInstrBuilder &MIB) { // soffset
6567 MIB.addReg(WaveBase);
6568 },
6569 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
6570 }};
6571 }
6572
6573 int64_t Offset = 0;
6574
6575 // FIXME: Copy check is a hack
// NOTE(review): the declaration of BasePtr (listing line 6576) and the tail of
// the m_GPtrAdd pattern (listing line 6579) are absent from this copy of the
// file.
6577 if (mi_match(Reg, *MRI,
6578 m_GPtrAdd(m_Reg(BasePtr),
// Case 2: pointer add whose base must be a wave address and whose offset must
// be a legal MUBUF immediate; otherwise selection is rejected.
6580 if (!TII.isLegalMUBUFImmOffset(Offset))
6581 return {};
6582 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
6583 Register WaveBase = getWaveAddress(BasePtrDef);
6584 if (!WaveBase)
6585 return {};
6586
6587 return {{
6588 [=](MachineInstrBuilder &MIB) { // rsrc
6589 MIB.addReg(Info->getScratchRSrcReg());
6590 },
6591 [=](MachineInstrBuilder &MIB) { // soffset
6592 MIB.addReg(WaveBase);
6593 },
6594 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6595 }};
6596 }
6597
// Case 3: the whole address is a constant that fits the MUBUF immediate field;
// soffset is rendered as the immediate 0.
6598 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
6599 !TII.isLegalMUBUFImmOffset(Offset))
6600 return {};
6601
6602 return {{
6603 [=](MachineInstrBuilder &MIB) { // rsrc
6604 MIB.addReg(Info->getScratchRSrcReg());
6605 },
6606 [=](MachineInstrBuilder &MIB) { // soffset
6607 MIB.addImm(0);
6608 },
6609 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
6610 }};
6611}
6612
6613std::pair<Register, unsigned>
6614AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
6615 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6616 int64_t ConstAddr = 0;
6617
6618 Register PtrBase;
6619 int64_t Offset;
6620 std::tie(PtrBase, Offset, std::ignore) =
6621 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6622
6623 if (Offset) {
6624 if (isDSOffsetLegal(PtrBase, Offset)) {
6625 // (add n0, c0)
6626 return std::pair(PtrBase, Offset);
6627 }
6628 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6629 // TODO
6630
6631
6632 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6633 // TODO
6634
6635 }
6636
6637 return std::pair(Root.getReg(), 0);
6638}
6639
// Complex-pattern entry point: renders the {base register, immediate offset}
// pair computed by selectDS1Addr1OffsetImpl for \p Root.
// NOTE(review): the return-type line (listing line 6640) is absent from this
// copy of the file.
6641AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
6642 Register Reg;
6643 unsigned Offset;
6644 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
6645 return {{
6646 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6647 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
6648 }};
6649}
6650
// Forwards to selectDSReadWrite2 with an element size of 4.
// NOTE(review): the return-type line (listing line 6651) is absent from this
// copy of the file.
6652AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
6653 return selectDSReadWrite2(Root, 4);
6654}
6655
// Forwards to selectDSReadWrite2 with an element size of 8.
// NOTE(review): the return-type line (listing line 6656) is absent from this
// copy of the file.
6657AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
6658 return selectDSReadWrite2(Root, 8);
6659}
6660
// Renders three operands for \p Root: the base register, the offset computed
// by selectDSReadWrite2Impl, and that offset plus one.
// NOTE(review): the return-type line (listing line 6661) is absent from this
// copy of the file.
6662AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
6663 unsigned Size) const {
6664 Register Reg;
6665 unsigned Offset;
6666 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
6667 return {{
6668 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
6669 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
6670 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
6671 }};
6672}
6673
6674std::pair<Register, unsigned>
6675AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
6676 unsigned Size) const {
6677 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
6678 int64_t ConstAddr = 0;
6679
6680 Register PtrBase;
6681 int64_t Offset;
6682 std::tie(PtrBase, Offset, std::ignore) =
6683 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
6684
6685 if (Offset) {
6686 int64_t OffsetValue0 = Offset;
6687 int64_t OffsetValue1 = Offset + Size;
6688 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
6689 // (add n0, c0)
6690 return std::pair(PtrBase, OffsetValue0 / Size);
6691 }
6692 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
6693 // TODO
6694
6695 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
6696 // TODO
6697
6698 }
6699
6700 return std::pair(Root.getReg(), 0);
6701}
6702
6703/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
6704/// the base value with the constant offset, and if the offset computation is
6705/// known to be inbounds. There may be intervening copies between \p Root and
6706/// the identified constant. Returns \p Root, 0, false if this does not match
6707/// the pattern.
6708std::tuple<Register, int64_t, bool>
6709AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
6710 Register Root, const MachineRegisterInfo &MRI) const {
6711 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
6712 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
6713 return {Root, 0, false};
6714
6715 MachineOperand &RHS = RootI->getOperand(2);
6716 std::optional<ValueAndVReg> MaybeOffset =
// NOTE(review): the initializer of MaybeOffset (listing line 6717) is absent
// from this copy of the file; restore it from the upstream source.
6718 if (!MaybeOffset)
6719 return {Root, 0, false};
// The inbounds result is read from the InBounds flag on the G_PTR_ADD itself;
// the offset is returned sign-extended.
6720 bool IsInBounds = RootI->getFlag(MachineInstr::MIFlag::InBounds);
6721 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue(),
6722 IsInBounds};
6723}
6724
// Body of a helper that appends an immediate 0 operand to MIB.
// NOTE(review): the signature line (listing line 6725) is absent from this
// copy of the file; presumably this is the addZeroImm helper used by the
// MUBUF renderers - confirm against the upstream source.
6726 MIB.addImm(0);
6727}
6728
6729/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
6730/// BasePtr is not valid, a null base pointer will be used.
///
/// The returned 128-bit register is assembled as {sub0_sub1 = base pointer,
/// sub2 = \p FormatLo, sub3 = \p FormatHi}.
// NOTE(review): the first line of the signature (listing line 6731) is absent
// from this copy of the file.
6732 uint32_t FormatLo, uint32_t FormatHi,
6733 Register BasePtr) {
6734 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6735 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6736 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6737 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6738
6739 B.buildInstr(AMDGPU::S_MOV_B32)
6740 .addDef(RSrc2)
6741 .addImm(FormatLo);
6742 B.buildInstr(AMDGPU::S_MOV_B32)
6743 .addDef(RSrc3)
6744 .addImm(FormatHi);
6745
6746 // Build the half of the subregister with the constants before building the
6747 // full 128-bit register. If we are building multiple resource descriptors,
6748 // this will allow CSEing of the 2-component register.
6749 B.buildInstr(AMDGPU::REG_SEQUENCE)
6750 .addDef(RSrcHi)
6751 .addReg(RSrc2)
6752 .addImm(AMDGPU::sub0)
6753 .addReg(RSrc3)
6754 .addImm(AMDGPU::sub1);
6755
// Materialize a zero 64-bit base when the caller did not supply one.
6756 Register RSrcLo = BasePtr;
6757 if (!BasePtr) {
6758 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6759 B.buildInstr(AMDGPU::S_MOV_B64)
6760 .addDef(RSrcLo)
6761 .addImm(0);
6762 }
6763
6764 B.buildInstr(AMDGPU::REG_SEQUENCE)
6765 .addDef(RSrc)
6766 .addReg(RSrcLo)
6767 .addImm(AMDGPU::sub0_sub1)
6768 .addReg(RSrcHi)
6769 .addImm(AMDGPU::sub2_sub3);
6770
6771 return RSrc;
6772}
6773
// Builds a resource descriptor via buildRSRC using 0 for the low format word
// and the high 32 bits of the default resource data format for the high word.
// NOTE(review): the first line of the signature (listing line 6774) is absent
// from this copy of the file.
6775 const SIInstrInfo &TII, Register BasePtr) {
6776 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6777
6778 // FIXME: Why are half the "default" bits ignored based on the addressing
6779 // mode?
6780 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
6781}
6782
// Builds a resource descriptor via buildRSRC using -1 (all ones once converted
// to uint32_t) for the low format word and the high 32 bits of the default
// resource data format for the high word.
// NOTE(review): the first line of the signature (listing line 6783) is absent
// from this copy of the file.
6784 const SIInstrInfo &TII, Register BasePtr) {
6785 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
6786
6787 // FIXME: Why are half the "default" bits ignored based on the addressing
6788 // mode?
6789 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
6790}
6791
6792AMDGPUInstructionSelector::MUBUFAddressData
6793AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
6794 MUBUFAddressData Data;
6795 Data.N0 = Src;
6796
6797 Register PtrBase;
6798 int64_t Offset;
6799
6800 std::tie(PtrBase, Offset, std::ignore) =
6801 getPtrBaseWithConstantOffset(Src, *MRI);
6802 if (isUInt<32>(Offset)) {
6803 Data.N0 = PtrBase;
6804 Data.Offset = Offset;
6805 }
6806
6807 if (MachineInstr *InputAdd
6808 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
6809 Data.N2 = InputAdd->getOperand(1).getReg();
6810 Data.N3 = InputAdd->getOperand(2).getReg();
6811
6812 // FIXME: Need to fix extra SGPR->VGPRcopies inserted
6813 // FIXME: Don't know this was defined by operand 0
6814 //
6815 // TODO: Remove this when we have copy folding optimizations after
6816 // RegBankSelect.
6817 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
6818 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
6819 }
6820
6821 return Data;
6822}
6823
6824/// Return if the addr64 mubuf mode should be used for the given address.
6825bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
6826 // (ptr_add N2, N3) -> addr64, or
6827 // (ptr_add (ptr_add N2, N3), C1) -> addr64
6828 if (Addr.N2)
6829 return true;
6830
6831 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
6832 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
6833}
6834
6835/// Split an immediate offset \p ImmOffset depending on whether it fits in the
6836/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
6837/// component.
6838void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
6839 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
6840 if (TII.isLegalMUBUFImmOffset(ImmOffset))
6841 return;
6842
6843 // Illegal offset, store it in soffset.
6844 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6845 B.buildInstr(AMDGPU::S_MOV_B32)
6846 .addDef(SOffset)
6847 .addImm(ImmOffset);
6848 ImmOffset = 0;
6849}
6850
6851bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
6852 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
6853 Register &SOffset, int64_t &Offset) const {
6854 // FIXME: Predicates should stop this from reaching here.
6855 // addr64 bit was removed for volcanic islands.
6856 if (!STI.hasAddr64() || STI.useFlatForGlobal())
6857 return false;
6858
6859 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6860 if (!shouldUseAddr64(AddrData))
6861 return false;
6862
6863 Register N0 = AddrData.N0;
6864 Register N2 = AddrData.N2;
6865 Register N3 = AddrData.N3;
6866 Offset = AddrData.Offset;
6867
6868 // Base pointer for the SRD.
6869 Register SRDPtr;
6870
6871 if (N2) {
6872 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6873 assert(N3);
6874 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6875 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
6876 // addr64, and construct the default resource from a 0 address.
6877 VAddr = N0;
6878 } else {
6879 SRDPtr = N3;
6880 VAddr = N2;
6881 }
6882 } else {
6883 // N2 is not divergent.
6884 SRDPtr = N2;
6885 VAddr = N3;
6886 }
6887 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
6888 // Use the default null pointer in the resource
6889 VAddr = N0;
6890 } else {
6891 // N0 -> offset, or
6892 // (N0 + C1) -> offset
6893 SRDPtr = N0;
6894 }
6895
6896 MachineIRBuilder B(*Root.getParent());
6897 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
6898 splitIllegalMUBUFOffset(B, SOffset, Offset);
6899 return true;
6900}
6901
6902bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
6903 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
6904 int64_t &Offset) const {
6905
6906 // FIXME: Pattern should not reach here.
6907 if (STI.useFlatForGlobal())
6908 return false;
6909
6910 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
6911 if (shouldUseAddr64(AddrData))
6912 return false;
6913
6914 // N0 -> offset, or
6915 // (N0 + C1) -> offset
6916 Register SRDPtr = AddrData.N0;
6917 Offset = AddrData.Offset;
6918
6919 // TODO: Look through extensions for 32-bit soffset.
6920 MachineIRBuilder B(*Root.getParent());
6921
6922 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
6923 splitIllegalMUBUFOffset(B, SOffset, Offset);
6924 return true;
6925}
6926
// Complex-pattern entry point for the addr64 MUBUF form. Delegates the address
// analysis to selectMUBUFAddr64Impl and renders, in order: rsrc, vaddr,
// soffset, offset, then three zero immediates (cpol, tfe, swz).
// NOTE(review): the return-type line (listing line 6927) is absent from this
// copy of the file.
6928AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
6929 Register VAddr;
6930 Register RSrcReg;
6931 Register SOffset;
6932 int64_t Offset = 0;
6933
6934 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
6935 return {};
6936
6937 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
6938 // pattern.
6939 return {{
6940 [=](MachineInstrBuilder &MIB) { // rsrc
6941 MIB.addReg(RSrcReg);
6942 },
6943 [=](MachineInstrBuilder &MIB) { // vaddr
6944 MIB.addReg(VAddr);
6945 },
// soffset: use the register when one was created; otherwise SGPR_NULL on
// subtargets with restricted soffset, or an immediate 0.
6946 [=](MachineInstrBuilder &MIB) { // soffset
6947 if (SOffset)
6948 MIB.addReg(SOffset);
6949 else if (STI.hasRestrictedSOffset())
6950 MIB.addReg(AMDGPU::SGPR_NULL);
6951 else
6952 MIB.addImm(0);
6953 },
6954 [=](MachineInstrBuilder &MIB) { // offset
6955 MIB.addImm(Offset);
6956 },
6957 addZeroImm, // cpol
6958 addZeroImm, // tfe
6959 addZeroImm // swz
6960 }};
6961}
6962
// Complex-pattern entry point for the offset-only MUBUF form. Delegates the
// address analysis to selectMUBUFOffsetImpl and renders, in order: rsrc,
// soffset, offset, then three zero immediates (cpol, tfe, swz).
// NOTE(review): the return-type line (listing line 6963) is absent from this
// copy of the file.
6964AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
6965 Register RSrcReg;
6966 Register SOffset;
6967 int64_t Offset = 0;
6968
6969 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
6970 return {};
6971
6972 return {{
6973 [=](MachineInstrBuilder &MIB) { // rsrc
6974 MIB.addReg(RSrcReg);
6975 },
// soffset: use the register when one was created; otherwise SGPR_NULL on
// subtargets with restricted soffset, or an immediate 0.
6976 [=](MachineInstrBuilder &MIB) { // soffset
6977 if (SOffset)
6978 MIB.addReg(SOffset);
6979 else if (STI.hasRestrictedSOffset())
6980 MIB.addReg(AMDGPU::SGPR_NULL);
6981 else
6982 MIB.addImm(0);
6983 },
6984 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
6985 addZeroImm, // cpol
6986 addZeroImm, // tfe
6987 addZeroImm, // swz
6988 }};
6989}
6990
// Renders the soffset operand for \p Root. On subtargets with restricted
// soffset, a root that matches constant zero is replaced by SGPR_NULL.
// NOTE(review): the return-type line (listing line 6991) is absent from this
// copy of the file.
6992AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
6993
6994 Register SOffset = Root.getReg();
6995
6996 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
6997 SOffset = AMDGPU::SGPR_NULL;
6998
6999 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
7000}
7001
7002/// Get an immediate that must be 32-bits, and treated as zero extended.
///
/// Returns std::nullopt when the register is not a known constant or when the
/// sign-extended constant does not fit in a signed 32-bit integer; otherwise
/// returns the low 32 bits of the constant.
7003static std::optional<uint64_t>
// NOTE(review): the declarator line with the function name and parameters
// (listing line 7004) is absent from this copy of the file.
7005 // getIConstantVRegVal sexts any values, so see if that matters.
7006 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
7007 if (!OffsetVal || !isInt<32>(*OffsetVal))
7008 return std::nullopt;
7009 return Lo_32(*OffsetVal);
7010}
7011
// Renders the encoded SMRD buffer offset for \p Root, which may be either an
// immediate operand or a register holding a constant. Returns an empty result
// when no constant is available or the offset cannot be encoded.
// NOTE(review): the return-type line (listing line 7012) is absent from this
// copy of the file.
7013AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
7014 std::optional<uint64_t> OffsetVal =
7015 Root.isImm() ? Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI);
7016 if (!OffsetVal)
7017 return {};
7018
7019 std::optional<int64_t> EncodedImm =
7020 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
7021 if (!EncodedImm)
7022 return {};
7023
7024 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7025}
7026
// Renders an encoded SMRD buffer offset for a constant register \p Root.
// Asserts that the subtarget generation is SEA_ISLANDS.
// NOTE(review): the return-type line (listing line 7027) is absent from this
// copy of the file.
7028AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
7029 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
7030
7031 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
7032 if (!OffsetVal)
7033 return {};
7034
7035 std::optional<int64_t> EncodedImm =
// NOTE(review): the initializer of EncodedImm (listing line 7036) is absent
// from this copy of the file; restore it from the upstream source.
7037 if (!EncodedImm)
7038 return {};
7039
7040 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
7041}
7042
// Renders a {register, encoded immediate} pair for an SMRD buffer offset.
// Returns std::nullopt when no base register is found or when the constant
// part cannot be encoded.
// NOTE(review): the return-type line (listing line 7043) is absent from this
// copy of the file.
7044AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
7045 // Match the (soffset + offset) pair as a 32-bit register base and
7046 // an immediate offset.
7047 Register SOffset;
7048 unsigned Offset;
7049 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
7050 *MRI, Root.getReg(), VT, /*CheckNUW*/ true);
7051 if (!SOffset)
7052 return std::nullopt;
7053
7054 std::optional<int64_t> EncodedOffset =
7055 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
7056 if (!EncodedOffset)
7057 return std::nullopt;
7058
7059 assert(MRI->getType(SOffset) == LLT::scalar(32));
7060 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
7061 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
7062}
7063
7064std::pair<Register, unsigned>
7065AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
7066 bool &Matched) const {
7067 Matched = false;
7068
7069 Register Src;
7070 unsigned Mods;
7071 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
7072
7073 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
7074 assert(MRI->getType(Src) == LLT::scalar(16));
7075
7076 // Only change Src if src modifier could be gained. In such cases new Src
7077 // could be sgpr but this does not violate constant bus restriction for
7078 // instruction that is being selected.
7079 Src = stripBitCast(Src, *MRI);
7080
7081 const auto CheckAbsNeg = [&]() {
7082 // Be careful about folding modifiers if we already have an abs. fneg is
7083 // applied last, so we don't want to apply an earlier fneg.
7084 if ((Mods & SISrcMods::ABS) == 0) {
7085 unsigned ModsTmp;
7086 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
7087
7088 if ((ModsTmp & SISrcMods::NEG) != 0)
7089 Mods ^= SISrcMods::NEG;
7090
7091 if ((ModsTmp & SISrcMods::ABS) != 0)
7092 Mods |= SISrcMods::ABS;
7093 }
7094 };
7095
7096 CheckAbsNeg();
7097
7098 // op_sel/op_sel_hi decide the source type and source.
7099 // If the source's op_sel_hi is set, it indicates to do a conversion from
7100 // fp16. If the sources's op_sel is set, it picks the high half of the
7101 // source register.
7102
7103 Mods |= SISrcMods::OP_SEL_1;
7104
7105 if (isExtractHiElt(*MRI, Src, Src)) {
7106 Mods |= SISrcMods::OP_SEL_0;
7107 CheckAbsNeg();
7108 }
7109
7110 Matched = true;
7111 }
7112
7113 return {Src, Mods};
7114}
7115
// Renders {source register, source modifiers} for a mad-mix operand, but only
// when selectVOP3PMadMixModsImpl reports a match; otherwise returns an empty
// result.
// NOTE(review): the return-type line (listing line 7116) is absent from this
// copy of the file.
7117AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
7118 MachineOperand &Root) const {
7119 Register Src;
7120 unsigned Mods;
7121 bool Matched;
7122 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7123 if (!Matched)
7124 return {};
7125
7126 return {{
7127 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7128 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7129 }};
7130}
7131
// Renders {source register, source modifiers} for a mad-mix operand. Unlike
// selectVOP3PMadMixModsExt, the Matched flag is ignored and a result is always
// produced.
// NOTE(review): the return-type line (listing line 7132) is absent from this
// copy of the file.
7133AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
7134 Register Src;
7135 unsigned Mods;
7136 bool Matched;
7137 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
7138
7139 return {{
7140 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
7141 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
7142 }};
7143}
7144
7145bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
7146 MachineInstr &I, Intrinsic::ID IntrID) const {
7147 MachineBasicBlock *MBB = I.getParent();
7148 const DebugLoc &DL = I.getDebugLoc();
7149 Register CCReg = I.getOperand(0).getReg();
7150
7151 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7152 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
7153
7154 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
7155 .addImm(I.getOperand(2).getImm());
7156
7157 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
7158
7159 I.eraseFromParent();
7160 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
7161 *MRI);
7162}
7163
7164bool AMDGPUInstructionSelector::selectSGetBarrierState(
7165 MachineInstr &I, Intrinsic::ID IntrID) const {
7166 MachineBasicBlock *MBB = I.getParent();
7167 const DebugLoc &DL = I.getDebugLoc();
7168 const MachineOperand &BarOp = I.getOperand(2);
7169 std::optional<int64_t> BarValImm =
7170 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7171
7172 if (!BarValImm) {
7173 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7174 .addReg(BarOp.getReg());
7175 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7176 }
7177 MachineInstrBuilder MIB;
7178 unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
7179 : AMDGPU::S_GET_BARRIER_STATE_M0;
7180 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7181
7182 auto DstReg = I.getOperand(0).getReg();
7183 const TargetRegisterClass *DstRC =
7184 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7185 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7186 return false;
7187 MIB.addDef(DstReg);
7188 if (BarValImm) {
7189 MIB.addImm(*BarValImm);
7190 }
7191 I.eraseFromParent();
7192 return true;
7193}
7194
7195unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
7196 if (HasInlineConst) {
7197 switch (IntrID) {
7198 default:
7199 llvm_unreachable("not a named barrier op");
7200 case Intrinsic::amdgcn_s_barrier_join:
7201 return AMDGPU::S_BARRIER_JOIN_IMM;
7202 case Intrinsic::amdgcn_s_wakeup_barrier:
7203 return AMDGPU::S_WAKEUP_BARRIER_IMM;
7204 case Intrinsic::amdgcn_s_get_named_barrier_state:
7205 return AMDGPU::S_GET_BARRIER_STATE_IMM;
7206 };
7207 } else {
7208 switch (IntrID) {
7209 default:
7210 llvm_unreachable("not a named barrier op");
7211 case Intrinsic::amdgcn_s_barrier_join:
7212 return AMDGPU::S_BARRIER_JOIN_M0;
7213 case Intrinsic::amdgcn_s_wakeup_barrier:
7214 return AMDGPU::S_WAKEUP_BARRIER_M0;
7215 case Intrinsic::amdgcn_s_get_named_barrier_state:
7216 return AMDGPU::S_GET_BARRIER_STATE_M0;
7217 };
7218 }
7219}
7220
7221bool AMDGPUInstructionSelector::selectNamedBarrierInit(
7222 MachineInstr &I, Intrinsic::ID IntrID) const {
7223 MachineBasicBlock *MBB = I.getParent();
7224 const DebugLoc &DL = I.getDebugLoc();
7225 const MachineOperand &BarOp = I.getOperand(1);
7226 const MachineOperand &CntOp = I.getOperand(2);
7227
7228 // A member count of 0 means "keep existing member count". That plus a known
7229 // constant value for the barrier ID lets us use the immarg form.
7230 if (IntrID == Intrinsic::amdgcn_s_barrier_signal_var) {
7231 std::optional<int64_t> CntImm =
7232 getIConstantVRegSExtVal(CntOp.getReg(), *MRI);
7233 if (CntImm && *CntImm == 0) {
7234 std::optional<int64_t> BarValImm =
7235 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7236 if (BarValImm) {
7237 auto BarID = ((*BarValImm) >> 4) & 0x3F;
7238 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
7239 .addImm(BarID);
7240 I.eraseFromParent();
7241 return true;
7242 }
7243 }
7244 }
7245
7246 // BarID = (BarOp >> 4) & 0x3F
7247 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7248 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7249 .add(BarOp)
7250 .addImm(4u)
7251 .setOperandDead(3); // Dead scc
7252
7253 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7254 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7255 .addReg(TmpReg0)
7256 .addImm(0x3F)
7257 .setOperandDead(3); // Dead scc
7258
7259 // MO = ((CntOp & 0x3F) << shAmt) | BarID
7260 Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7261 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
7262 .add(CntOp)
7263 .addImm(0x3F)
7264 .setOperandDead(3); // Dead scc
7265
7266 Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7267 constexpr unsigned ShAmt = 16;
7268 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
7269 .addReg(TmpReg2)
7270 .addImm(ShAmt)
7271 .setOperandDead(3); // Dead scc
7272
7273 Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7274 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
7275 .addReg(TmpReg1)
7276 .addReg(TmpReg3)
7277 .setOperandDead(3); // Dead scc;
7278
7279 auto CopyMIB =
7280 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
7281 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7282
7283 unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
7284 ? AMDGPU::S_BARRIER_INIT_M0
7285 : AMDGPU::S_BARRIER_SIGNAL_M0;
7286 MachineInstrBuilder MIB;
7287 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7288
7289 I.eraseFromParent();
7290 return true;
7291}
7292
7293bool AMDGPUInstructionSelector::selectNamedBarrierInst(
7294 MachineInstr &I, Intrinsic::ID IntrID) const {
7295 MachineBasicBlock *MBB = I.getParent();
7296 const DebugLoc &DL = I.getDebugLoc();
7297 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
7298 ? I.getOperand(2)
7299 : I.getOperand(1);
7300 std::optional<int64_t> BarValImm =
7301 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
7302
7303 if (!BarValImm) {
7304 // BarID = (BarOp >> 4) & 0x3F
7305 Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7306 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
7307 .addReg(BarOp.getReg())
7308 .addImm(4u)
7309 .setOperandDead(3); // Dead scc;
7310
7311 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
7312 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
7313 .addReg(TmpReg0)
7314 .addImm(0x3F)
7315 .setOperandDead(3); // Dead scc;
7316
7317 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
7318 .addReg(TmpReg1);
7319 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
7320 }
7321
7322 MachineInstrBuilder MIB;
7323 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
7324 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
7325
7326 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
7327 auto DstReg = I.getOperand(0).getReg();
7328 const TargetRegisterClass *DstRC =
7329 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
7330 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
7331 return false;
7332 MIB.addDef(DstReg);
7333 }
7334
7335 if (BarValImm) {
7336 auto BarId = ((*BarValImm) >> 4) & 0x3F;
7337 MIB.addImm(BarId);
7338 }
7339
7340 I.eraseFromParent();
7341 return true;
7342}
7343
7344void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
7345 const MachineInstr &MI,
7346 int OpIdx) const {
7347 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7348 "Expected G_CONSTANT");
7349 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
7350}
7351
7352void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
7353 const MachineInstr &MI,
7354 int OpIdx) const {
7355 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7356 "Expected G_CONSTANT");
7357 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
7358}
7359
7360void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
7361 const MachineInstr &MI,
7362 int OpIdx) const {
7363 const MachineOperand &Op = MI.getOperand(1);
7364 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
7365 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
7366}
7367
7368void AMDGPUInstructionSelector::renderCountTrailingOnesImm(
7369 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7370 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7371 "Expected G_CONSTANT");
7372 MIB.addImm(MI.getOperand(1).getCImm()->getValue().countTrailingOnes());
7373}
7374
7375/// This only really exists to satisfy DAG type checking machinery, so is a
7376/// no-op here.
7377void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
7378 const MachineInstr &MI,
7379 int OpIdx) const {
7380 const MachineOperand &Op = MI.getOperand(OpIdx);
7381 int64_t Imm;
7382 if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
7383 MIB.addImm(Imm);
7384 else
7385 MIB.addImm(Op.getImm());
7386}
7387
7388void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
7389 const MachineInstr &MI,
7390 int OpIdx) const {
7391 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
7392}
7393
7394void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
7395 const MachineInstr &MI,
7396 int OpIdx) const {
7397 assert(OpIdx >= 0 && "expected to match an immediate operand");
7398 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7399}
7400
7401void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
7402 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7403 assert(OpIdx >= 0 && "expected to match an immediate operand");
7404 MIB.addImm(
7405 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7406}
7407
// Renders an op_sel modifier immediate chosen by bit 0 of the immediate at
// operand OpIdx; when the bit is clear the value is SISrcMods::DST_OP_SEL.
7408void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
7409 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7410 assert(OpIdx >= 0 && "expected to match an immediate operand");
7411 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
// NOTE(review): the true arm of this conditional (listing line 7412) is absent
// from this copy of the file; restore it from the upstream source.
7413 : (int64_t)SISrcMods::DST_OP_SEL);
7414}
7415
7416void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
7417 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7418 assert(OpIdx >= 0 && "expected to match an immediate operand");
7419 MIB.addImm(
7420 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7421}
7422
7423void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
7424 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7425 assert(OpIdx >= 0 && "expected to match an immediate operand");
7426 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7427 ? (int64_t)(SISrcMods::OP_SEL_0)
7428 : 0);
7429}
7430
7431void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
7432 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7433 assert(OpIdx >= 0 && "expected to match an immediate operand");
7434 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
7435 : 0);
7436}
7437
7438void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
7439 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7440 assert(OpIdx >= 0 && "expected to match an immediate operand");
7441 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
7442 : 0);
7443}
7444
7445void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
7446 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7447 assert(OpIdx >= 0 && "expected to match an immediate operand");
7448 MIB.addImm(
7449 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7450}
7451
7452void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
7453 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7454 assert(OpIdx >= 0 && "expected to match an immediate operand");
7455 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
7456 ? (int64_t)SISrcMods::DST_OP_SEL
7457 : 0);
7458}
7459
// Renders the immediate at operand OpIdx of MI after masking it.
7460void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
7461 const MachineInstr &MI,
7462 int OpIdx) const {
7463 assert(OpIdx >= 0 && "expected to match an immediate operand");
7464 MIB.addImm(MI.getOperand(OpIdx).getImm() &
// NOTE(review): the mask expression (listing lines 7465-7466) is absent from
// this copy of the file; restore it from the upstream source.
7467}
7468
// Renders a boolean immediate derived from masking the immediate at operand
// OpIdx of MI.
7469void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
7470 const MachineInstr &MI,
7471 int OpIdx) const {
7472 assert(OpIdx >= 0 && "expected to match an immediate operand");
7473 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
// NOTE(review): the mask expression (listing lines 7474-7475) is absent from
// this copy of the file; restore it from the upstream source.
7476 MIB.addImm(Swizzle);
7477}
7478
// Renders the masked immediate at operand OpIdx of MI with the
// AMDGPU::CPol::GLC bit additionally set.
7479void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
7480 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7481 assert(OpIdx >= 0 && "expected to match an immediate operand");
7482 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
// NOTE(review): the mask expression (listing lines 7483-7484) is absent from
// this copy of the file; restore it from the upstream source.
7485 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
7486}
7487
7488void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
7489 const MachineInstr &MI,
7490 int OpIdx) const {
7491 MIB.addFrameIndex(MI.getOperand(1).getIndex());
7492}
7493
7494void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
7495 const MachineInstr &MI,
7496 int OpIdx) const {
7497 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
7498 int ExpVal = APF.getExactLog2Abs();
7499 assert(ExpVal != INT_MIN);
7500 MIB.addImm(ExpVal);
7501}
7502
7503void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
7504 const MachineInstr &MI,
7505 int OpIdx) const {
7506 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
7507 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
7508 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
7509 // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
7510 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
7511}
7512
7513void AMDGPUInstructionSelector::renderVOP3PModsNeg(MachineInstrBuilder &MIB,
7514 const MachineInstr &MI,
7515 int OpIdx) const {
7516 unsigned Mods = SISrcMods::OP_SEL_1;
7517 if (MI.getOperand(OpIdx).getImm())
7518 Mods ^= SISrcMods::NEG;
7519 MIB.addImm((int64_t)Mods);
7520}
7521
// Renders VOP3P source modifiers starting from OP_SEL_1; an additional
// adjustment is applied when the immediate at operand OpIdx is non-zero.
7522void AMDGPUInstructionSelector::renderVOP3PModsNegs(MachineInstrBuilder &MIB,
7523 const MachineInstr &MI,
7524 int OpIdx) const {
7525 unsigned Mods = SISrcMods::OP_SEL_1;
7526 if (MI.getOperand(OpIdx).getImm())
// NOTE(review): the statement guarded by this `if` (listing line 7527) is
// absent from this copy of the file; as written the `if` would guard the
// addImm call below. Restore it from the upstream source.
7528 MIB.addImm((int64_t)Mods);
7529}
7530
7531void AMDGPUInstructionSelector::renderVOP3PModsNegAbs(MachineInstrBuilder &MIB,
7532 const MachineInstr &MI,
7533 int OpIdx) const {
7534 unsigned Val = MI.getOperand(OpIdx).getImm();
7535 unsigned Mods = SISrcMods::OP_SEL_1; // default: none
7536 if (Val == 1) // neg
7537 Mods ^= SISrcMods::NEG;
7538 if (Val == 2) // abs
7539 Mods ^= SISrcMods::ABS;
7540 if (Val == 3) // neg and abs
7541 Mods ^= (SISrcMods::NEG | SISrcMods::ABS);
7542 MIB.addImm((int64_t)Mods);
7543}
7544
// Renders a prefetch scope immediate derived from operand 2 of MI. Without
// safe CU prefetch support the value is raised to at least
// AMDGPU::CPol::SCOPE_SE.
7545void AMDGPUInstructionSelector::renderPrefetchLoc(MachineInstrBuilder &MIB,
7546 const MachineInstr &MI,
7547 int OpIdx) const {
7548 uint32_t V = MI.getOperand(2).getImm();
// NOTE(review): listing lines 7549-7550 are absent from this copy of the file;
// they presumably transform V before the check below - restore them from the
// upstream source.
7551 if (!Subtarget->hasSafeCUPrefetch())
7552 V = std::max(V, (uint32_t)AMDGPU::CPol::SCOPE_SE); // CU scope is unsafe
7553 MIB.addImm(V);
7554}
7555
7556/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
7557void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
7558 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7559 unsigned Val = MI.getOperand(OpIdx).getImm();
7560 unsigned New = 0;
// NOTE(review): the statements guarded by the two `if`s below (listing lines
// 7562 and 7564) are absent from this copy of the file; restore them from the
// upstream source.
7561 if (Val & 0x1)
7563 if (Val & 0x2)
7565 MIB.addImm(New);
7566}
7567
7568bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
7569 return TII.isInlineConstant(Imm);
7570}
7571
7572bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
7573 return TII.isInlineConstant(Imm);
7574}
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
#define GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static Register getLegalRegBank(Register NewReg, Register RootReg, const AMDGPURegisterBankInfo &RBI, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const SIInstrInfo &TII)
static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is shift left with half bits, such as reg0:2n =G_SHL reg1:2n, CONST(n)
static bool isNoUnsignedWrap(MachineInstr *Addr)
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID)
static bool checkRB(Register Reg, unsigned int RBNo, const AMDGPURegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI)
static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods)
static bool isTruncHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is truncating to half, such as reg0:n = G_TRUNC reg1:2n
static Register getWaveAddress(const MachineInstr *Def)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static bool shouldUseAndMask(unsigned Size, unsigned &Mask)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static TypeClass isVectorOfTwoOrScalar(Register Reg, const MachineRegisterInfo &MRI)
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, MachineBasicBlock *MBB)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void addZeroImm(MachineInstrBuilder &MIB)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static bool isConstant(const MachineInstr &MI)
static bool isSameBitWidth(Register Reg1, Register Reg2, const MachineRegisterInfo &MRI)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void diagnoseUnsupportedIntrinsic(const MachineInstr &I)
static std::pair< Register, unsigned > computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, Register IdxReg, unsigned EltSize, GISelValueTracking &ValueTracking)
Return the register to use for the index value, and the subregister to use for the indirectly accesse...
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
static std::pair< Register, SrcStatus > getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI)
static std::optional< std::pair< Register, SrcStatus > > calcNextStatus(std::pair< Register, SrcStatus > Curr, const MachineRegisterInfo &MRI)
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI)
static std::optional< uint64_t > getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI)
Get an immediate that must be 32-bits, and treated as zero extended.
static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, Register RootReg, const SIInstrInfo &TII, const MachineRegisterInfo &MRI)
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, const GCNSubtarget &ST)
static SmallVector< std::pair< Register, SrcStatus > > getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, int MaxDepth=3)
static bool isUnmergeHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test function, if the MI is reg0:n, reg1:n = G_UNMERGE_VALUES reg2:2n
static SrcStatus getNegStatus(Register Reg, SrcStatus S, const MachineRegisterInfo &MRI)
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI)
Test if the MI is logic shift right with half bits, such as reg0:2n =G_LSHR reg1:2n,...
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
This file declares the targeting of the InstructionSelector class for AMDGPU.
constexpr LLT S1
constexpr LLT S32
AMDGPU Register Bank Select
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool isAllZeros(StringRef Arr)
Return true if the array is empty or all zeros.
dxil translate DXIL Translate Metadata
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
#define P(N)
static std::vector< std::pair< int, unsigned > > Swizzle(std::vector< std::pair< int, unsigned > > Src, R600InstrInfo::BankSwizzle Swz)
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
This is used to control valid status that current MI supports.
bool checkOptions(SrcStatus Stat) const
SearchOptions(Register Reg, const MachineRegisterInfo &MRI)
AMDGPUInstructionSelector(const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, const AMDGPUTargetMachine &TM)
static const char * getName()
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
void setupMF(MachineFunction &MF, GISelValueTracking *VT, CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override
Setup per-MF executor state.
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1594
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ FCMP_TRUE
1 1 1 1 Always true (always folded)
Definition InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:754
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:752
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:742
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
bool isFPPredicate() const
Definition InstrTypes.h:845
bool isIntPredicate() const
Definition InstrTypes.h:846
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
LLVM_ABI DILocation * get() const
Get the underlying DILocation.
Definition DebugLoc.cpp:48
Diagnostic information for unsupported feature in backend.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
virtual void setupMF(MachineFunction &mf, GISelValueTracking *vt, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool hasValue() const
TypeSize getValue() const
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void setReturnAddressIsTaken(bool s)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Helper class to build MachineInstr.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Analysis providing profile information.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static bool isGenericOpcode(unsigned Opc)
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
bool isGFX13Plus(const MCSubtargetInfo &STI)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
SpecificConstantMatch m_SpecificICst(const APInt &RequestedValue)
Matches a constant equal to RequestedValue.
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_SEXT > m_GSExt(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
ImplicitDefMatch m_GImplicitDef()
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
BinaryOp_match< LHS, RHS, TargetOpcode::G_ASHR, false > m_GAShr(const LHS &L, const RHS &R)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
SpecificRegisterMatch m_SpecificReg(Register RequestedReg)
Matches a register only if it is equal to RequestedReg.
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_ANYEXT > m_GAnyExt(const SrcTy &Src)
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, TargetOpcode::G_MUL, true > m_GMul(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:858
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI bool isBuildVectorAllZeros(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndef=false)
Return true if the specified instruction is a G_BUILD_VECTOR or G_BUILD_VECTOR_TRUNC where all of the...
Definition Utils.cpp:1444
LLVM_ABI Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition Utils.cpp:57
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:653
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:156
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition Utils.cpp:494
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:314
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition Utils.cpp:439
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
LLVM_ABI std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition Utils.cpp:469
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:501
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.