AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
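// Note: the GET_GLOBALISEL_IMPL expansion above pulls in the TableGen-generated
// match table and the selectImpl() implementation from AMDGPUGenGlobalISel.inc;
// AMDGPUSubtarget is temporarily aliased to GCNSubtarget so the generated
// predicate checks query the GCN subtarget.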
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
67}
68
69// Return the wave level SGPR base address if this is a wave address.
70static Register getWaveAddress(const MachineInstr *Def) {
71 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
72 ? Def->getOperand(1).getReg()
73 : Register();
74}
75
76bool AMDGPUInstructionSelector::isVCC(Register Reg,
77 const MachineRegisterInfo &MRI) const {
78 // The verifier is oblivious to s1 being a valid value for wavesize registers.
79 if (Reg.isPhysical())
80 return false;
81
82 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
83 const TargetRegisterClass *RC =
84 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
85 if (RC) {
86 const LLT Ty = MRI.getType(Reg);
87 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
88 return false;
89 // G_TRUNC s1 result is never vcc.
90 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
91 RC->hasSuperClassEq(TRI.getBoolRC());
92 }
93
94 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
95 return RB->getID() == AMDGPU::VCCRegBankID;
96}
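// For illustration: a divergent condition such as
//   %c:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
// is "VCC" here (a wave-wide lane mask held in a 32- or 64-bit SGPR depending
// on wave size), whereas %t:sgpr(s1) = G_TRUNC %x:sgpr(s32) is an ordinary
// per-lane scalar bit and must not be treated as vcc.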
97
98bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
99 unsigned NewOpc) const {
100 MI.setDesc(TII.get(NewOpc));
101 MI.removeOperand(1); // Remove intrinsic ID.
102 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
103
104 MachineOperand &Dst = MI.getOperand(0);
105 MachineOperand &Src = MI.getOperand(1);
106
107 // TODO: This should be legalized to s32 if needed
108 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
109 return false;
110
111 const TargetRegisterClass *DstRC
112 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
113 const TargetRegisterClass *SrcRC
114 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
115 if (!DstRC || DstRC != SrcRC)
116 return false;
117
118 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
119 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
120}
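// Roughly, constrainCopyLikeIntrin rewrites e.g.
//   %d = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %s
// in place into
//   %d = WQM %s, implicit $exec
// and then only has to constrain %d and %s to a common register class.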
121
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
123 const DebugLoc &DL = I.getDebugLoc();
124 MachineBasicBlock *BB = I.getParent();
125 I.setDesc(TII.get(TargetOpcode::COPY));
126
127 const MachineOperand &Src = I.getOperand(1);
128 MachineOperand &Dst = I.getOperand(0);
129 Register DstReg = Dst.getReg();
130 Register SrcReg = Src.getReg();
131
132 if (isVCC(DstReg, *MRI)) {
133 if (SrcReg == AMDGPU::SCC) {
134 const TargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136 if (!RC)
137 return true;
138 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141 if (!isVCC(SrcReg, *MRI)) {
142 // TODO: Should probably leave the copy and let copyPhysReg expand it.
143 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144 return false;
145
146 const TargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
151 if (ConstVal) {
152 unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 } else {
157 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
158
159 // We can't trust the high bits at this point, so clear them.
160
161 // TODO: Skip masking high bits if def is known boolean.
162
163 bool IsSGPR = TRI.isSGPRClass(SrcRC);
164 unsigned AndOpc =
165 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
166 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
167 .addImm(1)
168 .addReg(SrcReg);
169 if (IsSGPR)
170 And.setOperandDead(3); // Dead scc
171
172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
173 .addImm(0)
174 .addReg(MaskedReg);
175 }
176
177 if (!MRI->getRegClassOrNull(SrcReg))
178 MRI->setRegClass(SrcReg, SrcRC);
179 I.eraseFromParent();
180 return true;
181 }
182
183 const TargetRegisterClass *RC =
184 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
185 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
186 return false;
187
188 return true;
189 }
190
191 for (const MachineOperand &MO : I.operands()) {
192 if (MO.getReg().isPhysical())
193 continue;
194
195 const TargetRegisterClass *RC =
196 TRI.getConstrainedRegClassForOperand(MO, *MRI);
197 if (!RC)
198 continue;
199 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
200 }
201 return true;
202}
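// Example of the copy-to-vcc path above: a 32-bit SGPR "boolean" %b copied into
// a vcc-bank s1 destination %d becomes
//   %m = S_AND_B32 1, %b              ; clear the untrusted high bits
//   %d = V_CMP_NE_U32_e64 0, %m
// (V_AND_B32_e32 is used for a VGPR source), while a known constant source
// folds to an S_MOV_B32/B64 of 0 or -1.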
203
204bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
205 const Register DefReg = I.getOperand(0).getReg();
206 const LLT DefTy = MRI->getType(DefReg);
207
208 // S1 G_PHIs should not be selected in instruction-select, instead:
209 // - divergent S1 G_PHI should go through lane mask merging algorithm
210 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
211 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
212 if (DefTy == LLT::scalar(1))
213 return false;
214
215 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
216
217 const RegClassOrRegBank &RegClassOrBank =
218 MRI->getRegClassOrRegBank(DefReg);
219
220 const TargetRegisterClass *DefRC
221 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
222 if (!DefRC) {
223 if (!DefTy.isValid()) {
224 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
225 return false;
226 }
227
228 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
229 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
230 if (!DefRC) {
231 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
232 return false;
233 }
234 }
235
236 // TODO: Verify that all registers have the same bank
237 I.setDesc(TII.get(TargetOpcode::PHI));
238 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
239}
240
241MachineOperand
242AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
243 const TargetRegisterClass &SubRC,
244 unsigned SubIdx) const {
245
246 MachineInstr *MI = MO.getParent();
247 MachineBasicBlock *BB = MI->getParent();
248 Register DstReg = MRI->createVirtualRegister(&SubRC);
249
250 if (MO.isReg()) {
251 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
252 Register Reg = MO.getReg();
253 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
254 .addReg(Reg, 0, ComposedSubIdx);
255
256 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
257 MO.isKill(), MO.isDead(), MO.isUndef(),
258 MO.isEarlyClobber(), 0, MO.isDebug(),
259 MO.isInternalRead());
260 }
261
262 assert(MO.isImm());
263
264 APInt Imm(64, MO.getImm());
265
266 switch (SubIdx) {
267 default:
268 llvm_unreachable("do not know how to split immediate with this sub index.");
269 case AMDGPU::sub0:
270 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
271 case AMDGPU::sub1:
272 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
273 }
274}
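// Worked example for the immediate case: splitting the 64-bit immediate
// 0x00000003_00000002 yields sub0 = 2 and sub1 = 3.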
275
276static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
277 switch (Opc) {
278 case AMDGPU::G_AND:
279 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
280 case AMDGPU::G_OR:
281 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
282 case AMDGPU::G_XOR:
283 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
284 default:
285 llvm_unreachable("not a bit op");
286 }
287}
288
289bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
290 Register DstReg = I.getOperand(0).getReg();
291 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
292
293 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
294 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
295 DstRB->getID() != AMDGPU::VCCRegBankID)
296 return false;
297
298 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
299 STI.isWave64());
300 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
301
302 // Dead implicit-def of scc
303 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
304 true, // isImp
305 false, // isKill
306 true)); // isDead
307 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
308}
309
310bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
311 MachineBasicBlock *BB = I.getParent();
312 MachineFunction *MF = BB->getParent();
313 Register DstReg = I.getOperand(0).getReg();
314 const DebugLoc &DL = I.getDebugLoc();
315 LLT Ty = MRI->getType(DstReg);
316 if (Ty.isVector())
317 return false;
318
319 unsigned Size = Ty.getSizeInBits();
320 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
321 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
322 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
323
324 if (Size == 32) {
325 if (IsSALU) {
326 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
327 MachineInstr *Add =
328 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
329 .add(I.getOperand(1))
330 .add(I.getOperand(2))
331 .setOperandDead(3); // Dead scc
332 I.eraseFromParent();
333 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
334 }
335
336 if (STI.hasAddNoCarry()) {
337 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
338 I.setDesc(TII.get(Opc));
339 I.addOperand(*MF, MachineOperand::CreateImm(0));
340 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
341 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
342 }
343
344 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
345
346 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
347 MachineInstr *Add
348 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
349 .addDef(UnusedCarry, RegState::Dead)
350 .add(I.getOperand(1))
351 .add(I.getOperand(2))
352 .addImm(0);
353 I.eraseFromParent();
354 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
355 }
356
357 assert(!Sub && "illegal sub should not reach here");
358
359 const TargetRegisterClass &RC
360 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
361 const TargetRegisterClass &HalfRC
362 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
363
364 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
365 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
366 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
367 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
368
369 Register DstLo = MRI->createVirtualRegister(&HalfRC);
370 Register DstHi = MRI->createVirtualRegister(&HalfRC);
371
372 if (IsSALU) {
373 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
374 .add(Lo1)
375 .add(Lo2);
376 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
377 .add(Hi1)
378 .add(Hi2)
379 .setOperandDead(3); // Dead scc
380 } else {
381 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
382 Register CarryReg = MRI->createVirtualRegister(CarryRC);
383 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
384 .addDef(CarryReg)
385 .add(Lo1)
386 .add(Lo2)
387 .addImm(0);
388 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
389 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
390 .add(Hi1)
391 .add(Hi2)
392 .addReg(CarryReg, RegState::Kill)
393 .addImm(0);
394
395 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
396 return false;
397 }
398
399 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
400 .addReg(DstLo)
401 .addImm(AMDGPU::sub0)
402 .addReg(DstHi)
403 .addImm(AMDGPU::sub1);
404
405
406 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
407 return false;
408
409 I.eraseFromParent();
410 return true;
411}
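// Sketch of the 64-bit expansion above for a VALU add %d = G_ADD %a, %b:
//   %lo, %carry = V_ADD_CO_U32_e64 %a.sub0, %b.sub0
//   %hi         = V_ADDC_U32_e64   %a.sub1, %b.sub1, %carry
//   %d          = REG_SEQUENCE %lo, sub0, %hi, sub1
// The SALU form uses S_ADD_U32/S_ADDC_U32 with the carry carried in SCC.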
412
413bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
414 MachineInstr &I) const {
415 MachineBasicBlock *BB = I.getParent();
416 MachineFunction *MF = BB->getParent();
417 const DebugLoc &DL = I.getDebugLoc();
418 Register Dst0Reg = I.getOperand(0).getReg();
419 Register Dst1Reg = I.getOperand(1).getReg();
420 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
421 I.getOpcode() == AMDGPU::G_UADDE;
422 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
423 I.getOpcode() == AMDGPU::G_USUBE;
424
425 if (isVCC(Dst1Reg, *MRI)) {
426 unsigned NoCarryOpc =
427 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
428 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
429 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
430 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
431 I.addOperand(*MF, MachineOperand::CreateImm(0));
432 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
433 }
434
435 Register Src0Reg = I.getOperand(2).getReg();
436 Register Src1Reg = I.getOperand(3).getReg();
437
438 if (HasCarryIn) {
439 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
440 .addReg(I.getOperand(4).getReg());
441 }
442
443 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
444 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
445
446 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
447 .add(I.getOperand(2))
448 .add(I.getOperand(3));
449
450 if (MRI->use_nodbg_empty(Dst1Reg)) {
451 CarryInst.setOperandDead(3); // Dead scc
452 } else {
453 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
454 .addReg(AMDGPU::SCC);
455 if (!MRI->getRegClassOrNull(Dst1Reg))
456 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
457 }
458
459 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
460 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
462 return false;
463
464 if (HasCarryIn &&
465 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
466 AMDGPU::SReg_32RegClass, *MRI))
467 return false;
468
469 I.eraseFromParent();
470 return true;
471}
472
473bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
474 MachineInstr &I) const {
475 MachineBasicBlock *BB = I.getParent();
476 MachineFunction *MF = BB->getParent();
477 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
478
479 unsigned Opc;
480 if (Subtarget->hasMADIntraFwdBug())
481 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
482 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
483 else
484 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
485 I.setDesc(TII.get(Opc));
486 I.addOperand(*MF, MachineOperand::CreateImm(0));
487 I.addImplicitDefUseOperands(*MF);
488 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
489}
490
491// TODO: We should probably legalize these to only using 32-bit results.
492bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
493 MachineBasicBlock *BB = I.getParent();
494 Register DstReg = I.getOperand(0).getReg();
495 Register SrcReg = I.getOperand(1).getReg();
496 LLT DstTy = MRI->getType(DstReg);
497 LLT SrcTy = MRI->getType(SrcReg);
498 const unsigned SrcSize = SrcTy.getSizeInBits();
499 unsigned DstSize = DstTy.getSizeInBits();
500
501 // TODO: Should handle any multiple of 32 offset.
502 unsigned Offset = I.getOperand(2).getImm();
503 if (Offset % 32 != 0 || DstSize > 128)
504 return false;
505
506 // 16-bit operations really use 32-bit registers.
507 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
508 if (DstSize == 16)
509 DstSize = 32;
510
511 const TargetRegisterClass *DstRC =
512 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
513 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
514 return false;
515
516 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
517 const TargetRegisterClass *SrcRC =
518 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
519 if (!SrcRC)
520 return false;
521 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
522 DstSize / 32);
523 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
524 if (!SrcRC)
525 return false;
526
527 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
528 *SrcRC, I.getOperand(1));
529 const DebugLoc &DL = I.getDebugLoc();
530 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
531 .addReg(SrcReg, 0, SubReg);
532
533 I.eraseFromParent();
534 return true;
535}
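// Example: %d:sgpr(s32) = G_EXTRACT %s:sgpr(s128), 64 is selected here as a
// plain subregister copy, %d = COPY %s.sub2.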
536
537bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
538 MachineBasicBlock *BB = MI.getParent();
539 Register DstReg = MI.getOperand(0).getReg();
540 LLT DstTy = MRI->getType(DstReg);
541 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
542
543 const unsigned SrcSize = SrcTy.getSizeInBits();
544 if (SrcSize < 32)
545 return selectImpl(MI, *CoverageInfo);
546
547 const DebugLoc &DL = MI.getDebugLoc();
548 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
549 const unsigned DstSize = DstTy.getSizeInBits();
550 const TargetRegisterClass *DstRC =
551 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
552 if (!DstRC)
553 return false;
554
555 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
556 MachineInstrBuilder MIB =
557 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
558 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
559 MachineOperand &Src = MI.getOperand(I + 1);
560 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
561 MIB.addImm(SubRegs[I]);
562
563 const TargetRegisterClass *SrcRC
564 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
565 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
566 return false;
567 }
568
569 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
570 return false;
571
572 MI.eraseFromParent();
573 return true;
574}
575
576bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
577 MachineBasicBlock *BB = MI.getParent();
578 const int NumDst = MI.getNumOperands() - 1;
579
580 MachineOperand &Src = MI.getOperand(NumDst);
581
582 Register SrcReg = Src.getReg();
583 Register DstReg0 = MI.getOperand(0).getReg();
584 LLT DstTy = MRI->getType(DstReg0);
585 LLT SrcTy = MRI->getType(SrcReg);
586
587 const unsigned DstSize = DstTy.getSizeInBits();
588 const unsigned SrcSize = SrcTy.getSizeInBits();
589 const DebugLoc &DL = MI.getDebugLoc();
590 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
591
592 const TargetRegisterClass *SrcRC =
593 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
594 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
595 return false;
596
597 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
598 // source, and this relies on the fact that the same subregister indices are
599 // used for both.
600 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
601 for (int I = 0, E = NumDst; I != E; ++I) {
602 MachineOperand &Dst = MI.getOperand(I);
603 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
604 .addReg(SrcReg, 0, SubRegs[I]);
605
606 // Make sure the subregister index is valid for the source register.
607 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
608 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
609 return false;
610
611 const TargetRegisterClass *DstRC =
612 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
613 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
614 return false;
615 }
616
617 MI.eraseFromParent();
618 return true;
619}
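// Example: splitting %v:sgpr(s64) via G_UNMERGE_VALUES into two s32 results is
// selected as %lo = COPY %v.sub0 and %hi = COPY %v.sub1.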
620
621bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
622 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
623 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
624
625 Register Src0 = MI.getOperand(1).getReg();
626 Register Src1 = MI.getOperand(2).getReg();
627 LLT SrcTy = MRI->getType(Src0);
628 const unsigned SrcSize = SrcTy.getSizeInBits();
629
630 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
631 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
632 return selectG_MERGE_VALUES(MI);
633 }
634
635 // Selection logic below is for V2S16 only.
636 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
637 Register Dst = MI.getOperand(0).getReg();
638 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
639 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
640 SrcTy != LLT::scalar(32)))
641 return selectImpl(MI, *CoverageInfo);
642
643 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
644 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
645 return false;
646
647 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
648 DstBank->getID() == AMDGPU::VGPRRegBankID);
649 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
650
651 const DebugLoc &DL = MI.getDebugLoc();
652 MachineBasicBlock *BB = MI.getParent();
653
654 // First, before trying TableGen patterns, check if both sources are
655 // constants. In those cases, we can trivially compute the final constant
656 // and emit a simple move.
657 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
658 if (ConstSrc1) {
659 auto ConstSrc0 =
660 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
661 if (ConstSrc0) {
662 const int64_t K0 = ConstSrc0->Value.getSExtValue();
663 const int64_t K1 = ConstSrc1->Value.getSExtValue();
664 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
665 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
666 uint32_t Imm = Lo16 | (Hi16 << 16);
667
668 // VALU
669 if (IsVector) {
670 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
671 MI.eraseFromParent();
672 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
673 }
674
675 // SALU
676 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
677 MI.eraseFromParent();
678 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
679 }
680 }
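// Worked example of the constant fold above: G_BUILD_VECTOR_TRUNC of the
// constants 1 and 2 packs to Imm = 0x00020001 and becomes a single S_MOV_B32
// (or V_MOV_B32_e32 for a VGPR destination).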
681
682 // Now try TableGen patterns.
683 if (selectImpl(MI, *CoverageInfo))
684 return true;
685
686 // TODO: This should probably be a combine somewhere
687 // (build_vector $src0, undef) -> copy $src0
688 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
689 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
690 MI.setDesc(TII.get(AMDGPU::COPY));
691 MI.removeOperand(2);
692 const auto &RC =
693 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
694 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
695 RBI.constrainGenericRegister(Src0, RC, *MRI);
696 }
697
698 // TODO: Can be improved?
699 if (IsVector) {
700 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
701 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
702 .addImm(0xFFFF)
703 .addReg(Src0);
704 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
705 return false;
706
707 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
708 .addReg(Src1)
709 .addImm(16)
710 .addReg(TmpReg);
711 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
712 return false;
713
714 MI.eraseFromParent();
715 return true;
716 }
717
718 Register ShiftSrc0;
719 Register ShiftSrc1;
720
721 // With multiple uses of the shift, this will duplicate the shift and
722 // increase register pressure.
723 //
724 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
725 // => (S_PACK_HH_B32_B16 $src0, $src1)
726 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
727 // => (S_PACK_HL_B32_B16 $src0, $src1)
728 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
729 // => (S_PACK_LH_B32_B16 $src0, $src1)
730 // (build_vector $src0, $src1)
731 // => (S_PACK_LL_B32_B16 $src0, $src1)
732
733 bool Shift0 = mi_match(
734 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
735
736 bool Shift1 = mi_match(
737 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
738
739 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
740 if (Shift0 && Shift1) {
741 Opc = AMDGPU::S_PACK_HH_B32_B16;
742 MI.getOperand(1).setReg(ShiftSrc0);
743 MI.getOperand(2).setReg(ShiftSrc1);
744 } else if (Shift1) {
745 Opc = AMDGPU::S_PACK_LH_B32_B16;
746 MI.getOperand(2).setReg(ShiftSrc1);
747 } else if (Shift0) {
748 auto ConstSrc1 =
749 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
750 if (ConstSrc1 && ConstSrc1->Value == 0) {
751 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
752 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
753 .addReg(ShiftSrc0)
754 .addImm(16)
755 .setOperandDead(3); // Dead scc
756
757 MI.eraseFromParent();
758 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
759 }
760 if (STI.hasSPackHL()) {
761 Opc = AMDGPU::S_PACK_HL_B32_B16;
762 MI.getOperand(1).setReg(ShiftSrc0);
763 }
764 }
765
766 MI.setDesc(TII.get(Opc));
767 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
768}
769
770bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
771 return selectG_ADD_SUB(I);
772}
773
774bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
775 const MachineOperand &MO = I.getOperand(0);
776
777 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
778 // regbank check here is to know why getConstrainedRegClassForOperand failed.
779 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
780 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
781 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
782 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
783 return true;
784 }
785
786 return false;
787}
788
789bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
790 MachineBasicBlock *BB = I.getParent();
791
792 Register DstReg = I.getOperand(0).getReg();
793 Register Src0Reg = I.getOperand(1).getReg();
794 Register Src1Reg = I.getOperand(2).getReg();
795 LLT Src1Ty = MRI->getType(Src1Reg);
796
797 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
798 unsigned InsSize = Src1Ty.getSizeInBits();
799
800 int64_t Offset = I.getOperand(3).getImm();
801
802 // FIXME: These cases should have been illegal and unnecessary to check here.
803 if (Offset % 32 != 0 || InsSize % 32 != 0)
804 return false;
805
806 // Currently not handled by getSubRegFromChannel.
807 if (InsSize > 128)
808 return false;
809
810 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
811 if (SubReg == AMDGPU::NoSubRegister)
812 return false;
813
814 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
815 const TargetRegisterClass *DstRC =
816 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
817 if (!DstRC)
818 return false;
819
820 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
821 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
822 const TargetRegisterClass *Src0RC =
823 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
824 const TargetRegisterClass *Src1RC =
825 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
826
827 // Deal with weird cases where the class only partially supports the subreg
828 // index.
829 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
830 if (!Src0RC || !Src1RC)
831 return false;
832
833 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
834 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
835 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
836 return false;
837
838 const DebugLoc &DL = I.getDebugLoc();
839 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
840 .addReg(Src0Reg)
841 .addReg(Src1Reg)
842 .addImm(SubReg);
843
844 I.eraseFromParent();
845 return true;
846}
847
848bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
849 Register DstReg = MI.getOperand(0).getReg();
850 Register SrcReg = MI.getOperand(1).getReg();
851 Register OffsetReg = MI.getOperand(2).getReg();
852 Register WidthReg = MI.getOperand(3).getReg();
853
854 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
855 "scalar BFX instructions are expanded in regbankselect");
856 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
857 "64-bit vector BFX instructions are expanded in regbankselect");
858
859 const DebugLoc &DL = MI.getDebugLoc();
860 MachineBasicBlock *MBB = MI.getParent();
861
862 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
863 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
864 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
865 .addReg(SrcReg)
866 .addReg(OffsetReg)
867 .addReg(WidthReg);
868 MI.eraseFromParent();
869 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
870}
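// E.g. %d:vgpr(s32) = G_UBFX %src, %off, %width simply becomes
// %d = V_BFE_U32_e64 %src, %off, %width (V_BFE_I32_e64 for the signed G_SBFX).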
871
872bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
873 if (STI.getLDSBankCount() != 16)
874 return selectImpl(MI, *CoverageInfo);
875
876 Register Dst = MI.getOperand(0).getReg();
877 Register Src0 = MI.getOperand(2).getReg();
878 Register M0Val = MI.getOperand(6).getReg();
879 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
880 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
881 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
882 return false;
883
884 // This requires 2 instructions. It is possible to write a pattern to support
885 // this, but the generated isel emitter doesn't correctly deal with multiple
886 // output instructions using the same physical register input. The copy to m0
887 // is incorrectly placed before the second instruction.
888 //
889 // TODO: Match source modifiers.
890
891 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
892 const DebugLoc &DL = MI.getDebugLoc();
893 MachineBasicBlock *MBB = MI.getParent();
894
895 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
896 .addReg(M0Val);
897 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
898 .addImm(2)
899 .addImm(MI.getOperand(4).getImm()) // $attr
900 .addImm(MI.getOperand(3).getImm()); // $attrchan
901
902 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
903 .addImm(0) // $src0_modifiers
904 .addReg(Src0) // $src0
905 .addImm(MI.getOperand(4).getImm()) // $attr
906 .addImm(MI.getOperand(3).getImm()) // $attrchan
907 .addImm(0) // $src2_modifiers
908 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
909 .addImm(MI.getOperand(5).getImm()) // $high
910 .addImm(0) // $clamp
911 .addImm(0); // $omod
912
913 MI.eraseFromParent();
914 return true;
915}
916
917// Writelane is special in that it can use SGPR and M0 (which would normally
918// count as using the constant bus twice - but in this case it is allowed since
919// the lane selector doesn't count as a use of the constant bus). However, it is
920// still required to abide by the 1 SGPR rule. Fix this up if we might have
921// multiple SGPRs.
922bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
923 // With a constant bus limit of at least 2, there's no issue.
924 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
925 return selectImpl(MI, *CoverageInfo);
926
927 MachineBasicBlock *MBB = MI.getParent();
928 const DebugLoc &DL = MI.getDebugLoc();
929 Register VDst = MI.getOperand(0).getReg();
930 Register Val = MI.getOperand(2).getReg();
931 Register LaneSelect = MI.getOperand(3).getReg();
932 Register VDstIn = MI.getOperand(4).getReg();
933
934 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
935
936 std::optional<ValueAndVReg> ConstSelect =
937 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
938 if (ConstSelect) {
939 // The selector has to be an inline immediate, so we can use whatever for
940 // the other operands.
941 MIB.addReg(Val);
942 MIB.addImm(ConstSelect->Value.getSExtValue() &
943 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
944 } else {
945 std::optional<ValueAndVReg> ConstVal =
946 getIConstantVRegValWithLookThrough(Val, *MRI);
947
948 // If the value written is an inline immediate, we can get away without a
949 // copy to m0.
950 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
951 STI.hasInv2PiInlineImm())) {
952 MIB.addImm(ConstVal->Value.getSExtValue());
953 MIB.addReg(LaneSelect);
954 } else {
955 MIB.addReg(Val);
956
957 // If the lane selector was originally in a VGPR and copied with
958 // readfirstlane, there's a hazard to read the same SGPR from the
959 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
960 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
961
962 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
963 .addReg(LaneSelect);
964 MIB.addReg(AMDGPU::M0);
965 }
966 }
967
968 MIB.addReg(VDstIn);
969
970 MI.eraseFromParent();
971 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
972}
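// Summary of the cases above: a constant lane selector can be emitted directly
// as an immediate; if the written value is an inline immediate, the lane
// selector may stay in its SGPR; otherwise the lane selector is routed through
// M0 so only one unique SGPR is read.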
973
974// We need to handle this here because tablegen doesn't support matching
975// instructions with multiple outputs.
976bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
977 Register Dst0 = MI.getOperand(0).getReg();
978 Register Dst1 = MI.getOperand(1).getReg();
979
980 LLT Ty = MRI->getType(Dst0);
981 unsigned Opc;
982 if (Ty == LLT::scalar(32))
983 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
984 else if (Ty == LLT::scalar(64))
985 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
986 else
987 return false;
988
989 // TODO: Match source modifiers.
990
991 const DebugLoc &DL = MI.getDebugLoc();
992 MachineBasicBlock *MBB = MI.getParent();
993
994 Register Numer = MI.getOperand(3).getReg();
995 Register Denom = MI.getOperand(4).getReg();
996 unsigned ChooseDenom = MI.getOperand(5).getImm();
997
998 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
999
1000 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1001 .addDef(Dst1)
1002 .addImm(0) // $src0_modifiers
1003 .addUse(Src0) // $src0
1004 .addImm(0) // $src1_modifiers
1005 .addUse(Denom) // $src1
1006 .addImm(0) // $src2_modifiers
1007 .addUse(Numer) // $src2
1008 .addImm(0) // $clamp
1009 .addImm(0); // $omod
1010
1011 MI.eraseFromParent();
1012 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1013}
1014
1015bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1016 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1017 switch (IntrinsicID) {
1018 case Intrinsic::amdgcn_if_break: {
1019 MachineBasicBlock *BB = I.getParent();
1020
1021 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1022 // SelectionDAG uses for wave32 vs wave64.
1023 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1024 .add(I.getOperand(0))
1025 .add(I.getOperand(2))
1026 .add(I.getOperand(3));
1027
1028 Register DstReg = I.getOperand(0).getReg();
1029 Register Src0Reg = I.getOperand(2).getReg();
1030 Register Src1Reg = I.getOperand(3).getReg();
1031
1032 I.eraseFromParent();
1033
1034 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1035 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1036
1037 return true;
1038 }
1039 case Intrinsic::amdgcn_interp_p1_f16:
1040 return selectInterpP1F16(I);
1041 case Intrinsic::amdgcn_wqm:
1042 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1043 case Intrinsic::amdgcn_softwqm:
1044 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1045 case Intrinsic::amdgcn_strict_wwm:
1046 case Intrinsic::amdgcn_wwm:
1047 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1048 case Intrinsic::amdgcn_strict_wqm:
1049 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1050 case Intrinsic::amdgcn_writelane:
1051 return selectWritelane(I);
1052 case Intrinsic::amdgcn_div_scale:
1053 return selectDivScale(I);
1054 case Intrinsic::amdgcn_icmp:
1055 case Intrinsic::amdgcn_fcmp:
1056 if (selectImpl(I, *CoverageInfo))
1057 return true;
1058 return selectIntrinsicCmp(I);
1059 case Intrinsic::amdgcn_ballot:
1060 return selectBallot(I);
1061 case Intrinsic::amdgcn_inverse_ballot:
1062 return selectInverseBallot(I);
1063 case Intrinsic::amdgcn_reloc_constant:
1064 return selectRelocConstant(I);
1065 case Intrinsic::amdgcn_groupstaticsize:
1066 return selectGroupStaticSize(I);
1067 case Intrinsic::returnaddress:
1068 return selectReturnAddress(I);
1069 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1070 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1071 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1072 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1073 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1074 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1075 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1076 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1077 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1078 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1079 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1080 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1081 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1082 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1083 return selectSMFMACIntrin(I);
1084 default:
1085 return selectImpl(I, *CoverageInfo);
1086 }
1087}
1088
1089static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1090 const GCNSubtarget &ST) {
1091 if (Size != 16 && Size != 32 && Size != 64)
1092 return -1;
1093
1094 if (Size == 16 && !ST.has16BitInsts())
1095 return -1;
1096
1097 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1098 unsigned S64Opc) {
1099 if (Size == 16)
1100 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1101 if (Size == 32)
1102 return S32Opc;
1103 return S64Opc;
1104 };
1105
1106 switch (P) {
1107 default:
1108 llvm_unreachable("Unknown condition code!");
1109 case CmpInst::ICMP_NE:
1110 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1111 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1112 case CmpInst::ICMP_EQ:
1113 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1114 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1115 case CmpInst::ICMP_SGT:
1116 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1117 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1118 case CmpInst::ICMP_SGE:
1119 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1120 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1121 case CmpInst::ICMP_SLT:
1122 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1123 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1124 case CmpInst::ICMP_SLE:
1125 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1126 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1127 case CmpInst::ICMP_UGT:
1128 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1129 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1130 case CmpInst::ICMP_UGE:
1131 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1132 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1133 case CmpInst::ICMP_ULT:
1134 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1135 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1136 case CmpInst::ICMP_ULE:
1137 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1138 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1139
1140 case CmpInst::FCMP_OEQ:
1141 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1142 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1143 case CmpInst::FCMP_OGT:
1144 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1145 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1146 case CmpInst::FCMP_OGE:
1147 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1148 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1149 case CmpInst::FCMP_OLT:
1150 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1151 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1152 case CmpInst::FCMP_OLE:
1153 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1154 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1155 case CmpInst::FCMP_ONE:
1156 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1157 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1158 case CmpInst::FCMP_ORD:
1159 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1160 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1161 case CmpInst::FCMP_UNO:
1162 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1163 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1164 case CmpInst::FCMP_UEQ:
1165 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1166 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1167 case CmpInst::FCMP_UGT:
1168 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1169 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1170 case CmpInst::FCMP_UGE:
1171 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1172 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1173 case CmpInst::FCMP_ULT:
1174 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1175 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1176 case CmpInst::FCMP_ULE:
1177 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1178 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1179 case CmpInst::FCMP_UNE:
1180 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1181 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1182 case CmpInst::FCMP_TRUE:
1183 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1184 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1185 case CmpInst::FCMP_FALSE:
1186 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1187 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1188 }
1189}
1190
1191int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1192 unsigned Size) const {
1193 if (Size == 64) {
1194 if (!STI.hasScalarCompareEq64())
1195 return -1;
1196
1197 switch (P) {
1198 case CmpInst::ICMP_NE:
1199 return AMDGPU::S_CMP_LG_U64;
1200 case CmpInst::ICMP_EQ:
1201 return AMDGPU::S_CMP_EQ_U64;
1202 default:
1203 return -1;
1204 }
1205 }
1206
1207 if (Size == 32) {
1208 switch (P) {
1209 case CmpInst::ICMP_NE:
1210 return AMDGPU::S_CMP_LG_U32;
1211 case CmpInst::ICMP_EQ:
1212 return AMDGPU::S_CMP_EQ_U32;
1213 case CmpInst::ICMP_SGT:
1214 return AMDGPU::S_CMP_GT_I32;
1215 case CmpInst::ICMP_SGE:
1216 return AMDGPU::S_CMP_GE_I32;
1217 case CmpInst::ICMP_SLT:
1218 return AMDGPU::S_CMP_LT_I32;
1219 case CmpInst::ICMP_SLE:
1220 return AMDGPU::S_CMP_LE_I32;
1221 case CmpInst::ICMP_UGT:
1222 return AMDGPU::S_CMP_GT_U32;
1223 case CmpInst::ICMP_UGE:
1224 return AMDGPU::S_CMP_GE_U32;
1225 case CmpInst::ICMP_ULT:
1226 return AMDGPU::S_CMP_LT_U32;
1227 case CmpInst::ICMP_ULE:
1228 return AMDGPU::S_CMP_LE_U32;
1229 case CmpInst::FCMP_OEQ:
1230 return AMDGPU::S_CMP_EQ_F32;
1231 case CmpInst::FCMP_OGT:
1232 return AMDGPU::S_CMP_GT_F32;
1233 case CmpInst::FCMP_OGE:
1234 return AMDGPU::S_CMP_GE_F32;
1235 case CmpInst::FCMP_OLT:
1236 return AMDGPU::S_CMP_LT_F32;
1237 case CmpInst::FCMP_OLE:
1238 return AMDGPU::S_CMP_LE_F32;
1239 case CmpInst::FCMP_ONE:
1240 return AMDGPU::S_CMP_LG_F32;
1241 case CmpInst::FCMP_ORD:
1242 return AMDGPU::S_CMP_O_F32;
1243 case CmpInst::FCMP_UNO:
1244 return AMDGPU::S_CMP_U_F32;
1245 case CmpInst::FCMP_UEQ:
1246 return AMDGPU::S_CMP_NLG_F32;
1247 case CmpInst::FCMP_UGT:
1248 return AMDGPU::S_CMP_NLE_F32;
1249 case CmpInst::FCMP_UGE:
1250 return AMDGPU::S_CMP_NLT_F32;
1251 case CmpInst::FCMP_ULT:
1252 return AMDGPU::S_CMP_NGE_F32;
1253 case CmpInst::FCMP_ULE:
1254 return AMDGPU::S_CMP_NGT_F32;
1255 case CmpInst::FCMP_UNE:
1256 return AMDGPU::S_CMP_NEQ_F32;
1257 default:
1258 llvm_unreachable("Unknown condition code!");
1259 }
1260 }
1261
1262 if (Size == 16) {
1263 if (!STI.hasSALUFloatInsts())
1264 return -1;
1265
1266 switch (P) {
1267 case CmpInst::FCMP_OEQ:
1268 return AMDGPU::S_CMP_EQ_F16;
1269 case CmpInst::FCMP_OGT:
1270 return AMDGPU::S_CMP_GT_F16;
1271 case CmpInst::FCMP_OGE:
1272 return AMDGPU::S_CMP_GE_F16;
1273 case CmpInst::FCMP_OLT:
1274 return AMDGPU::S_CMP_LT_F16;
1275 case CmpInst::FCMP_OLE:
1276 return AMDGPU::S_CMP_LE_F16;
1277 case CmpInst::FCMP_ONE:
1278 return AMDGPU::S_CMP_LG_F16;
1279 case CmpInst::FCMP_ORD:
1280 return AMDGPU::S_CMP_O_F16;
1281 case CmpInst::FCMP_UNO:
1282 return AMDGPU::S_CMP_U_F16;
1283 case CmpInst::FCMP_UEQ:
1284 return AMDGPU::S_CMP_NLG_F16;
1285 case CmpInst::FCMP_UGT:
1286 return AMDGPU::S_CMP_NLE_F16;
1287 case CmpInst::FCMP_UGE:
1288 return AMDGPU::S_CMP_NLT_F16;
1289 case CmpInst::FCMP_ULT:
1290 return AMDGPU::S_CMP_NGE_F16;
1291 case CmpInst::FCMP_ULE:
1292 return AMDGPU::S_CMP_NGT_F16;
1293 case CmpInst::FCMP_UNE:
1294 return AMDGPU::S_CMP_NEQ_F16;
1295 default:
1296 llvm_unreachable("Unknown condition code!");
1297 }
1298 }
1299
1300 return -1;
1301}
1302
1303bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1304
1305 MachineBasicBlock *BB = I.getParent();
1306 const DebugLoc &DL = I.getDebugLoc();
1307
1308 Register SrcReg = I.getOperand(2).getReg();
1309 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1310
1311 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1312
1313 Register CCReg = I.getOperand(0).getReg();
1314 if (!isVCC(CCReg, *MRI)) {
1315 int Opcode = getS_CMPOpcode(Pred, Size);
1316 if (Opcode == -1)
1317 return false;
1318 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1319 .add(I.getOperand(2))
1320 .add(I.getOperand(3));
1321 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1322 .addReg(AMDGPU::SCC);
1323 bool Ret =
1324 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1325 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1326 I.eraseFromParent();
1327 return Ret;
1328 }
1329
1330 if (I.getOpcode() == AMDGPU::G_FCMP)
1331 return false;
1332
1333 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1334 if (Opcode == -1)
1335 return false;
1336
1337 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1338 I.getOperand(0).getReg())
1339 .add(I.getOperand(2))
1340 .add(I.getOperand(3));
1341 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1342 *TRI.getBoolRC(), *MRI);
1343 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1344 I.eraseFromParent();
1345 return Ret;
1346}
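// For example, a uniform 32-bit equality compare becomes
//   S_CMP_EQ_U32 %a, %b  followed by  %cc = COPY $scc
// while a divergent one becomes %cc = V_CMP_EQ_U32_e64 %a, %b with %cc
// constrained to the wave-mask (bool) register class.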
1347
1348bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1349 Register Dst = I.getOperand(0).getReg();
1350 if (isVCC(Dst, *MRI))
1351 return false;
1352
1353 LLT DstTy = MRI->getType(Dst);
1354 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1355 return false;
1356
1357 MachineBasicBlock *BB = I.getParent();
1358 const DebugLoc &DL = I.getDebugLoc();
1359 Register SrcReg = I.getOperand(2).getReg();
1360 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1361
1362 // i1 inputs are not supported in GlobalISel.
1363 if (Size == 1)
1364 return false;
1365
1366 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1367 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1368 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1369 I.eraseFromParent();
1370 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1371 }
1372
1373 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1374 if (Opcode == -1)
1375 return false;
1376
1377 MachineInstrBuilder SelectedMI;
1378 MachineOperand &LHS = I.getOperand(2);
1379 MachineOperand &RHS = I.getOperand(3);
1380 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1381 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1382 Register Src0Reg =
1383 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1384 Register Src1Reg =
1385 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1386 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1387 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1388 SelectedMI.addImm(Src0Mods);
1389 SelectedMI.addReg(Src0Reg);
1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1391 SelectedMI.addImm(Src1Mods);
1392 SelectedMI.addReg(Src1Reg);
1393 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1394 SelectedMI.addImm(0); // clamp
1395 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1396 SelectedMI.addImm(0); // op_sel
1397
1398 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1399 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1400 return false;
1401
1402 I.eraseFromParent();
1403 return true;
1404}
1405
1406bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1407 MachineBasicBlock *BB = I.getParent();
1408 const DebugLoc &DL = I.getDebugLoc();
1409 Register DstReg = I.getOperand(0).getReg();
1410 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1411 const bool Is64 = Size == 64;
1412 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1413
1414 // In the common case, the return type matches the wave size.
1415 // However we also support emitting i64 ballots in wave32 mode.
1416 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1417 return false;
1418
1419 std::optional<ValueAndVReg> Arg =
1420 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1421
1422 const auto BuildCopy = [&](Register SrcReg) {
1423 if (Size == STI.getWavefrontSize()) {
1424 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1425 .addReg(SrcReg);
1426 return;
1427 }
1428
1429 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1430 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1431 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1432 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1433 .addReg(SrcReg)
1434 .addImm(AMDGPU::sub0)
1435 .addReg(HiReg)
1436 .addImm(AMDGPU::sub1);
1437 };
1438
1439 if (Arg) {
1440 const int64_t Value = Arg->Value.getSExtValue();
1441 if (Value == 0) {
1442 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1443 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1444 } else if (Value == -1) // all ones
1445 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1446 else
1447 return false;
1448 } else
1449 BuildCopy(I.getOperand(2).getReg());
1450
1451 I.eraseFromParent();
1452 return true;
1453}
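// For illustration, an i64 ballot of a non-constant argument in wave32 mode
// copies the 32-bit source into sub0 and zero-fills sub1 via the REG_SEQUENCE
// built by BuildCopy above.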
1454
1455bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
1456 MachineBasicBlock *BB = I.getParent();
1457 const DebugLoc &DL = I.getDebugLoc();
1458 const Register DstReg = I.getOperand(0).getReg();
1459 const Register MaskReg = I.getOperand(2).getReg();
1460
1461 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
1462 I.eraseFromParent();
1463 return true;
1464}
1465
1466bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1467 Register DstReg = I.getOperand(0).getReg();
1468 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1469 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1470 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1471 return false;
1472
1473 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1474
1475 Module *M = MF->getFunction().getParent();
1476 const MDNode *Metadata = I.getOperand(2).getMetadata();
1477 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1478 auto RelocSymbol = cast<GlobalVariable>(
1479 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1480
1481 MachineBasicBlock *BB = I.getParent();
1482 BuildMI(*BB, &I, I.getDebugLoc(),
1483 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1484 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1485
1486 I.eraseFromParent();
1487 return true;
1488}
1489
1490bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1491 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1492
1493 Register DstReg = I.getOperand(0).getReg();
1494 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1495 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1496 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1497
1498 MachineBasicBlock *MBB = I.getParent();
1499 const DebugLoc &DL = I.getDebugLoc();
1500
1501 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1502
1503 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1504 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1505 MIB.addImm(MFI->getLDSSize());
1506 } else {
1507 Module *M = MF->getFunction().getParent();
1508 const GlobalValue *GV
1509 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1510 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1511 }
1512
1513 I.eraseFromParent();
1514 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1515}
1516
1517bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1518 MachineBasicBlock *MBB = I.getParent();
1519 MachineFunction &MF = *MBB->getParent();
1520 const DebugLoc &DL = I.getDebugLoc();
1521
1522 MachineOperand &Dst = I.getOperand(0);
1523 Register DstReg = Dst.getReg();
1524 unsigned Depth = I.getOperand(2).getImm();
1525
1526 const TargetRegisterClass *RC
1527 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1528 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1529 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1530 return false;
1531
1532 // Check for kernel and shader functions
1533 if (Depth != 0 ||
1534 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1535 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1536 .addImm(0);
1537 I.eraseFromParent();
1538 return true;
1539 }
1540
1541 MachineFrameInfo &MFI = MF.getFrameInfo();
1542 // There is a call to @llvm.returnaddress in this function
1543 MFI.setReturnAddressIsTaken(true);
1544
1545 // Get the return address reg and mark it as an implicit live-in
1546 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1547 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1548 AMDGPU::SReg_64RegClass, DL);
1549 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1550 .addReg(LiveIn);
1551 I.eraseFromParent();
1552 return true;
1553}
1554
1555bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1556 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1557 // SelectionDAG uses for wave32 vs wave64.
1558 MachineBasicBlock *BB = MI.getParent();
1559 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1560 .add(MI.getOperand(1));
1561
1562 Register Reg = MI.getOperand(1).getReg();
1563 MI.eraseFromParent();
1564
1565 if (!MRI->getRegClassOrNull(Reg))
1566 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1567 return true;
1568}
1569
1570bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1571 MachineInstr &MI, Intrinsic::ID IntrID) const {
1572 MachineBasicBlock *MBB = MI.getParent();
1573 MachineFunction *MF = MBB->getParent();
1574 const DebugLoc &DL = MI.getDebugLoc();
1575
1576 unsigned IndexOperand = MI.getOperand(7).getImm();
1577 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1578 bool WaveDone = MI.getOperand(9).getImm() != 0;
1579
1580 if (WaveDone && !WaveRelease)
1581 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1582
1583 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1584 IndexOperand &= ~0x3f;
1585 unsigned CountDw = 0;
1586
1587 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1588 CountDw = (IndexOperand >> 24) & 0xf;
1589 IndexOperand &= ~(0xf << 24);
1590
1591 if (CountDw < 1 || CountDw > 4) {
1593 "ds_ordered_count: dword count must be between 1 and 4");
1594 }
1595 }
1596
1597 if (IndexOperand)
1598 report_fatal_error("ds_ordered_count: bad index operand");
1599
1600 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1601 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1602
1603 unsigned Offset0 = OrderedCountIndex << 2;
1604 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1605
1606 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1607 Offset1 |= (CountDw - 1) << 6;
1608
1609 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1610 Offset1 |= ShaderType << 2;
1611
1612 unsigned Offset = Offset0 | (Offset1 << 8);
1613
1614 Register M0Val = MI.getOperand(2).getReg();
1615 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1616 .addReg(M0Val);
1617
1618 Register DstReg = MI.getOperand(0).getReg();
1619 Register ValReg = MI.getOperand(3).getReg();
1620 auto DS =
1621 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1622 .addReg(ValReg)
1623 .addImm(Offset)
1624 .cloneMemRefs(MI);
1625
1626 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1627 return false;
1628
1629 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1630 MI.eraseFromParent();
1631 return Ret;
1632}
1633
1634static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1635 switch (IntrID) {
1636 case Intrinsic::amdgcn_ds_gws_init:
1637 return AMDGPU::DS_GWS_INIT;
1638 case Intrinsic::amdgcn_ds_gws_barrier:
1639 return AMDGPU::DS_GWS_BARRIER;
1640 case Intrinsic::amdgcn_ds_gws_sema_v:
1641 return AMDGPU::DS_GWS_SEMA_V;
1642 case Intrinsic::amdgcn_ds_gws_sema_br:
1643 return AMDGPU::DS_GWS_SEMA_BR;
1644 case Intrinsic::amdgcn_ds_gws_sema_p:
1645 return AMDGPU::DS_GWS_SEMA_P;
1646 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1647 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1648 default:
1649 llvm_unreachable("not a gws intrinsic");
1650 }
1651}
1652
1653bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1654 Intrinsic::ID IID) const {
1655 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1656 !STI.hasGWSSemaReleaseAll()))
1657 return false;
1658
1659 // intrinsic ID, vsrc, offset
1660 const bool HasVSrc = MI.getNumOperands() == 3;
1661 assert(HasVSrc || MI.getNumOperands() == 2);
1662
1663 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1664 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1665 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1666 return false;
1667
1668 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1669 unsigned ImmOffset;
1670
1671 MachineBasicBlock *MBB = MI.getParent();
1672 const DebugLoc &DL = MI.getDebugLoc();
1673
1674 MachineInstr *Readfirstlane = nullptr;
1675
1676 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1677 // incoming offset, in case there's an add of a constant. We'll have to put it
1678 // back later.
1679 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1680 Readfirstlane = OffsetDef;
1681 BaseOffset = OffsetDef->getOperand(1).getReg();
1682 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1683 }
1684
1685 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1686 // If we have a constant offset, try to use the 0 in m0 as the base.
1687 // TODO: Look into changing the default m0 initialization value. If the
1688 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1689 // the immediate offset.
1690
1691 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1692 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1693 .addImm(0);
1694 } else {
1695 std::tie(BaseOffset, ImmOffset) =
1696 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1697
1698 if (Readfirstlane) {
1699 // We have the constant offset now, so put the readfirstlane back on the
1700 // variable component.
1701 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1702 return false;
1703
1704 Readfirstlane->getOperand(1).setReg(BaseOffset);
1705 BaseOffset = Readfirstlane->getOperand(0).getReg();
1706 } else {
1707 if (!RBI.constrainGenericRegister(BaseOffset,
1708 AMDGPU::SReg_32RegClass, *MRI))
1709 return false;
1710 }
1711
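// Shift the variable offset into m0[21:16]; the hardware adds it to the
// instruction's offset field (see the resource id comment below).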
1712 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1713 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1714 .addReg(BaseOffset)
1715 .addImm(16)
1716 .setOperandDead(3); // Dead scc
1717
1718 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1719 .addReg(M0Base);
1720 }
1721
1722 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1723 // offset field) % 64. Some versions of the programming guide omit the m0
1724 // part, or claim it's from offset 0.
1725 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1726
1727 if (HasVSrc) {
1728 Register VSrc = MI.getOperand(1).getReg();
1729 MIB.addReg(VSrc);
1730
1731 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1732 return false;
1733 }
1734
1735 MIB.addImm(ImmOffset)
1736 .cloneMemRefs(MI);
1737
1738 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1739
1740 MI.eraseFromParent();
1741 return true;
1742}
1743
1744bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1745 bool IsAppend) const {
1746 Register PtrBase = MI.getOperand(2).getReg();
1747 LLT PtrTy = MRI->getType(PtrBase);
1748 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1749
1750 unsigned Offset;
1751 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1752
1753 // TODO: Should this try to look through readfirstlane like GWS?
1754 if (!isDSOffsetLegal(PtrBase, Offset)) {
1755 PtrBase = MI.getOperand(2).getReg();
1756 Offset = 0;
1757 }
1758
1759 MachineBasicBlock *MBB = MI.getParent();
1760 const DebugLoc &DL = MI.getDebugLoc();
1761 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1762
1763 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1764 .addReg(PtrBase);
1765 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1766 return false;
1767
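// m0 holds the base pointer (copied above); the instruction itself only takes
// the immediate offset and a gds bit (-1 for GDS, 0 for LDS).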
1768 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1769 .addImm(Offset)
1770 .addImm(IsGDS ? -1 : 0)
1771 .cloneMemRefs(MI);
1772 MI.eraseFromParent();
1773 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1774}
1775
1776bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1777 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1778 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1779 if (WGSize <= STI.getWavefrontSize()) {
1780 MachineBasicBlock *MBB = MI.getParent();
1781 const DebugLoc &DL = MI.getDebugLoc();
1782 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1783 MI.eraseFromParent();
1784 return true;
1785 }
1786 }
1787
1788 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1789 if (STI.hasSplitBarriers()) {
1790 MachineBasicBlock *MBB = MI.getParent();
1791 const DebugLoc &DL = MI.getDebugLoc();
1792 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1793 .addImm(AMDGPU::Barrier::WORKGROUP);
1794 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1795 .addImm(AMDGPU::Barrier::WORKGROUP);
1796 MI.eraseFromParent();
1797 return true;
1798 }
1799
1800 return selectImpl(MI, *CoverageInfo);
1801}
1802
1803static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1804 bool &IsTexFail) {
1805 if (TexFailCtrl)
1806 IsTexFail = true;
1807
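// Bit 0 of TexFailCtrl requests TFE and bit 1 requests LWE; any other set
// bits are rejected.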
1808 TFE = (TexFailCtrl & 0x1) ? true : false;
1809 TexFailCtrl &= ~(uint64_t)0x1;
1810 LWE = (TexFailCtrl & 0x2) ? true : false;
1811 TexFailCtrl &= ~(uint64_t)0x2;
1812
1813 return TexFailCtrl == 0;
1814}
1815
1816bool AMDGPUInstructionSelector::selectImageIntrinsic(
1817 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1818 MachineBasicBlock *MBB = MI.getParent();
1819 const DebugLoc &DL = MI.getDebugLoc();
1820
1821 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1822 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1823
1824 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1825 unsigned IntrOpcode = Intr->BaseOpcode;
1826 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1827 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1828 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1829
1830 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1831
1832 Register VDataIn, VDataOut;
1833 LLT VDataTy;
1834 int NumVDataDwords = -1;
1835 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1836 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1837
1838 bool Unorm;
1839 if (!BaseOpcode->Sampler)
1840 Unorm = true;
1841 else
1842 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1843
1844 bool TFE;
1845 bool LWE;
1846 bool IsTexFail = false;
1847 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1848 TFE, LWE, IsTexFail))
1849 return false;
1850
1851 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1852 const bool IsA16 = (Flags & 1) != 0;
1853 const bool IsG16 = (Flags & 2) != 0;
1854
1855 // A16 implies 16 bit gradients if subtarget doesn't support G16
1856 if (IsA16 && !STI.hasG16() && !IsG16)
1857 return false;
1858
1859 unsigned DMask = 0;
1860 unsigned DMaskLanes = 0;
1861
1862 if (BaseOpcode->Atomic) {
1863 VDataOut = MI.getOperand(0).getReg();
1864 VDataIn = MI.getOperand(2).getReg();
1865 LLT Ty = MRI->getType(VDataIn);
1866
1867 // Be careful to allow atomic swap on 16-bit element vectors.
1868 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1869 Ty.getSizeInBits() == 128 :
1870 Ty.getSizeInBits() == 64;
1871
1872 if (BaseOpcode->AtomicX2) {
1873 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1874
1875 DMask = Is64Bit ? 0xf : 0x3;
1876 NumVDataDwords = Is64Bit ? 4 : 2;
1877 } else {
1878 DMask = Is64Bit ? 0x3 : 0x1;
1879 NumVDataDwords = Is64Bit ? 2 : 1;
1880 }
1881 } else {
1882 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1883 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1884
1885 if (BaseOpcode->Store) {
1886 VDataIn = MI.getOperand(1).getReg();
1887 VDataTy = MRI->getType(VDataIn);
1888 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1889 } else {
1890 VDataOut = MI.getOperand(0).getReg();
1891 VDataTy = MRI->getType(VDataOut);
1892 NumVDataDwords = DMaskLanes;
1893
1894 if (IsD16 && !STI.hasUnpackedD16VMem())
1895 NumVDataDwords = (DMaskLanes + 1) / 2;
1896 }
1897 }
1898
1899 // Set G16 opcode
1900 if (Subtarget->hasG16() && IsG16) {
1901 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1902 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1903 assert(G16MappingInfo);
1904 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1905 }
1906
1907 // TODO: Check this in verifier.
1908 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1909
1910 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1911 if (BaseOpcode->Atomic)
1912 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1913 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1914 AMDGPU::CPol::VOLATILE))
1915 return false;
1916
1917 int NumVAddrRegs = 0;
1918 int NumVAddrDwords = 0;
1919 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1920 // Skip the $noregs and 0s inserted during legalization.
1921 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1922 if (!AddrOp.isReg())
1923 continue; // XXX - Break?
1924
1925 Register Addr = AddrOp.getReg();
1926 if (!Addr)
1927 break;
1928
1929 ++NumVAddrRegs;
1930 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1931 }
1932
1933 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1934 // NSA, these should have been packed into a single value in the first
1935 // address register
1936 const bool UseNSA =
1937 NumVAddrRegs != 1 &&
1938 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1939 : NumVAddrDwords == NumVAddrRegs);
1940 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1941 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1942 return false;
1943 }
1944
1945 if (IsTexFail)
1946 ++NumVDataDwords;
1947
1948 int Opcode = -1;
1949 if (IsGFX12Plus) {
1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1951 NumVDataDwords, NumVAddrDwords);
1952 } else if (IsGFX11Plus) {
1953 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1954 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1955 : AMDGPU::MIMGEncGfx11Default,
1956 NumVDataDwords, NumVAddrDwords);
1957 } else if (IsGFX10Plus) {
1958 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1959 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1960 : AMDGPU::MIMGEncGfx10Default,
1961 NumVDataDwords, NumVAddrDwords);
1962 } else {
1963 if (Subtarget->hasGFX90AInsts()) {
1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1965 NumVDataDwords, NumVAddrDwords);
1966 if (Opcode == -1) {
1967 LLVM_DEBUG(
1968 dbgs()
1969 << "requested image instruction is not supported on this GPU\n");
1970 return false;
1971 }
1972 }
1973 if (Opcode == -1 &&
1974 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1975 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1976 NumVDataDwords, NumVAddrDwords);
1977 if (Opcode == -1)
1978 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1979 NumVDataDwords, NumVAddrDwords);
1980 }
1981 if (Opcode == -1)
1982 return false;
1983
1984 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1985 .cloneMemRefs(MI);
1986
1987 if (VDataOut) {
1988 if (BaseOpcode->AtomicX2) {
1989 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1990
1991 Register TmpReg = MRI->createVirtualRegister(
1992 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1993 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1994
1995 MIB.addDef(TmpReg);
1996 if (!MRI->use_empty(VDataOut)) {
1997 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1998 .addReg(TmpReg, RegState::Kill, SubReg);
1999 }
2000
2001 } else {
2002 MIB.addDef(VDataOut); // vdata output
2003 }
2004 }
2005
2006 if (VDataIn)
2007 MIB.addReg(VDataIn); // vdata input
2008
2009 for (int I = 0; I != NumVAddrRegs; ++I) {
2010 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2011 if (SrcOp.isReg()) {
2012 assert(SrcOp.getReg() != 0);
2013 MIB.addReg(SrcOp.getReg());
2014 }
2015 }
2016
2017 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2018 if (BaseOpcode->Sampler)
2019 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2020
2021 MIB.addImm(DMask); // dmask
2022
2023 if (IsGFX10Plus)
2024 MIB.addImm(DimInfo->Encoding);
2025 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2026 MIB.addImm(Unorm);
2027
2028 MIB.addImm(CPol);
2029 MIB.addImm(IsA16 && // a16 or r128
2030 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2031 if (IsGFX10Plus)
2032 MIB.addImm(IsA16 ? -1 : 0);
2033
2034 if (!Subtarget->hasGFX90AInsts()) {
2035 MIB.addImm(TFE); // tfe
2036 } else if (TFE) {
2037 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2038 return false;
2039 }
2040
2041 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2042 MIB.addImm(LWE); // lwe
2043 if (!IsGFX10Plus)
2044 MIB.addImm(DimInfo->DA ? -1 : 0);
2045 if (BaseOpcode->HasD16)
2046 MIB.addImm(IsD16 ? -1 : 0);
2047
2048 if (IsTexFail) {
2049 // An image load instruction with TFE/LWE only conditionally writes to its
2050 // result registers. Initialize them to zero so that we always get well
2051 // defined result values.
2052 assert(VDataOut && !VDataIn);
2053 Register Tied = MRI->cloneVirtualRegister(VDataOut);
2054 Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2055 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
2056 .addImm(0);
2057 auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
2058 if (STI.usePRTStrictNull()) {
2059 // With enable-prt-strict-null enabled, initialize all result registers to
2060 // zero.
2061 auto RegSeq =
2062 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2063 for (auto Sub : Parts)
2064 RegSeq.addReg(Zero).addImm(Sub);
2065 } else {
2066 // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
2067 // result register.
2068 Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2069 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
2070 auto RegSeq =
2071 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
2072 for (auto Sub : Parts.drop_back(1))
2073 RegSeq.addReg(Undef).addImm(Sub);
2074 RegSeq.addReg(Zero).addImm(Parts.back());
2075 }
2076 MIB.addReg(Tied, RegState::Implicit);
2077 MIB->tieOperands(0, MIB->getNumOperands() - 1);
2078 }
2079
2080 MI.eraseFromParent();
2081 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2082 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2083 return true;
2084}
2085
2086// We need to handle this here because tablegen doesn't support matching
2087// instructions with multiple outputs.
2088bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2089 MachineInstr &MI) const {
2090 Register Dst0 = MI.getOperand(0).getReg();
2091 Register Dst1 = MI.getOperand(1).getReg();
2092
2093 const DebugLoc &DL = MI.getDebugLoc();
2094 MachineBasicBlock *MBB = MI.getParent();
2095
2096 Register Addr = MI.getOperand(3).getReg();
2097 Register Data0 = MI.getOperand(4).getReg();
2098 Register Data1 = MI.getOperand(5).getReg();
2099 unsigned Offset = MI.getOperand(6).getImm();
2100
2101 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2102 .addDef(Dst1)
2103 .addUse(Addr)
2104 .addUse(Data0)
2105 .addUse(Data1)
2106 .addImm(Offset)
2107 .cloneMemRefs(MI);
2108
2109 MI.eraseFromParent();
2110 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2111}
2112
2113bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2114 MachineInstr &I) const {
2115 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2116 switch (IntrinsicID) {
2117 case Intrinsic::amdgcn_end_cf:
2118 return selectEndCfIntrinsic(I);
2119 case Intrinsic::amdgcn_ds_ordered_add:
2120 case Intrinsic::amdgcn_ds_ordered_swap:
2121 return selectDSOrderedIntrinsic(I, IntrinsicID);
2122 case Intrinsic::amdgcn_ds_gws_init:
2123 case Intrinsic::amdgcn_ds_gws_barrier:
2124 case Intrinsic::amdgcn_ds_gws_sema_v:
2125 case Intrinsic::amdgcn_ds_gws_sema_br:
2126 case Intrinsic::amdgcn_ds_gws_sema_p:
2127 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2128 return selectDSGWSIntrinsic(I, IntrinsicID);
2129 case Intrinsic::amdgcn_ds_append:
2130 return selectDSAppendConsume(I, true);
2131 case Intrinsic::amdgcn_ds_consume:
2132 return selectDSAppendConsume(I, false);
2133 case Intrinsic::amdgcn_s_barrier:
2134 return selectSBarrier(I);
2135 case Intrinsic::amdgcn_raw_buffer_load_lds:
2136 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2137 case Intrinsic::amdgcn_struct_buffer_load_lds:
2138 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2139 return selectBufferLoadLds(I);
2140 case Intrinsic::amdgcn_global_load_lds:
2141 return selectGlobalLoadLds(I);
2142 case Intrinsic::amdgcn_exp_compr:
2143 if (!STI.hasCompressedExport()) {
2144 Function &F = I.getMF()->getFunction();
2145 DiagnosticInfoUnsupported NoFpRet(
2146 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2147 F.getContext().diagnose(NoFpRet);
2148 return false;
2149 }
2150 break;
2151 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2152 return selectDSBvhStackIntrinsic(I);
2153 case Intrinsic::amdgcn_s_barrier_init:
2154 case Intrinsic::amdgcn_s_barrier_join:
2155 case Intrinsic::amdgcn_s_wakeup_barrier:
2156 case Intrinsic::amdgcn_s_get_barrier_state:
2157 return selectNamedBarrierInst(I, IntrinsicID);
2158 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2159 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2160 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2161 case Intrinsic::amdgcn_s_barrier_leave:
2162 return selectSBarrierLeave(I);
2163 }
2164 return selectImpl(I, *CoverageInfo);
2165}
2166
2167bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2168 if (selectImpl(I, *CoverageInfo))
2169 return true;
2170
2171 MachineBasicBlock *BB = I.getParent();
2172 const DebugLoc &DL = I.getDebugLoc();
2173
2174 Register DstReg = I.getOperand(0).getReg();
2175 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2176 assert(Size <= 32 || Size == 64);
2177 const MachineOperand &CCOp = I.getOperand(1);
2178 Register CCReg = CCOp.getReg();
2179 if (!isVCC(CCReg, *MRI)) {
2180 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2181 AMDGPU::S_CSELECT_B32;
2182 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2183 .addReg(CCReg);
2184
2185 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2186 // bank, because it does not cover the register class that we use to represent
2187 // it. So we need to manually set the register class here.
2188 if (!MRI->getRegClassOrNull(CCReg))
2189 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2190 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2191 .add(I.getOperand(2))
2192 .add(I.getOperand(3));
2193
2194 bool Ret = false;
2195 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2196 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2197 I.eraseFromParent();
2198 return Ret;
2199 }
2200
2201 // Wide VGPR select should have been split in RegBankSelect.
2202 if (Size > 32)
2203 return false;
2204
2205 MachineInstr *Select =
2206 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2207 .addImm(0)
2208 .add(I.getOperand(3))
2209 .addImm(0)
2210 .add(I.getOperand(2))
2211 .add(I.getOperand(1));
2212
2213 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2214 I.eraseFromParent();
2215 return Ret;
2216}
2217
2218static int sizeToSubRegIndex(unsigned Size) {
2219 switch (Size) {
2220 case 32:
2221 return AMDGPU::sub0;
2222 case 64:
2223 return AMDGPU::sub0_sub1;
2224 case 96:
2225 return AMDGPU::sub0_sub1_sub2;
2226 case 128:
2227 return AMDGPU::sub0_sub1_sub2_sub3;
2228 case 256:
2229 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2230 default:
2231 if (Size < 32)
2232 return AMDGPU::sub0;
2233 if (Size > 256)
2234 return -1;
2235 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2236 }
2237}
2238
2239bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2240 Register DstReg = I.getOperand(0).getReg();
2241 Register SrcReg = I.getOperand(1).getReg();
2242 const LLT DstTy = MRI->getType(DstReg);
2243 const LLT SrcTy = MRI->getType(SrcReg);
2244 const LLT S1 = LLT::scalar(1);
2245
2246 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2247 const RegisterBank *DstRB;
2248 if (DstTy == S1) {
2249 // This is a special case. We don't treat s1 for legalization artifacts as
2250 // vcc booleans.
2251 DstRB = SrcRB;
2252 } else {
2253 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2254 if (SrcRB != DstRB)
2255 return false;
2256 }
2257
2258 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2259
2260 unsigned DstSize = DstTy.getSizeInBits();
2261 unsigned SrcSize = SrcTy.getSizeInBits();
2262
2263 const TargetRegisterClass *SrcRC =
2264 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2265 const TargetRegisterClass *DstRC =
2266 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2267 if (!SrcRC || !DstRC)
2268 return false;
2269
2270 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2271 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2272 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2273 return false;
2274 }
2275
2276 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2277 MachineBasicBlock *MBB = I.getParent();
2278 const DebugLoc &DL = I.getDebugLoc();
2279
2280 Register LoReg = MRI->createVirtualRegister(DstRC);
2281 Register HiReg = MRI->createVirtualRegister(DstRC);
2282 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2283 .addReg(SrcReg, 0, AMDGPU::sub0);
2284 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2285 .addReg(SrcReg, 0, AMDGPU::sub1);
2286
2287 if (IsVALU && STI.hasSDWA()) {
2288 // Write the low 16-bits of the high element into the high 16-bits of the
2289 // low element.
2290 MachineInstr *MovSDWA =
2291 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2292 .addImm(0) // $src0_modifiers
2293 .addReg(HiReg) // $src0
2294 .addImm(0) // $clamp
2295 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2296 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2297 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2298 .addReg(LoReg, RegState::Implicit);
2299 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2300 } else {
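// Without SDWA, pack the halves manually as (Hi << 16) | (Lo & 0xffff).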
2301 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2302 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2303 Register ImmReg = MRI->createVirtualRegister(DstRC);
2304 if (IsVALU) {
2305 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2306 .addImm(16)
2307 .addReg(HiReg);
2308 } else {
2309 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2310 .addReg(HiReg)
2311 .addImm(16)
2312 .setOperandDead(3); // Dead scc
2313 }
2314
2315 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2316 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2317 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2318
2319 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2320 .addImm(0xffff);
2321 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2322 .addReg(LoReg)
2323 .addReg(ImmReg);
2324 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2325 .addReg(TmpReg0)
2326 .addReg(TmpReg1);
2327
2328 if (!IsVALU) {
2329 And.setOperandDead(3); // Dead scc
2330 Or.setOperandDead(3); // Dead scc
2331 }
2332 }
2333
2334 I.eraseFromParent();
2335 return true;
2336 }
2337
2338 if (!DstTy.isScalar())
2339 return false;
2340
2341 if (SrcSize > 32) {
2342 int SubRegIdx = sizeToSubRegIndex(DstSize);
2343 if (SubRegIdx == -1)
2344 return false;
2345
2346 // Deal with weird cases where the class only partially supports the subreg
2347 // index.
2348 const TargetRegisterClass *SrcWithSubRC
2349 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2350 if (!SrcWithSubRC)
2351 return false;
2352
2353 if (SrcWithSubRC != SrcRC) {
2354 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2355 return false;
2356 }
2357
2358 I.getOperand(1).setSubReg(SubRegIdx);
2359 }
2360
2361 I.setDesc(TII.get(TargetOpcode::COPY));
2362 return true;
2363}
2364
2365/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2366static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2367 Mask = maskTrailingOnes<unsigned>(Size);
2368 int SignedMask = static_cast<int>(Mask);
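// Values in [-16, 64] can be encoded as AMDGPU inline constants, so the AND
// needs no extra literal dword.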
2369 return SignedMask >= -16 && SignedMask <= 64;
2370}
2371
2372// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2373const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2374 Register Reg, const MachineRegisterInfo &MRI,
2375 const TargetRegisterInfo &TRI) const {
2376 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2377 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2378 return RB;
2379
2380 // Ignore the type, since we don't use vcc in artifacts.
2381 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2382 return &RBI.getRegBankFromRegClass(*RC, LLT());
2383 return nullptr;
2384}
2385
2386bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2387 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2388 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2389 const DebugLoc &DL = I.getDebugLoc();
2390 MachineBasicBlock &MBB = *I.getParent();
2391 const Register DstReg = I.getOperand(0).getReg();
2392 const Register SrcReg = I.getOperand(1).getReg();
2393
2394 const LLT DstTy = MRI->getType(DstReg);
2395 const LLT SrcTy = MRI->getType(SrcReg);
2396 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2397 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2398 const unsigned DstSize = DstTy.getSizeInBits();
2399 if (!DstTy.isScalar())
2400 return false;
2401
2402 // Artifact casts should never use vcc.
2403 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2404
2405 // FIXME: This should probably be illegal and split earlier.
2406 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2407 if (DstSize <= 32)
2408 return selectCOPY(I);
2409
2410 const TargetRegisterClass *SrcRC =
2411 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2412 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2413 const TargetRegisterClass *DstRC =
2414 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2415
2416 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2417 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2418 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2419 .addReg(SrcReg)
2420 .addImm(AMDGPU::sub0)
2421 .addReg(UndefReg)
2422 .addImm(AMDGPU::sub1);
2423 I.eraseFromParent();
2424
2425 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2426 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2427 }
2428
2429 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2430 // 64-bit should have been split up in RegBankSelect
2431
2432 // Try to use an and with a mask if it will save code size.
2433 unsigned Mask;
2434 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2435 MachineInstr *ExtI =
2436 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2437 .addImm(Mask)
2438 .addReg(SrcReg);
2439 I.eraseFromParent();
2440 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2441 }
2442
2443 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2444 MachineInstr *ExtI =
2445 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2446 .addReg(SrcReg)
2447 .addImm(0) // Offset
2448 .addImm(SrcSize); // Width
2449 I.eraseFromParent();
2450 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2451 }
2452
2453 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2454 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2455 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2456 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2457 return false;
2458
2459 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2460 const unsigned SextOpc = SrcSize == 8 ?
2461 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2462 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2463 .addReg(SrcReg);
2464 I.eraseFromParent();
2465 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2466 }
2467
2468 // Using a single 32-bit SALU to calculate the high half is smaller than
2469 // S_BFE with a literal constant operand.
2470 if (DstSize > 32 && SrcSize == 32) {
2471 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2472 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2473 if (Signed) {
2474 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2475 .addReg(SrcReg, 0, SubReg)
2476 .addImm(31)
2477 .setOperandDead(3); // Dead scc
2478 } else {
2479 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2480 .addImm(0);
2481 }
2482 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2483 .addReg(SrcReg, 0, SubReg)
2484 .addImm(AMDGPU::sub0)
2485 .addReg(HiReg)
2486 .addImm(AMDGPU::sub1);
2487 I.eraseFromParent();
2488 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2489 *MRI);
2490 }
2491
2492 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2493 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2494
2495 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2496 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2497 // We need a 64-bit register source, but the high bits don't matter.
2498 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2499 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2500 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2501
2502 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2503 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2504 .addReg(SrcReg, 0, SubReg)
2505 .addImm(AMDGPU::sub0)
2506 .addReg(UndefReg)
2507 .addImm(AMDGPU::sub1);
2508
2509 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2510 .addReg(ExtReg)
2511 .addImm(SrcSize << 16);
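// e.g. extending from 16 bits encodes 16 << 16 = 0x100000 (offset 0, width 16).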
2512
2513 I.eraseFromParent();
2514 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2515 }
2516
2517 unsigned Mask;
2518 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2519 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2520 .addReg(SrcReg)
2521 .addImm(Mask)
2522 .setOperandDead(3); // Dead scc
2523 } else {
2524 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2525 .addReg(SrcReg)
2526 .addImm(SrcSize << 16);
2527 }
2528
2529 I.eraseFromParent();
2530 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2531 }
2532
2533 return false;
2534}
2535
2536static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2537 Register &Out) {
2538 Register LShlSrc;
2539 if (mi_match(In, MRI,
2540 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2541 Out = LShlSrc;
2542 return true;
2543 }
2544 return false;
2545}
2546
2547bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2548 if (!Subtarget->hasSALUFloatInsts())
2549 return false;
2550
2551 Register Dst = I.getOperand(0).getReg();
2552 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2553 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2554 return false;
2555
2556 Register Src = I.getOperand(1).getReg();
2557
2558 if (MRI->getType(Dst) == LLT::scalar(32) &&
2559 MRI->getType(Src) == LLT::scalar(16)) {
2560 if (isExtractHiElt(*MRI, Src, Src)) {
2561 MachineBasicBlock *BB = I.getParent();
2562 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2563 .addUse(Src);
2564 I.eraseFromParent();
2565 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2566 }
2567 }
2568
2569 return false;
2570}
2571
2572bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2573 MachineBasicBlock *BB = I.getParent();
2574 MachineOperand &ImmOp = I.getOperand(1);
2575 Register DstReg = I.getOperand(0).getReg();
2576 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2577 bool IsFP = false;
2578
2579 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2580 if (ImmOp.isFPImm()) {
2581 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2582 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2583 IsFP = true;
2584 } else if (ImmOp.isCImm()) {
2585 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2586 } else {
2587 llvm_unreachable("Not supported by g_constants");
2588 }
2589
2590 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2591 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2592
2593 unsigned Opcode;
2594 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2595 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2596 } else if (Size == 64 &&
2597 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2598 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2599 I.setDesc(TII.get(Opcode));
2600 I.addImplicitDefUseOperands(*MF);
2601 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2602 } else {
2603 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2604
2605 // We should never produce s1 values on banks other than VCC. If the user of
2606 // this already constrained the register, we may incorrectly think it's VCC
2607 // if it wasn't originally.
2608 if (Size == 1)
2609 return false;
2610 }
2611
2612 if (Size != 64) {
2613 I.setDesc(TII.get(Opcode));
2614 I.addImplicitDefUseOperands(*MF);
2615 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2616 }
2617
2618 const DebugLoc &DL = I.getDebugLoc();
2619
2620 APInt Imm(Size, I.getOperand(1).getImm());
2621
2622 MachineInstr *ResInst;
2623 if (IsSgpr && TII.isInlineConstant(Imm)) {
2624 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2625 .addImm(I.getOperand(1).getImm());
2626 } else {
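// Split the 64-bit immediate into two 32-bit moves and recombine the halves
// with a REG_SEQUENCE.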
2627 const TargetRegisterClass *RC = IsSgpr ?
2628 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2629 Register LoReg = MRI->createVirtualRegister(RC);
2630 Register HiReg = MRI->createVirtualRegister(RC);
2631
2632 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2633 .addImm(Imm.trunc(32).getZExtValue());
2634
2635 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2636 .addImm(Imm.ashr(32).getZExtValue());
2637
2638 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2639 .addReg(LoReg)
2640 .addImm(AMDGPU::sub0)
2641 .addReg(HiReg)
2642 .addImm(AMDGPU::sub1);
2643 }
2644
2645 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2646 // work for target independent opcodes
2647 I.eraseFromParent();
2648 const TargetRegisterClass *DstRC =
2649 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2650 if (!DstRC)
2651 return true;
2652 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2653}
2654
2655bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2656 // Only manually handle the f64 SGPR case.
2657 //
2658 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2659 // the bit ops theoretically have a second result due to the implicit def of
2660 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2661 // that is easy by disabling the check. The result works, but uses a
2662 // nonsensical sreg32orlds_and_sreg_1 regclass.
2663 //
2664 // The DAG emitter is more problematic, and incorrectly adds both results of
2665 // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2666
2667 Register Dst = MI.getOperand(0).getReg();
2668 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2669 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2670 MRI->getType(Dst) != LLT::scalar(64))
2671 return false;
2672
2673 Register Src = MI.getOperand(1).getReg();
2674 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2675 if (Fabs)
2676 Src = Fabs->getOperand(1).getReg();
2677
2678 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2679 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2680 return false;
2681
2682 MachineBasicBlock *BB = MI.getParent();
2683 const DebugLoc &DL = MI.getDebugLoc();
2684 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2685 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2686 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2687 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2688
2689 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2690 .addReg(Src, 0, AMDGPU::sub0);
2691 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2692 .addReg(Src, 0, AMDGPU::sub1);
2693 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2694 .addImm(0x80000000);
2695
2696 // Set or toggle sign bit.
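// 0x80000000 is the sign bit of the high half of the f64: OR sets it
// (fneg(fabs(x))), XOR flips it (plain fneg).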
2697 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2698 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2699 .addReg(HiReg)
2700 .addReg(ConstReg)
2701 .setOperandDead(3); // Dead scc
2702 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2703 .addReg(LoReg)
2704 .addImm(AMDGPU::sub0)
2705 .addReg(OpReg)
2706 .addImm(AMDGPU::sub1);
2707 MI.eraseFromParent();
2708 return true;
2709}
2710
2711// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2712bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2713 Register Dst = MI.getOperand(0).getReg();
2714 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2715 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2716 MRI->getType(Dst) != LLT::scalar(64))
2717 return false;
2718
2719 Register Src = MI.getOperand(1).getReg();
2720 MachineBasicBlock *BB = MI.getParent();
2721 const DebugLoc &DL = MI.getDebugLoc();
2722 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2723 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2724 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2725 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2726
2727 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2728 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2729 return false;
2730
2731 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2732 .addReg(Src, 0, AMDGPU::sub0);
2733 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2734 .addReg(Src, 0, AMDGPU::sub1);
2735 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2736 .addImm(0x7fffffff);
2737
2738 // Clear sign bit.
2739 // TODO: Should this use S_BITSET0_*?
2740 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2741 .addReg(HiReg)
2742 .addReg(ConstReg)
2743 .setOperandDead(3); // Dead scc
2744 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2745 .addReg(LoReg)
2746 .addImm(AMDGPU::sub0)
2747 .addReg(OpReg)
2748 .addImm(AMDGPU::sub1);
2749
2750 MI.eraseFromParent();
2751 return true;
2752}
2753
2754static bool isConstant(const MachineInstr &MI) {
2755 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2756}
2757
2758void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2759 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2760
2761 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2762 const MachineInstr *PtrMI =
2763 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2764
2765 assert(PtrMI);
2766
2767 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2768 return;
2769
2770 GEPInfo GEPInfo;
2771
2772 for (unsigned i = 1; i != 3; ++i) {
2773 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2774 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2775 assert(OpDef);
2776 if (i == 2 && isConstant(*OpDef)) {
2777 // TODO: Could handle constant base + variable offset, but a combine
2778 // probably should have commuted it.
2779 assert(GEPInfo.Imm == 0);
2780 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2781 continue;
2782 }
2783 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2784 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2785 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2786 else
2787 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2788 }
2789
2790 AddrInfo.push_back(GEPInfo);
2791 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2792}
2793
2794bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2795 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2796}
2797
2798bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2799 if (!MI.hasOneMemOperand())
2800 return false;
2801
2802 const MachineMemOperand *MMO = *MI.memoperands_begin();
2803 const Value *Ptr = MMO->getValue();
2804
2805 // UndefValue means this is a load of a kernel input. These are uniform.
2806 // Sometimes LDS instructions have constant pointers.
2807 // If Ptr is null, then that means this mem operand contains a
2808 // PseudoSourceValue like GOT.
2809 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2810 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2811 return true;
2812
2813 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2814 return true;
2815
2816 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2817 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2818 AMDGPU::SGPRRegBankID;
2819
2820 const Instruction *I = dyn_cast<Instruction>(Ptr);
2821 return I && I->getMetadata("amdgpu.uniform");
2822}
2823
2824bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2825 for (const GEPInfo &GEPInfo : AddrInfo) {
2826 if (!GEPInfo.VgprParts.empty())
2827 return true;
2828 }
2829 return false;
2830}
2831
2832void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2833 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2834 unsigned AS = PtrTy.getAddressSpace();
2835 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2836 STI.ldsRequiresM0Init()) {
2837 MachineBasicBlock *BB = I.getParent();
2838
2839 // If DS instructions require M0 initialization, insert it before selecting.
2840 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2841 .addImm(-1);
2842 }
2843}
2844
2845bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2846 MachineInstr &I) const {
2847 initM0(I);
2848 return selectImpl(I, *CoverageInfo);
2849}
2850
2851static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2852 if (Reg.isPhysical())
2853 return false;
2854
2855 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2856 const unsigned Opcode = MI.getOpcode();
2857
2858 if (Opcode == AMDGPU::COPY)
2859 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2860
2861 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2862 Opcode == AMDGPU::G_XOR)
2863 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2864 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2865
2866 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2867 return GI->is(Intrinsic::amdgcn_class);
2868
2869 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2870}
2871
2872bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2873 MachineBasicBlock *BB = I.getParent();
2874 MachineOperand &CondOp = I.getOperand(0);
2875 Register CondReg = CondOp.getReg();
2876 const DebugLoc &DL = I.getDebugLoc();
2877
2878 unsigned BrOpcode;
2879 Register CondPhysReg;
2880 const TargetRegisterClass *ConstrainRC;
2881
2882 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2883 // whether the branch is uniform when selecting the instruction. In
2884 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2885 // RegBankSelect knows what it's doing if the branch condition is scc, even
2886 // though it currently does not.
2887 if (!isVCC(CondReg, *MRI)) {
2888 if (MRI->getType(CondReg) != LLT::scalar(32))
2889 return false;
2890
2891 CondPhysReg = AMDGPU::SCC;
2892 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2893 ConstrainRC = &AMDGPU::SReg_32RegClass;
2894 } else {
2895 // FIXME: Should scc->vcc copies be ANDed with exec?
2896
2897 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2898 // need to insert an and with exec.
2899 if (!isVCmpResult(CondReg, *MRI)) {
2900 const bool Is64 = STI.isWave64();
2901 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2902 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2903
2904 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2905 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2906 .addReg(CondReg)
2907 .addReg(Exec)
2908 .setOperandDead(3); // Dead scc
2909 CondReg = TmpReg;
2910 }
2911
2912 CondPhysReg = TRI.getVCC();
2913 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2914 ConstrainRC = TRI.getBoolRC();
2915 }
2916
2917 if (!MRI->getRegClassOrNull(CondReg))
2918 MRI->setRegClass(CondReg, ConstrainRC);
2919
2920 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2921 .addReg(CondReg);
2922 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2923 .addMBB(I.getOperand(1).getMBB());
2924
2925 I.eraseFromParent();
2926 return true;
2927}
2928
2929bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2930 MachineInstr &I) const {
2931 Register DstReg = I.getOperand(0).getReg();
2932 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2933 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2934 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2935 if (IsVGPR)
2936 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2937
2938 return RBI.constrainGenericRegister(
2939 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2940}
2941
2942bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2943 Register DstReg = I.getOperand(0).getReg();
2944 Register SrcReg = I.getOperand(1).getReg();
2945 Register MaskReg = I.getOperand(2).getReg();
2946 LLT Ty = MRI->getType(DstReg);
2947 LLT MaskTy = MRI->getType(MaskReg);
2948 MachineBasicBlock *BB = I.getParent();
2949 const DebugLoc &DL = I.getDebugLoc();
2950
2951 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2952 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2953 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2954 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2955 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2956 return false;
2957
2958 // Try to avoid emitting a bit operation when we only need to touch half of
2959 // the 64-bit pointer.
2960 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2961 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2962 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2963
2964 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2965 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2966
2967 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2968 !CanCopyLow32 && !CanCopyHi32) {
2969 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2970 .addReg(SrcReg)
2971 .addReg(MaskReg)
2972 .setOperandDead(3); // Dead scc
2973 I.eraseFromParent();
2974 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2975 }
2976
2977 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2978 const TargetRegisterClass &RegRC
2979 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2980
2981 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2982 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2983 const TargetRegisterClass *MaskRC =
2984 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2985
2986 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2987 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2988 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2989 return false;
2990
2991 if (Ty.getSizeInBits() == 32) {
2992 assert(MaskTy.getSizeInBits() == 32 &&
2993 "ptrmask should have been narrowed during legalize");
2994
2995 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2996 .addReg(SrcReg)
2997 .addReg(MaskReg);
2998
2999 if (!IsVGPR)
3000 NewOp.setOperandDead(3); // Dead scc
3001 I.eraseFromParent();
3002 return true;
3003 }
3004
3005 Register HiReg = MRI->createVirtualRegister(&RegRC);
3006 Register LoReg = MRI->createVirtualRegister(&RegRC);
3007
3008 // Extract the subregisters from the source pointer.
3009 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
3010 .addReg(SrcReg, 0, AMDGPU::sub0);
3011 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
3012 .addReg(SrcReg, 0, AMDGPU::sub1);
3013
3014 Register MaskedLo, MaskedHi;
3015
3016 if (CanCopyLow32) {
3017 // If all the bits in the low half are 1, we only need a copy for it.
3018 MaskedLo = LoReg;
3019 } else {
3020 // Extract the mask subregister and apply the and.
3021 Register MaskLo = MRI->createVirtualRegister(&RegRC);
3022 MaskedLo = MRI->createVirtualRegister(&RegRC);
3023
3024 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
3025 .addReg(MaskReg, 0, AMDGPU::sub0);
3026 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
3027 .addReg(LoReg)
3028 .addReg(MaskLo);
3029 }
3030
3031 if (CanCopyHi32) {
3032 // If all the bits in the high half are 1, we only need a copy for it.
3033 MaskedHi = HiReg;
3034 } else {
3035 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3036 MaskedHi = MRI->createVirtualRegister(&RegRC);
3037
3038 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3039 .addReg(MaskReg, 0, AMDGPU::sub1);
3040 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3041 .addReg(HiReg)
3042 .addReg(MaskHi);
3043 }
3044
3045 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3046 .addReg(MaskedLo)
3047 .addImm(AMDGPU::sub0)
3048 .addReg(MaskedHi)
3049 .addImm(AMDGPU::sub1);
3050 I.eraseFromParent();
3051 return true;
3052}
3053
3054/// Return the register to use for the index value, and the subregister to use
3055/// for the indirectly accessed register.
3056static std::pair<Register, unsigned>
3057computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3058 const TargetRegisterClass *SuperRC, Register IdxReg,
3059 unsigned EltSize, GISelKnownBits &KnownBits) {
3060 Register IdxBaseReg;
3061 int Offset;
3062
3063 std::tie(IdxBaseReg, Offset) =
3064 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3065 if (IdxBaseReg == AMDGPU::NoRegister) {
3066 // This will happen if the index is a known constant. This should ordinarily
3067 // be legalized out, but handle it as a register just in case.
3068 assert(Offset == 0);
3069 IdxBaseReg = IdxReg;
3070 }
3071
3072 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3073
3074 // Skip out of bounds offsets, or else we would end up using an undefined
3075 // register.
3076 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3077 return std::pair(IdxReg, SubRegs[0]);
3078 return std::pair(IdxBaseReg, SubRegs[Offset]);
3079}
3080
3081bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3082 MachineInstr &MI) const {
3083 Register DstReg = MI.getOperand(0).getReg();
3084 Register SrcReg = MI.getOperand(1).getReg();
3085 Register IdxReg = MI.getOperand(2).getReg();
3086
3087 LLT DstTy = MRI->getType(DstReg);
3088 LLT SrcTy = MRI->getType(SrcReg);
3089
3090 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3091 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3092 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3093
3094 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3095 // into a waterfall loop.
3096 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3097 return false;
3098
3099 const TargetRegisterClass *SrcRC =
3100 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3101 const TargetRegisterClass *DstRC =
3102 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3103 if (!SrcRC || !DstRC)
3104 return false;
3105 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3106 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3107 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3108 return false;
3109
3110 MachineBasicBlock *BB = MI.getParent();
3111 const DebugLoc &DL = MI.getDebugLoc();
3112 const bool Is64 = DstTy.getSizeInBits() == 64;
3113
3114 unsigned SubReg;
3115 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3116 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3117
3118 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3119 if (DstTy.getSizeInBits() != 32 && !Is64)
3120 return false;
3121
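// S_MOVRELS reads the dynamic source index from m0.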
3122 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3123 .addReg(IdxReg);
3124
3125 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3126 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3127 .addReg(SrcReg, 0, SubReg)
3128 .addReg(SrcReg, RegState::Implicit);
3129 MI.eraseFromParent();
3130 return true;
3131 }
3132
3133 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3134 return false;
3135
3136 if (!STI.useVGPRIndexMode()) {
3137 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3138 .addReg(IdxReg);
3139 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3140 .addReg(SrcReg, 0, SubReg)
3141 .addReg(SrcReg, RegState::Implicit);
3142 MI.eraseFromParent();
3143 return true;
3144 }
3145
3146 const MCInstrDesc &GPRIDXDesc =
3147 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3148 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3149 .addReg(SrcReg)
3150 .addReg(IdxReg)
3151 .addImm(SubReg);
3152
3153 MI.eraseFromParent();
3154 return true;
3155}
3156
3157// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3158bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3159 MachineInstr &MI) const {
3160 Register DstReg = MI.getOperand(0).getReg();
3161 Register VecReg = MI.getOperand(1).getReg();
3162 Register ValReg = MI.getOperand(2).getReg();
3163 Register IdxReg = MI.getOperand(3).getReg();
3164
3165 LLT VecTy = MRI->getType(DstReg);
3166 LLT ValTy = MRI->getType(ValReg);
3167 unsigned VecSize = VecTy.getSizeInBits();
3168 unsigned ValSize = ValTy.getSizeInBits();
3169
3170 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3171 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3172 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3173
3174 assert(VecTy.getElementType() == ValTy);
3175
3176 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3177 // into a waterfall loop.
3178 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3179 return false;
3180
3181 const TargetRegisterClass *VecRC =
3182 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3183 const TargetRegisterClass *ValRC =
3184 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3185
3186 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3187 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3188 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3189 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3190 return false;
3191
3192 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3193 return false;
3194
3195 unsigned SubReg;
3196 std::tie(IdxReg, SubReg) =
3197 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3198
3199 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3200 STI.useVGPRIndexMode();
3201
3202 MachineBasicBlock *BB = MI.getParent();
3203 const DebugLoc &DL = MI.getDebugLoc();
3204
3205 if (!IndexMode) {
3206 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3207 .addReg(IdxReg);
3208
3209 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3210 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3211 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3212 .addReg(VecReg)
3213 .addReg(ValReg)
3214 .addImm(SubReg);
3215 MI.eraseFromParent();
3216 return true;
3217 }
3218
3219 const MCInstrDesc &GPRIDXDesc =
3220 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3221 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3222 .addReg(VecReg)
3223 .addReg(ValReg)
3224 .addReg(IdxReg)
3225 .addImm(SubReg);
3226
3227 MI.eraseFromParent();
3228 return true;
3229}
3230
3231bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3233 unsigned Opc;
3234 unsigned Size = MI.getOperand(3).getImm();
3235
3236 // The struct intrinsic variants add one additional operand over raw.
3237 const bool HasVIndex = MI.getNumOperands() == 9;
3238 Register VIndex;
3239 int OpOffset = 0;
3240 if (HasVIndex) {
3241 VIndex = MI.getOperand(4).getReg();
3242 OpOffset = 1;
3243 }
3244
3245 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3246 std::optional<ValueAndVReg> MaybeVOffset =
3247 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3248 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3249
3250 switch (Size) {
3251 default:
3252 return false;
3253 case 1:
3254 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3255 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3256 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3257 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3258 break;
3259 case 2:
3260 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3261 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3262 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3263 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3264 break;
3265 case 4:
3266 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3267 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3268 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3269 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3270 break;
3271 }
3272
3273 MachineBasicBlock *MBB = MI.getParent();
3274 const DebugLoc &DL = MI.getDebugLoc();
3275 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3276 .add(MI.getOperand(2));
3277
3278 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3279
3280 if (HasVIndex && HasVOffset) {
3281 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3282 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3283 .addReg(VIndex)
3284 .addImm(AMDGPU::sub0)
3285 .addReg(VOffset)
3286 .addImm(AMDGPU::sub1);
3287
3288 MIB.addReg(IdxReg);
3289 } else if (HasVIndex) {
3290 MIB.addReg(VIndex);
3291 } else if (HasVOffset) {
3292 MIB.addReg(VOffset);
3293 }
3294
3295 MIB.add(MI.getOperand(1)); // rsrc
3296 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3297 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3298 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3299 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3300 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3301
3302 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3303 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3304 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3305 MachinePointerInfo StorePtrI = LoadPtrI;
3306 StorePtrI.V = nullptr;
3307 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3308
3309 auto F = LoadMMO->getFlags() &
3310 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3311 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3312 Size, LoadMMO->getBaseAlign());
3313
3314 MachineMemOperand *StoreMMO =
3315 MF->getMachineMemOperand(StorePtrI, MachineMemOperand::MOStore,
3316 sizeof(int32_t), LoadMMO->getBaseAlign());
3317
3318 MIB.setMemRefs({LoadMMO, StoreMMO});
3319
3320 MI.eraseFromParent();
3321 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3322}
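// Example of the opcode choice above: a raw-buffer form (8 operands, so
// HasVIndex is false) with a non-zero register voffset and Size == 4 selects
// BUFFER_LOAD_DWORD_LDS_OFFEN; the LDS destination base from operand 2 is
// always routed through M0, and the two memory operands describe the buffer
// read plus the 32-bit LDS write.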
3323
3324/// Match a zero extend from a 32-bit value to 64-bits.
3325 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3326 Register ZExtSrc;
3327 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3328 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3329
3330 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3331 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3332 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3333 return Register();
3334
3335 assert(Def->getNumOperands() == 3 &&
3336 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3337 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3338 return Def->getOperand(1).getReg();
3339 }
3340
3341 return Register();
3342}
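// For example, both of these forms (hypothetical %x) return %x:
//   %r:_(s64) = G_ZEXT %x:_(s32)
//   %r:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)  ; %zero == 0
// any other producer yields an invalid Register().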
3343
3344 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3345 unsigned Opc;
3346 unsigned Size = MI.getOperand(3).getImm();
3347
3348 switch (Size) {
3349 default:
3350 return false;
3351 case 1:
3352 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3353 break;
3354 case 2:
3355 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3356 break;
3357 case 4:
3358 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3359 break;
3360 }
3361
3362 MachineBasicBlock *MBB = MI.getParent();
3363 const DebugLoc &DL = MI.getDebugLoc();
3364 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3365 .add(MI.getOperand(2));
3366
3367 Register Addr = MI.getOperand(1).getReg();
3368 Register VOffset;
3369 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3370 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3371 if (!isSGPR(Addr)) {
3372 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3373 if (isSGPR(AddrDef->Reg)) {
3374 Addr = AddrDef->Reg;
3375 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3376 Register SAddr =
3377 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3378 if (isSGPR(SAddr)) {
3379 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3380 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3381 Addr = SAddr;
3382 VOffset = Off;
3383 }
3384 }
3385 }
3386 }
3387
3388 if (isSGPR(Addr)) {
3389 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3390 if (!VOffset) {
3391 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3392 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3393 .addImm(0);
3394 }
3395 }
3396
3397 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3398 .addReg(Addr);
3399
3400 if (isSGPR(Addr))
3401 MIB.addReg(VOffset);
3402
3403 MIB.add(MI.getOperand(4)) // offset
3404 .add(MI.getOperand(5)); // cpol
3405
3406 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3407 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3408 LoadPtrI.Offset = MI.getOperand(4).getImm();
3409 MachinePointerInfo StorePtrI = LoadPtrI;
3412 auto F = LoadMMO->getFlags() &
3413 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3414 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3415 Size, LoadMMO->getBaseAlign());
3416 MachineMemOperand *StoreMMO =
3417 MF->getMachineMemOperand(StorePtrI, MachineMemOperand::MOStore,
3418 sizeof(int32_t), Align(4));
3419
3420 MIB.setMemRefs({LoadMMO, StoreMMO});
3421
3422 MI.eraseFromParent();
3423 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3424}
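// Illustrative address split (names hypothetical): for
//   %addr:_(p1) = G_PTR_ADD %sgpr_base, (G_ZEXT %voff:_(s32))
// the _SADDR form of the GLOBAL_LOAD_LDS opcode is used with saddr =
// %sgpr_base and voffset = %voff; for a purely scalar address a zero voffset
// is materialized with V_MOV_B32, otherwise the plain VGPR-address form is
// kept.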
3425
3426 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3427 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3428 MI.removeOperand(1);
3429 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3430 return true;
3431}
3432
3433bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3434 unsigned Opc;
3435 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3436 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3437 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3438 break;
3439 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3440 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3441 break;
3442 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3443 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3444 break;
3445 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3446 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3447 break;
3448 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3449 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3450 break;
3451 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3452 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3453 break;
3454 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3455 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3456 break;
3457 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3458 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3459 break;
3460 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3461 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3462 break;
3463 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3464 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3465 break;
3466 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3467 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3468 break;
3469 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3470 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3471 break;
3472 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3473 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3474 break;
3475 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3476 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3477 break;
3478 default:
3479 llvm_unreachable("unhandled smfmac intrinsic");
3480 }
3481
3482 auto VDst_In = MI.getOperand(4);
3483
3484 MI.setDesc(TII.get(Opc));
3485 MI.removeOperand(4); // VDst_In
3486 MI.removeOperand(1); // Intrinsic ID
3487 MI.addOperand(VDst_In); // Readd VDst_In to the end
3488 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3489 return true;
3490}
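// Note on the operand shuffle above: the G_INTRINSIC form carries the
// accumulator (VDst_In) as operand 4, but the V_SMFMAC_* pseudos expect it as
// the trailing operand, so it is removed and re-appended after the opcode is
// rewritten.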
3491
3492bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3493 Register DstReg = MI.getOperand(0).getReg();
3494 Register SrcReg = MI.getOperand(1).getReg();
3495 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3496 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3497 MachineBasicBlock *MBB = MI.getParent();
3498 const DebugLoc &DL = MI.getDebugLoc();
3499
3500 if (IsVALU) {
3501 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3502 .addImm(Subtarget->getWavefrontSizeLog2())
3503 .addReg(SrcReg);
3504 } else {
3505 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3506 .addReg(SrcReg)
3507 .addImm(Subtarget->getWavefrontSizeLog2())
3508 .setOperandDead(3); // Dead scc
3509 }
3510
3511 const TargetRegisterClass &RC =
3512 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3513 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3514 return false;
3515
3516 MI.eraseFromParent();
3517 return true;
3518}
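// The wave address is the per-wave scratch base, i.e. the stack value shifted
// right by log2(wavefront size): V_LSHRREV_B32_e64 on the VGPR path and
// S_LSHR_B32 (with dead SCC) on the SGPR path, as selected above.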
3519
3520bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3521 Register SrcReg = MI.getOperand(0).getReg();
3522 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3523 return false;
3524
3525 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3526 Register SP =
3527 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3528 Register WaveAddr = getWaveAddress(DefMI);
3529 MachineBasicBlock *MBB = MI.getParent();
3530 const DebugLoc &DL = MI.getDebugLoc();
3531
3532 if (!WaveAddr) {
3533 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3534 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3535 .addReg(SrcReg)
3536 .addImm(Subtarget->getWavefrontSizeLog2())
3537 .setOperandDead(3); // Dead scc
3538 }
3539
3540 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3541 .addReg(WaveAddr);
3542
3543 MI.eraseFromParent();
3544 return true;
3545}
3546
3547 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3548
3549 if (!I.isPreISelOpcode()) {
3550 if (I.isCopy())
3551 return selectCOPY(I);
3552 return true;
3553 }
3554
3555 switch (I.getOpcode()) {
3556 case TargetOpcode::G_AND:
3557 case TargetOpcode::G_OR:
3558 case TargetOpcode::G_XOR:
3559 if (selectImpl(I, *CoverageInfo))
3560 return true;
3561 return selectG_AND_OR_XOR(I);
3562 case TargetOpcode::G_ADD:
3563 case TargetOpcode::G_SUB:
3564 if (selectImpl(I, *CoverageInfo))
3565 return true;
3566 return selectG_ADD_SUB(I);
3567 case TargetOpcode::G_UADDO:
3568 case TargetOpcode::G_USUBO:
3569 case TargetOpcode::G_UADDE:
3570 case TargetOpcode::G_USUBE:
3571 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3572 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3573 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3574 return selectG_AMDGPU_MAD_64_32(I);
3575 case TargetOpcode::G_INTTOPTR:
3576 case TargetOpcode::G_BITCAST:
3577 case TargetOpcode::G_PTRTOINT:
3578 return selectCOPY(I);
3579 case TargetOpcode::G_CONSTANT:
3580 case TargetOpcode::G_FCONSTANT:
3581 return selectG_CONSTANT(I);
3582 case TargetOpcode::G_FNEG:
3583 if (selectImpl(I, *CoverageInfo))
3584 return true;
3585 return selectG_FNEG(I);
3586 case TargetOpcode::G_FABS:
3587 if (selectImpl(I, *CoverageInfo))
3588 return true;
3589 return selectG_FABS(I);
3590 case TargetOpcode::G_EXTRACT:
3591 return selectG_EXTRACT(I);
3592 case TargetOpcode::G_MERGE_VALUES:
3593 case TargetOpcode::G_CONCAT_VECTORS:
3594 return selectG_MERGE_VALUES(I);
3595 case TargetOpcode::G_UNMERGE_VALUES:
3596 return selectG_UNMERGE_VALUES(I);
3597 case TargetOpcode::G_BUILD_VECTOR:
3598 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3599 return selectG_BUILD_VECTOR(I);
3600 case TargetOpcode::G_PTR_ADD:
3601 if (selectImpl(I, *CoverageInfo))
3602 return true;
3603 return selectG_PTR_ADD(I);
3604 case TargetOpcode::G_IMPLICIT_DEF:
3605 return selectG_IMPLICIT_DEF(I);
3606 case TargetOpcode::G_FREEZE:
3607 return selectCOPY(I);
3608 case TargetOpcode::G_INSERT:
3609 return selectG_INSERT(I);
3610 case TargetOpcode::G_INTRINSIC:
3611 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3612 return selectG_INTRINSIC(I);
3613 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3614 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3615 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3616 case TargetOpcode::G_ICMP:
3617 case TargetOpcode::G_FCMP:
3618 if (selectG_ICMP_or_FCMP(I))
3619 return true;
3620 return selectImpl(I, *CoverageInfo);
3621 case TargetOpcode::G_LOAD:
3622 case TargetOpcode::G_STORE:
3623 case TargetOpcode::G_ATOMIC_CMPXCHG:
3624 case TargetOpcode::G_ATOMICRMW_XCHG:
3625 case TargetOpcode::G_ATOMICRMW_ADD:
3626 case TargetOpcode::G_ATOMICRMW_SUB:
3627 case TargetOpcode::G_ATOMICRMW_AND:
3628 case TargetOpcode::G_ATOMICRMW_OR:
3629 case TargetOpcode::G_ATOMICRMW_XOR:
3630 case TargetOpcode::G_ATOMICRMW_MIN:
3631 case TargetOpcode::G_ATOMICRMW_MAX:
3632 case TargetOpcode::G_ATOMICRMW_UMIN:
3633 case TargetOpcode::G_ATOMICRMW_UMAX:
3634 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3635 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3636 case TargetOpcode::G_ATOMICRMW_FADD:
3637 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3638 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3639 return selectG_LOAD_STORE_ATOMICRMW(I);
3640 case TargetOpcode::G_SELECT:
3641 return selectG_SELECT(I);
3642 case TargetOpcode::G_TRUNC:
3643 return selectG_TRUNC(I);
3644 case TargetOpcode::G_SEXT:
3645 case TargetOpcode::G_ZEXT:
3646 case TargetOpcode::G_ANYEXT:
3647 case TargetOpcode::G_SEXT_INREG:
3648 // This is a workaround. For extension from type i1, `selectImpl()` uses
3649 // patterns from the TD file and generates an illegal VGPR to SGPR COPY, as
3650 // type i1 can only be held in an SGPR class.
3651 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3652 selectImpl(I, *CoverageInfo))
3653 return true;
3654 return selectG_SZA_EXT(I);
3655 case TargetOpcode::G_FPEXT:
3656 if (selectG_FPEXT(I))
3657 return true;
3658 return selectImpl(I, *CoverageInfo);
3659 case TargetOpcode::G_BRCOND:
3660 return selectG_BRCOND(I);
3661 case TargetOpcode::G_GLOBAL_VALUE:
3662 return selectG_GLOBAL_VALUE(I);
3663 case TargetOpcode::G_PTRMASK:
3664 return selectG_PTRMASK(I);
3665 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3666 return selectG_EXTRACT_VECTOR_ELT(I);
3667 case TargetOpcode::G_INSERT_VECTOR_ELT:
3668 return selectG_INSERT_VECTOR_ELT(I);
3669 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3670 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3671 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3672 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3673 const AMDGPU::ImageDimIntrinsicInfo *Intr
3674 = AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3675 assert(Intr && "not an image intrinsic with image pseudo");
3676 return selectImageIntrinsic(I, Intr);
3677 }
3678 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3679 return selectBVHIntrinsic(I);
3680 case AMDGPU::G_SBFX:
3681 case AMDGPU::G_UBFX:
3682 return selectG_SBFX_UBFX(I);
3683 case AMDGPU::G_SI_CALL:
3684 I.setDesc(TII.get(AMDGPU::SI_CALL));
3685 return true;
3686 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3687 return selectWaveAddress(I);
3688 case AMDGPU::G_STACKRESTORE:
3689 return selectStackRestore(I);
3690 case AMDGPU::G_PHI:
3691 return selectPHI(I);
3692 default:
3693 return selectImpl(I, *CoverageInfo);
3694 }
3695 return false;
3696}
3697
3698 InstructionSelector::ComplexRendererFns
3699 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3700 return {{
3701 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3702 }};
3703
3704}
3705
3706std::pair<Register, unsigned>
3707AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3708 bool IsCanonicalizing,
3709 bool AllowAbs, bool OpSel) const {
3710 Register Src = Root.getReg();
3711 unsigned Mods = 0;
3712 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3713
3714 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3715 Src = MI->getOperand(1).getReg();
3716 Mods |= SISrcMods::NEG;
3717 MI = getDefIgnoringCopies(Src, *MRI);
3718 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3719 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3720 // denormal mode, but we're implicitly canonicalizing in a source operand.
3721 const ConstantFP *LHS =
3722 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3723 if (LHS && LHS->isZero()) {
3724 Mods |= SISrcMods::NEG;
3725 Src = MI->getOperand(2).getReg();
3726 }
3727 }
3728
3729 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3730 Src = MI->getOperand(1).getReg();
3731 Mods |= SISrcMods::ABS;
3732 }
3733
3734 if (OpSel)
3735 Mods |= SISrcMods::OP_SEL_0;
3736
3737 return std::pair(Src, Mods);
3738}
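// Example foldings (hypothetical values): for %r = G_FNEG (G_FABS %x) this
// returns (x, NEG | ABS); with IsCanonicalizing, %r = G_FSUB +-0.0, %x folds
// to (x, NEG); OpSel additionally ors in OP_SEL_0.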
3739
3740Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3741 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3742 bool ForceVGPR) const {
3743 if ((Mods != 0 || ForceVGPR) &&
3744 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3745
3746 // If we looked through copies to find source modifiers on an SGPR operand,
3747 // we now have an SGPR register source. To avoid potentially violating the
3748 // constant bus restriction, we need to insert a copy to a VGPR.
3749 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3750 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3751 TII.get(AMDGPU::COPY), VGPRSrc)
3752 .addReg(Src);
3753 Src = VGPRSrc;
3754 }
3755
3756 return Src;
3757}
3758
3759///
3760/// This will select either an SGPR or VGPR operand and will save us from
3761/// having to write an extra tablegen pattern.
3762 InstructionSelector::ComplexRendererFns
3763 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3764 return {{
3765 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3766 }};
3767}
3768
3769 InstructionSelector::ComplexRendererFns
3770 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3771 Register Src;
3772 unsigned Mods;
3773 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3774
3775 return {{
3776 [=](MachineInstrBuilder &MIB) {
3777 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3778 },
3779 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3780 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3781 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3782 }};
3783}
3784
3785 InstructionSelector::ComplexRendererFns
3786 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3787 Register Src;
3788 unsigned Mods;
3789 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3790 /*IsCanonicalizing=*/true,
3791 /*AllowAbs=*/false);
3792
3793 return {{
3794 [=](MachineInstrBuilder &MIB) {
3795 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3796 },
3797 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3798 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3799 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3800 }};
3801}
3802
3803 InstructionSelector::ComplexRendererFns
3804 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3805 return {{
3806 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3807 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3808 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3809 }};
3810}
3811
3812 InstructionSelector::ComplexRendererFns
3813 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3814 Register Src;
3815 unsigned Mods;
3816 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3817
3818 return {{
3819 [=](MachineInstrBuilder &MIB) {
3820 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3821 },
3822 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3823 }};
3824}
3825
3826 InstructionSelector::ComplexRendererFns
3827 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3828 MachineOperand &Root) const {
3829 Register Src;
3830 unsigned Mods;
3831 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3832
3833 return {{
3834 [=](MachineInstrBuilder &MIB) {
3835 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3836 },
3837 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3838 }};
3839}
3840
3841 InstructionSelector::ComplexRendererFns
3842 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3843 Register Src;
3844 unsigned Mods;
3845 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3846 /*AllowAbs=*/false);
3847
3848 return {{
3849 [=](MachineInstrBuilder &MIB) {
3850 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3851 },
3852 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3853 }};
3854}
3855
3856 InstructionSelector::ComplexRendererFns
3857 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3858 Register Reg = Root.getReg();
3859 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3860 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3861 return {};
3862 return {{
3863 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3864 }};
3865}
3866
3867std::pair<Register, unsigned>
3868AMDGPUInstructionSelector::selectVOP3PModsImpl(
3869 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3870 unsigned Mods = 0;
3871 MachineInstr *MI = MRI.getVRegDef(Src);
3872
3873 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3874 // It's possible to see an f32 fneg here, but unlikely.
3875 // TODO: Treat f32 fneg as only high bit.
3876 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3877 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3878 Src = MI->getOperand(1).getReg();
3879 MI = MRI.getVRegDef(Src);
3880 }
3881
3882 // TODO: Handle G_FSUB 0 as fneg
3883
3884 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3885 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3886
3887 // Packed instructions do not have abs modifiers.
3888 Mods |= SISrcMods::OP_SEL_1;
3889
3890 return std::pair(Src, Mods);
3891}
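// For packed v2f16 sources only neg/neg_hi are relevant: OP_SEL_1 (op_sel_hi)
// is always set as the default, a whole-vector G_FNEG toggles NEG and NEG_HI
// together, and abs modifiers do not exist for packed instructions.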
3892
3893 InstructionSelector::ComplexRendererFns
3894 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3895 MachineRegisterInfo &MRI
3896 = Root.getParent()->getParent()->getParent()->getRegInfo();
3897
3898 Register Src;
3899 unsigned Mods;
3900 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3901
3902 return {{
3903 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3904 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3905 }};
3906}
3907
3908 InstructionSelector::ComplexRendererFns
3909 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3910 MachineRegisterInfo &MRI
3911 = Root.getParent()->getParent()->getParent()->getRegInfo();
3912
3913 Register Src;
3914 unsigned Mods;
3915 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3916
3917 return {{
3918 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3919 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3920 }};
3921}
3922
3923 InstructionSelector::ComplexRendererFns
3924 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3925 // A literal i1 value set in the intrinsic; it represents SrcMods for the next
3926 // operand. The value is in the Imm operand as i1 sign-extended to int64_t.
3927 // 1 (-1) promotes packed values to signed, 0 treats them as unsigned.
3928 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3929 "expected i1 value");
3930 unsigned Mods = SISrcMods::OP_SEL_1;
3931 if (Root.getImm() == -1)
3932 Mods ^= SISrcMods::NEG;
3933 return {{
3934 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3935 }};
3936}
3937
3938 InstructionSelector::ComplexRendererFns
3939 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3940 MachineOperand &Root) const {
3941 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3942 "expected i1 value");
3943 unsigned Mods = SISrcMods::OP_SEL_1;
3944 if (Root.getImm() != 0)
3945 Mods |= SISrcMods::OP_SEL_0;
3946
3947 return {{
3948 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3949 }};
3950}
3951
3952 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3953 MachineInstr *InsertPt,
3954 MachineRegisterInfo &MRI) {
3955 const TargetRegisterClass *DstRegClass;
3956 switch (Elts.size()) {
3957 case 8:
3958 DstRegClass = &AMDGPU::VReg_256RegClass;
3959 break;
3960 case 4:
3961 DstRegClass = &AMDGPU::VReg_128RegClass;
3962 break;
3963 case 2:
3964 DstRegClass = &AMDGPU::VReg_64RegClass;
3965 break;
3966 default:
3967 llvm_unreachable("unhandled Reg sequence size");
3968 }
3969
3970 MachineIRBuilder B(*InsertPt);
3971 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3972 .addDef(MRI.createVirtualRegister(DstRegClass));
3973 for (unsigned i = 0; i < Elts.size(); ++i) {
3974 MIB.addReg(Elts[i]);
3975 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3976 }
3977 return MIB->getOperand(0).getReg();
3978}
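// E.g. four elements produce (sub-register indices via getSubRegFromChannel):
//   %seq:vreg_128 = REG_SEQUENCE %e0, %subreg.sub0, ..., %e3, %subreg.sub3
// with the destination class picked from Elts.size() as above.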
3979
3980static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3981 SmallVectorImpl<Register> &Elts, Register &Src,
3982 MachineInstr *InsertPt,
3983 MachineRegisterInfo &MRI) {
3984 if (ModOpcode == TargetOpcode::G_FNEG) {
3985 Mods |= SISrcMods::NEG;
3986 // Check if all elements also have abs modifier
3987 SmallVector<Register, 8> NegAbsElts;
3988 for (auto El : Elts) {
3989 Register FabsSrc;
3990 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3991 break;
3992 NegAbsElts.push_back(FabsSrc);
3993 }
3994 if (Elts.size() != NegAbsElts.size()) {
3995 // Neg
3996 Src = buildRegSequence(Elts, InsertPt, MRI);
3997 } else {
3998 // Neg and Abs
3999 Mods |= SISrcMods::NEG_HI;
4000 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
4001 }
4002 } else {
4003 assert(ModOpcode == TargetOpcode::G_FABS);
4004 // Abs
4005 Mods |= SISrcMods::NEG_HI;
4006 Src = buildRegSequence(Elts, InsertPt, MRI);
4007 }
4008}
4009
4010 InstructionSelector::ComplexRendererFns
4011 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
4012 Register Src = Root.getReg();
4013 unsigned Mods = SISrcMods::OP_SEL_1;
4014 SmallVector<Register, 8> EltsF32;
4015
4016 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4017 assert(BV->getNumSources() > 0);
4018 // Based on first element decide which mod we match, neg or abs
4019 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
4020 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4021 ? AMDGPU::G_FNEG
4022 : AMDGPU::G_FABS;
4023 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4024 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
4025 if (ElF32->getOpcode() != ModOpcode)
4026 break;
4027 EltsF32.push_back(ElF32->getOperand(1).getReg());
4028 }
4029
4030 // All elements had ModOpcode modifier
4031 if (BV->getNumSources() == EltsF32.size()) {
4032 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4033 *MRI);
4034 }
4035 }
4036
4037 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4038 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4039}
4040
4041 InstructionSelector::ComplexRendererFns
4042 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4043 Register Src = Root.getReg();
4044 unsigned Mods = SISrcMods::OP_SEL_1;
4045 SmallVector<Register, 8> EltsV2F16;
4046
4047 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4048 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4049 Register FNegSrc;
4050 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4051 break;
4052 EltsV2F16.push_back(FNegSrc);
4053 }
4054
4055 // All elements had ModOpcode modifier
4056 if (CV->getNumSources() == EltsV2F16.size()) {
4057 Mods |= SISrcMods::NEG;
4058 Mods |= SISrcMods::NEG_HI;
4059 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4060 }
4061 }
4062
4063 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4064 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4065}
4066
4067 InstructionSelector::ComplexRendererFns
4068 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4069 Register Src = Root.getReg();
4070 unsigned Mods = SISrcMods::OP_SEL_1;
4071 SmallVector<Register, 8> EltsV2F16;
4072
4073 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4074 assert(CV->getNumSources() > 0);
4075 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4076 // Based on first element decide which mod we match, neg or abs
4077 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4078 ? AMDGPU::G_FNEG
4079 : AMDGPU::G_FABS;
4080
4081 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4082 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4083 if (ElV2F16->getOpcode() != ModOpcode)
4084 break;
4085 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4086 }
4087
4088 // All elements had ModOpcode modifier
4089 if (CV->getNumSources() == EltsV2F16.size()) {
4090 MachineIRBuilder B(*Root.getParent());
4091 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4092 *MRI);
4093 }
4094 }
4095
4096 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4097 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4098}
4099
4100 InstructionSelector::ComplexRendererFns
4101 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4102 std::optional<FPValueAndVReg> FPValReg;
4103 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4104 if (TII.isInlineConstant(FPValReg->Value)) {
4105 return {{[=](MachineInstrBuilder &MIB) {
4106 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4107 }}};
4108 }
4109 // Non-inlineable splat floats should not fall-through for integer immediate
4110 // checks.
4111 return {};
4112 }
4113
4114 APInt ICst;
4115 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4116 if (TII.isInlineConstant(ICst)) {
4117 return {
4118 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4119 }
4120 }
4121
4122 return {};
4123}
4124
4125 InstructionSelector::ComplexRendererFns
4126 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4127 Register Src =
4128 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4129 unsigned Key = 0;
4130
4131 Register ShiftSrc;
4132 std::optional<ValueAndVReg> ShiftAmt;
4133 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4134 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4135 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4136 Key = ShiftAmt->Value.getZExtValue() / 8;
4137 Src = ShiftSrc;
4138 }
4139
4140 return {{
4141 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4142 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4143 }};
4144}
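// Example (hypothetical %v): matching %src = G_LSHR %v, 16 gives Src = %v and
// index_key = 2; any constant shift that is a multiple of 8 on a 32-bit source
// maps to keys 0..3, everything else keeps key 0.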
4145
4146 InstructionSelector::ComplexRendererFns
4147 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4148
4149 Register Src =
4150 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4151 unsigned Key = 0;
4152
4153 Register ShiftSrc;
4154 std::optional<ValueAndVReg> ShiftAmt;
4155 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4156 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4157 ShiftAmt->Value.getZExtValue() == 16) {
4158 Src = ShiftSrc;
4159 Key = 1;
4160 }
4161
4162 return {{
4163 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4165 }};
4166}
4167
4168 InstructionSelector::ComplexRendererFns
4169 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4170 Register Src;
4171 unsigned Mods;
4172 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4173
4174 // FIXME: Handle op_sel
4175 return {{
4176 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4177 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4178 }};
4179}
4180
4181 InstructionSelector::ComplexRendererFns
4182 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4183 Register Src;
4184 unsigned Mods;
4185 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4186 /*IsCanonicalizing=*/true,
4187 /*AllowAbs=*/false,
4188 /*OpSel=*/false);
4189
4190 return {{
4191 [=](MachineInstrBuilder &MIB) {
4192 MIB.addReg(
4193 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4194 },
4195 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4196 }};
4197}
4198
4199 InstructionSelector::ComplexRendererFns
4200 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4201 Register Src;
4202 unsigned Mods;
4203 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4204 /*IsCanonicalizing=*/true,
4205 /*AllowAbs=*/false,
4206 /*OpSel=*/true);
4207
4208 return {{
4209 [=](MachineInstrBuilder &MIB) {
4210 MIB.addReg(
4211 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4212 },
4213 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4214 }};
4215}
4216
4217bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4218 Register &Base,
4219 Register *SOffset,
4220 int64_t *Offset) const {
4221 MachineInstr *MI = Root.getParent();
4222 MachineBasicBlock *MBB = MI->getParent();
4223
4224 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4225 // then we can select all ptr + 32-bit offsets.
4226 SmallVector<GEPInfo, 4> AddrInfo;
4227 getAddrModeInfo(*MI, *MRI, AddrInfo);
4228
4229 if (AddrInfo.empty())
4230 return false;
4231
4232 const GEPInfo &GEPI = AddrInfo[0];
4233 std::optional<int64_t> EncodedImm =
4234 AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4235
4236 if (SOffset && Offset) {
4237 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4238 AddrInfo.size() > 1) {
4239 const GEPInfo &GEPI2 = AddrInfo[1];
4240 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4241 if (Register OffsetReg =
4242 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4243 Base = GEPI2.SgprParts[0];
4244 *SOffset = OffsetReg;
4245 *Offset = *EncodedImm;
4246 return true;
4247 }
4248 }
4249 }
4250 return false;
4251 }
4252
4253 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4254 Base = GEPI.SgprParts[0];
4255 *Offset = *EncodedImm;
4256 return true;
4257 }
4258
4259 // SGPR offset is unsigned.
4260 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4261 GEPI.Imm != 0) {
4262 // If we make it this far we have a load with a 32-bit immediate offset.
4263 // It is OK to select this using an SGPR offset, because we have already
4264 // failed trying to select this load into one of the _IMM variants since
4265 // the _IMM patterns are considered before the _SGPR patterns.
4266 Base = GEPI.SgprParts[0];
4267 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4268 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4269 .addImm(GEPI.Imm);
4270 return true;
4271 }
4272
4273 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4274 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4275 Base = GEPI.SgprParts[0];
4276 *SOffset = OffsetReg;
4277 return true;
4278 }
4279 }
4280
4281 return false;
4282}
4283
4284 InstructionSelector::ComplexRendererFns
4285 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4286 Register Base;
4287 int64_t Offset;
4288 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4289 return std::nullopt;
4290
4291 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4292 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4293}
4294
4295 InstructionSelector::ComplexRendererFns
4296 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4297 SmallVector<GEPInfo, 4> AddrInfo;
4298 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4299
4300 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4301 return std::nullopt;
4302
4303 const GEPInfo &GEPInfo = AddrInfo[0];
4304 Register PtrReg = GEPInfo.SgprParts[0];
4305 std::optional<int64_t> EncodedImm =
4306 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4307 if (!EncodedImm)
4308 return std::nullopt;
4309
4310 return {{
4311 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4312 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4313 }};
4314}
4315
4316 InstructionSelector::ComplexRendererFns
4317 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4318 Register Base, SOffset;
4319 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4320 return std::nullopt;
4321
4322 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4323 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4324}
4325
4326 InstructionSelector::ComplexRendererFns
4327 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4328 Register Base, SOffset;
4329 int64_t Offset;
4330 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4331 return std::nullopt;
4332
4333 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4334 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4335 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4336}
4337
4338std::pair<Register, int>
4339AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4340 uint64_t FlatVariant) const {
4341 MachineInstr *MI = Root.getParent();
4342
4343 auto Default = std::pair(Root.getReg(), 0);
4344
4345 if (!STI.hasFlatInstOffsets())
4346 return Default;
4347
4348 Register PtrBase;
4349 int64_t ConstOffset;
4350 std::tie(PtrBase, ConstOffset) =
4351 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4352
4353 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4354 !isFlatScratchBaseLegal(Root.getReg())))
4355 return Default;
4356
4357 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4358 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4359 return Default;
4360
4361 return std::pair(PtrBase, ConstOffset);
4362}
4363
4364 InstructionSelector::ComplexRendererFns
4365 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4366 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4367
4368 return {{
4369 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4370 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4371 }};
4372}
4373
4374 InstructionSelector::ComplexRendererFns
4375 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4376 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4377
4378 return {{
4379 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4380 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4381 }};
4382}
4383
4384 InstructionSelector::ComplexRendererFns
4385 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4386 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4387
4388 return {{
4389 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4390 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4391 }};
4392}
4393
4394// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4395 InstructionSelector::ComplexRendererFns
4396 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4397 Register Addr = Root.getReg();
4398 Register PtrBase;
4399 int64_t ConstOffset;
4400 int64_t ImmOffset = 0;
4401
4402 // Match the immediate offset first, which canonically is moved as low as
4403 // possible.
4404 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4405
4406 if (ConstOffset != 0) {
4407 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4408 SIInstrFlags::FlatGlobal)) {
4409 Addr = PtrBase;
4410 ImmOffset = ConstOffset;
4411 } else {
4412 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4413 if (isSGPR(PtrBaseDef->Reg)) {
4414 if (ConstOffset > 0) {
4415 // Offset is too large.
4416 //
4417 // saddr + large_offset -> saddr +
4418 // (voffset = large_offset & ~MaxOffset) +
4419 // (large_offset & MaxOffset);
4420 int64_t SplitImmOffset, RemainderOffset;
4421 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4422 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4423
4424 if (isUInt<32>(RemainderOffset)) {
4425 MachineInstr *MI = Root.getParent();
4426 MachineBasicBlock *MBB = MI->getParent();
4427 Register HighBits =
4428 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4429
4430 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4431 HighBits)
4432 .addImm(RemainderOffset);
4433
4434 return {{
4435 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4436 [=](MachineInstrBuilder &MIB) {
4437 MIB.addReg(HighBits);
4438 }, // voffset
4439 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4440 }};
4441 }
4442 }
4443
4444 // We are adding a 64 bit SGPR and a constant. If constant bus limit
4445 // is 1 we would need to perform 1 or 2 extra moves for each half of
4446 // the constant and it is better to do a scalar add and then issue a
4447 // single VALU instruction to materialize zero. Otherwise it takes fewer
4448 // instructions to perform VALU adds with immediates or inline literals.
4449 unsigned NumLiterals =
4450 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4451 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4452 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4453 return std::nullopt;
4454 }
4455 }
4456 }
4457
4458 // Match the variable offset.
4459 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4460 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4461 // Look through the SGPR->VGPR copy.
4462 Register SAddr =
4463 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4464
4465 if (isSGPR(SAddr)) {
4466 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4467
4468 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4469 // inserted later.
4470 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4471 return {{[=](MachineInstrBuilder &MIB) { // saddr
4472 MIB.addReg(SAddr);
4473 },
4474 [=](MachineInstrBuilder &MIB) { // voffset
4475 MIB.addReg(VOffset);
4476 },
4477 [=](MachineInstrBuilder &MIB) { // offset
4478 MIB.addImm(ImmOffset);
4479 }}};
4480 }
4481 }
4482 }
4483
4484 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4485 // drop this.
4486 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4487 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4488 return std::nullopt;
4489
4490 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4491 // moves required to copy a 64-bit SGPR to VGPR.
4492 MachineInstr *MI = Root.getParent();
4493 MachineBasicBlock *MBB = MI->getParent();
4494 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4495
4496 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4497 .addImm(0);
4498
4499 return {{
4500 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4501 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4502 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4503 }};
4504}
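// Typical match (hypothetical values): %addr = G_PTR_ADD %sgpr_base,
// (zext %voff:_(s32)) with a legal folded immediate gives saddr = %sgpr_base,
// voffset = %voff, offset = imm. An oversized positive immediate is split by
// splitFlatOffset() and the remainder is materialized into a VGPR with
// V_MOV_B32_e32, as above.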
4505
4506 InstructionSelector::ComplexRendererFns
4507 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4508 Register Addr = Root.getReg();
4509 Register PtrBase;
4510 int64_t ConstOffset;
4511 int64_t ImmOffset = 0;
4512
4513 // Match the immediate offset first, which canonically is moved as low as
4514 // possible.
4515 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4516
4517 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4518 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4519 SIInstrFlags::FlatScratch)) {
4520 Addr = PtrBase;
4521 ImmOffset = ConstOffset;
4522 }
4523
4524 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4525 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4526 int FI = AddrDef->MI->getOperand(1).getIndex();
4527 return {{
4528 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4529 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4530 }};
4531 }
4532
4533 Register SAddr = AddrDef->Reg;
4534
4535 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4536 Register LHS = AddrDef->MI->getOperand(1).getReg();
4537 Register RHS = AddrDef->MI->getOperand(2).getReg();
4538 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4539 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4540
4541 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4542 isSGPR(RHSDef->Reg)) {
4543 int FI = LHSDef->MI->getOperand(1).getIndex();
4544 MachineInstr &I = *Root.getParent();
4545 MachineBasicBlock *BB = I.getParent();
4546 const DebugLoc &DL = I.getDebugLoc();
4547 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4548
4549 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4550 .addFrameIndex(FI)
4551 .addReg(RHSDef->Reg)
4552 .setOperandDead(3); // Dead scc
4553 }
4554 }
4555
4556 if (!isSGPR(SAddr))
4557 return std::nullopt;
4558
4559 return {{
4560 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4561 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4562 }};
4563}
4564
4565// Check whether the flat scratch SVS swizzle bug affects this access.
4566bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4567 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4568 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4569 return false;
4570
4571 // The bug affects the swizzling of SVS accesses if there is any carry out
4572 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4573 // voffset to (soffset + inst_offset).
4574 auto VKnown = KB->getKnownBits(VAddr);
4575 auto SKnown = KnownBits::computeForAddSub(
4576 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4577 KnownBits::makeConstant(APInt(32, ImmOffset)));
4578 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4579 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4580 return (VMax & 3) + (SMax & 3) >= 4;
4581}
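// Worked example: with known maxima VMax = 3 for voffset and SMax = 1 for
// (soffset + inst_offset), (3 & 3) + (1 & 3) == 4, so a carry out of bit 1
// into bit 2 is possible and the SVS form has to be rejected on affected
// subtargets.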
4582
4583 InstructionSelector::ComplexRendererFns
4584 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4585 Register Addr = Root.getReg();
4586 Register PtrBase;
4587 int64_t ConstOffset;
4588 int64_t ImmOffset = 0;
4589
4590 // Match the immediate offset first, which canonically is moved as low as
4591 // possible.
4592 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4593
4594 Register OrigAddr = Addr;
4595 if (ConstOffset != 0 &&
4596 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4597 Addr = PtrBase;
4598 ImmOffset = ConstOffset;
4599 }
4600
4601 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4602 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4603 return std::nullopt;
4604
4605 Register RHS = AddrDef->MI->getOperand(2).getReg();
4606 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4607 return std::nullopt;
4608
4609 Register LHS = AddrDef->MI->getOperand(1).getReg();
4610 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4611
4612 if (OrigAddr != Addr) {
4613 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4614 return std::nullopt;
4615 } else {
4616 if (!isFlatScratchBaseLegalSV(OrigAddr))
4617 return std::nullopt;
4618 }
4619
4620 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4621 return std::nullopt;
4622
4623 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4624 int FI = LHSDef->MI->getOperand(1).getIndex();
4625 return {{
4626 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4627 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4628 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4629 }};
4630 }
4631
4632 if (!isSGPR(LHS))
4633 return std::nullopt;
4634
4635 return {{
4636 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4637 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4638 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4639 }};
4640}
4641
4642 InstructionSelector::ComplexRendererFns
4643 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4644 MachineInstr *MI = Root.getParent();
4645 MachineBasicBlock *MBB = MI->getParent();
4646 MachineFunction *MF = MBB->getParent();
4647 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4648
4649 int64_t Offset = 0;
4650 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4651 !TII.isLegalMUBUFImmOffset(Offset)) {
4652 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4653
4654 // TODO: Should this be inside the render function? The iterator seems to
4655 // move.
4656 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4657 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4658 HighBits)
4659 .addImm(Offset & ~MaxOffset);
4660
4661 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4662 MIB.addReg(Info->getScratchRSrcReg());
4663 },
4664 [=](MachineInstrBuilder &MIB) { // vaddr
4665 MIB.addReg(HighBits);
4666 },
4667 [=](MachineInstrBuilder &MIB) { // soffset
4668 // Use constant zero for soffset and rely on eliminateFrameIndex
4669 // to choose the appropriate frame register if need be.
4670 MIB.addImm(0);
4671 },
4672 [=](MachineInstrBuilder &MIB) { // offset
4673 MIB.addImm(Offset & MaxOffset);
4674 }}};
4675 }
4676
4677 assert(Offset == 0 || Offset == -1);
4678
4679 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4680 // offsets.
4681 std::optional<int> FI;
4682 Register VAddr = Root.getReg();
4683 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4684 Register PtrBase;
4685 int64_t ConstOffset;
4686 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4687 if (ConstOffset != 0) {
4688 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4689 (!STI.privateMemoryResourceIsRangeChecked() ||
4690 KB->signBitIsZero(PtrBase))) {
4691 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4692 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4693 FI = PtrBaseDef->getOperand(1).getIndex();
4694 else
4695 VAddr = PtrBase;
4696 Offset = ConstOffset;
4697 }
4698 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4699 FI = RootDef->getOperand(1).getIndex();
4700 }
4701 }
4702
4703 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4704 MIB.addReg(Info->getScratchRSrcReg());
4705 },
4706 [=](MachineInstrBuilder &MIB) { // vaddr
4707 if (FI)
4708 MIB.addFrameIndex(*FI);
4709 else
4710 MIB.addReg(VAddr);
4711 },
4712 [=](MachineInstrBuilder &MIB) { // soffset
4713 // Use constant zero for soffset and rely on eliminateFrameIndex
4714 // to choose the appropriate frame register if need be.
4715 MIB.addImm(0);
4716 },
4717 [=](MachineInstrBuilder &MIB) { // offset
4718 MIB.addImm(Offset);
4719 }}};
4720}
4721
4722bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4723 int64_t Offset) const {
4724 if (!isUInt<16>(Offset))
4725 return false;
4726
4727 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4728 return true;
4729
4730 // On Southern Islands, instructions with a negative base value and an offset
4731 // don't seem to work.
4732 return KB->signBitIsZero(Base);
4733}
4734
4735bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4736 int64_t Offset1,
4737 unsigned Size) const {
4738 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4739 return false;
4740 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4741 return false;
4742
4743 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4744 return true;
4745
4746 // On Southern Islands, instructions with a negative base value and an offset
4747 // don't seem to work.
4748 return KB->signBitIsZero(Base);
4749}
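// The two offsets are encoded in units of Size, each in an 8-bit field, so
// for 4-byte elements offsets 0 and 1020 are legal (1020 / 4 == 255) while
// 1024 is not; Size is 4 or 8 for the 64-bit and 128-bit variants below.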
4750
4751// Return whether the operation has NoUnsignedWrap property.
4752 static bool isNoUnsignedWrap(MachineInstr *Addr) {
4753 return Addr->getOpcode() == TargetOpcode::G_OR ||
4754 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4755 Addr->getFlag(MachineInstr::NoUWrap));
4756}
4757
4758// Check that the base address of flat scratch load/store in the form of `base +
4759// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
4760// requirement). We always treat the first operand as the base address here.
4761bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4762 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4763
4764 if (isNoUnsignedWrap(AddrMI))
4765 return true;
4766
4767 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4768 // values.
4769 if (STI.hasSignedScratchOffsets())
4770 return true;
4771
4772 Register LHS = AddrMI->getOperand(1).getReg();
4773 Register RHS = AddrMI->getOperand(2).getReg();
4774
4775 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4776 std::optional<ValueAndVReg> RhsValReg =
4777 getIConstantVRegValWithLookThrough(RHS, *MRI);
4778 // If the immediate offset is negative and within certain range, the base
4779 // address cannot also be negative. If the base is also negative, the sum
4780 // would be either negative or much larger than the valid range of scratch
4781 // memory a thread can access.
4782 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4783 RhsValReg->Value.getSExtValue() > -0x40000000)
4784 return true;
4785 }
4786
4787 return KB->signBitIsZero(LHS);
4788}
4789
4790 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4791 // the form: SGPR + VGPR.
4792bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4793 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4794
4795 if (isNoUnsignedWrap(AddrMI))
4796 return true;
4797
4798 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4799 // values.
4800 if (STI.hasSignedScratchOffsets())
4801 return true;
4802
4803 Register LHS = AddrMI->getOperand(1).getReg();
4804 Register RHS = AddrMI->getOperand(2).getReg();
4805 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4806}
4807
4808 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4809 // the form: SGPR + VGPR + Imm.
4810bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4811 Register Addr) const {
4812 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4813 // values.
4814 if (STI.hasSignedScratchOffsets())
4815 return true;
4816
4817 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4818 Register Base = AddrMI->getOperand(1).getReg();
4819 std::optional<DefinitionAndSourceRegister> BaseDef =
4820 getDefSrcRegIgnoringCopies(Base, *MRI);
4821 std::optional<ValueAndVReg> RHSOffset =
4822 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4823 assert(RHSOffset);
4824
4825 // If the immediate offset is negative and within certain range, the base
4826 // address cannot also be negative. If the base is also negative, the sum
4827 // would be either negative or much larger than the valid range of scratch
4828 // memory a thread can access.
4829 if (isNoUnsignedWrap(BaseDef->MI) &&
4830 (isNoUnsignedWrap(AddrMI) ||
4831 (RHSOffset->Value.getSExtValue() < 0 &&
4832 RHSOffset->Value.getSExtValue() > -0x40000000)))
4833 return true;
4834
4835 Register LHS = BaseDef->MI->getOperand(1).getReg();
4836 Register RHS = BaseDef->MI->getOperand(2).getReg();
4837 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4838}
4839
4840bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4841 unsigned ShAmtBits) const {
4842 assert(MI.getOpcode() == TargetOpcode::G_AND);
4843
4844 std::optional<APInt> RHS =
4845 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4846 if (!RHS)
4847 return false;
4848
4849 if (RHS->countr_one() >= ShAmtBits)
4850 return true;
4851
4852 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4853 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4854}
4855
4856 InstructionSelector::ComplexRendererFns
4857 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4858 MachineOperand &Root) const {
4859 Register Reg = Root.getReg();
4860 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4861
4862 std::optional<DefinitionAndSourceRegister> Def =
4863 getDefSrcRegIgnoringCopies(Reg, *MRI);
4864 assert(Def && "this shouldn't be an optional result");
4865 Reg = Def->Reg;
4866
4867 if (Register WaveBase = getWaveAddress(Def->MI)) {
4868 return {{
4869 [=](MachineInstrBuilder &MIB) { // rsrc
4870 MIB.addReg(Info->getScratchRSrcReg());
4871 },
4872 [=](MachineInstrBuilder &MIB) { // soffset
4873 MIB.addReg(WaveBase);
4874 },
4875 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4876 }};
4877 }
4878
4879 int64_t Offset = 0;
4880
4881 // FIXME: Copy check is a hack
4882 Register BasePtr;
4883 if (mi_match(Reg, *MRI,
4884 m_GPtrAdd(m_Reg(BasePtr),
4885 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4886 if (!TII.isLegalMUBUFImmOffset(Offset))
4887 return {};
4888 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4889 Register WaveBase = getWaveAddress(BasePtrDef);
4890 if (!WaveBase)
4891 return {};
4892
4893 return {{
4894 [=](MachineInstrBuilder &MIB) { // rsrc
4895 MIB.addReg(Info->getScratchRSrcReg());
4896 },
4897 [=](MachineInstrBuilder &MIB) { // soffset
4898 MIB.addReg(WaveBase);
4899 },
4900 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4901 }};
4902 }
4903
4904 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4905 !TII.isLegalMUBUFImmOffset(Offset))
4906 return {};
4907
4908 return {{
4909 [=](MachineInstrBuilder &MIB) { // rsrc
4910 MIB.addReg(Info->getScratchRSrcReg());
4911 },
4912 [=](MachineInstrBuilder &MIB) { // soffset
4913 MIB.addImm(0);
4914 },
4915 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4916 }};
4917}
4918
4919std::pair<Register, unsigned>
4920AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4921 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4922 if (!RootDef)
4923 return std::pair(Root.getReg(), 0);
4924
4925 int64_t ConstAddr = 0;
4926
4927 Register PtrBase;
4928 int64_t Offset;
4929 std::tie(PtrBase, Offset) =
4930 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4931
4932 if (Offset) {
4933 if (isDSOffsetLegal(PtrBase, Offset)) {
4934 // (add n0, c0)
4935 return std::pair(PtrBase, Offset);
4936 }
4937 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4938 // TODO
4939
4940
4941 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4942 // TODO
4943
4944 }
4945
4946 return std::pair(Root.getReg(), 0);
4947}
4948
4949 InstructionSelector::ComplexRendererFns
4950 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4951 Register Reg;
4952 unsigned Offset;
4953 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4954 return {{
4955 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4956 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4957 }};
4958}
4959
4960 InstructionSelector::ComplexRendererFns
4961 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4962 return selectDSReadWrite2(Root, 4);
4963}
4964
4965 InstructionSelector::ComplexRendererFns
4966 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4967 return selectDSReadWrite2(Root, 8);
4968}
4969
4970InstructionSelector::ComplexRendererFns
4971AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4972 unsigned Size) const {
4973 Register Reg;
4974 unsigned Offset;
4975 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4976 return {{
4977 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4978 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4979 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4980 }};
4981}
4982
4983std::pair<Register, unsigned>
4984AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4985 unsigned Size) const {
4986 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4987 if (!RootDef)
4988 return std::pair(Root.getReg(), 0);
4989
4990 int64_t ConstAddr = 0;
4991
4992 Register PtrBase;
4993 int64_t Offset;
4994 std::tie(PtrBase, Offset) =
4995 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4996
4997 if (Offset) {
4998 int64_t OffsetValue0 = Offset;
4999 int64_t OffsetValue1 = Offset + Size;
5000 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
5001 // (add n0, c0)
5002 return std::pair(PtrBase, OffsetValue0 / Size);
5003 }
5004 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
5005 // TODO
5006
5007 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
5008 // TODO
5009
5010 }
5011
5012 return std::pair(Root.getReg(), 0);
5013}
5014
5015/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5016/// the base value with the constant offset. There may be intervening copies
5017/// between \p Root and the identified constant. Returns \p Root, 0 if this does
5018/// not match the pattern.
5019std::pair<Register, int64_t>
5020AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5021 Register Root, const MachineRegisterInfo &MRI) const {
5022 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
5023 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5024 return {Root, 0};
5025
5026 MachineOperand &RHS = RootI->getOperand(2);
5027 std::optional<ValueAndVReg> MaybeOffset =
5028 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5029 if (!MaybeOffset)
5030 return {Root, 0};
5031 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5032}
5033
5034static void addZeroImm(MachineInstrBuilder &MIB) {
5035 MIB.addImm(0);
5036}
5037
5038/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5039/// BasePtr is not valid, a null base pointer will be used.
5040static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5041 uint32_t FormatLo, uint32_t FormatHi,
5042 Register BasePtr) {
5043 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5044 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5045 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5046 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5047
5048 B.buildInstr(AMDGPU::S_MOV_B32)
5049 .addDef(RSrc2)
5050 .addImm(FormatLo);
5051 B.buildInstr(AMDGPU::S_MOV_B32)
5052 .addDef(RSrc3)
5053 .addImm(FormatHi);
5054
5055 // Build the half of the subregister with the constants before building the
5056 // full 128-bit register. If we are building multiple resource descriptors,
5057 // this will allow CSEing of the 2-component register.
5058 B.buildInstr(AMDGPU::REG_SEQUENCE)
5059 .addDef(RSrcHi)
5060 .addReg(RSrc2)
5061 .addImm(AMDGPU::sub0)
5062 .addReg(RSrc3)
5063 .addImm(AMDGPU::sub1);
5064
5065 Register RSrcLo = BasePtr;
5066 if (!BasePtr) {
5067 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5068 B.buildInstr(AMDGPU::S_MOV_B64)
5069 .addDef(RSrcLo)
5070 .addImm(0);
5071 }
5072
5073 B.buildInstr(AMDGPU::REG_SEQUENCE)
5074 .addDef(RSrc)
5075 .addReg(RSrcLo)
5076 .addImm(AMDGPU::sub0_sub1)
5077 .addReg(RSrcHi)
5078 .addImm(AMDGPU::sub2_sub3);
5079
5080 return RSrc;
5081}
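A standalone sketch of the 128-bit descriptor layout the REG_SEQUENCE above assembles: sub0_sub1 carry the base pointer (or zero) and sub2_sub3 carry the two format dwords. The packing helper below is illustrative only, not part of this file.

#include <array>
#include <cstdint>

static std::array<uint32_t, 4> packRsrcWords(uint64_t BasePtr,
                                             uint32_t FormatLo,
                                             uint32_t FormatHi) {
  return {static_cast<uint32_t>(BasePtr),       // sub0: base pointer, low half
          static_cast<uint32_t>(BasePtr >> 32), // sub1: base pointer, high half
          FormatLo,                             // sub2: first format dword
          FormatHi};                            // sub3: second format dword
}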
5082
5083static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5084 const SIInstrInfo &TII, Register BasePtr) {
5085 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5086
5087 // FIXME: Why are half the "default" bits ignored based on the addressing
5088 // mode?
5089 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5090}
5091
5092static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5093 const SIInstrInfo &TII, Register BasePtr) {
5094 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5095
5096 // FIXME: Why are half the "default" bits ignored based on the addressing
5097 // mode?
5098 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5099}
5100
5101AMDGPUInstructionSelector::MUBUFAddressData
5102AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5103 MUBUFAddressData Data;
5104 Data.N0 = Src;
5105
5106 Register PtrBase;
5107 int64_t Offset;
5108
5109 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5110 if (isUInt<32>(Offset)) {
5111 Data.N0 = PtrBase;
5112 Data.Offset = Offset;
5113 }
5114
5115 if (MachineInstr *InputAdd
5116 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5117 Data.N2 = InputAdd->getOperand(1).getReg();
5118 Data.N3 = InputAdd->getOperand(2).getReg();
5119
5120 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5121 // FIXME: Don't assume the register is defined by operand 0 of its def
5122 //
5123 // TODO: Remove this when we have copy folding optimizations after
5124 // RegBankSelect.
5125 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5126 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5127 }
5128
5129 return Data;
5130}
5131
5132/// Return if the addr64 mubuf mode should be used for the given address.
5133bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5134 // (ptr_add N2, N3) -> addr64, or
5135 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5136 if (Addr.N2)
5137 return true;
5138
5139 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5140 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5141}
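A minimal restatement of the decision above, assuming a simplified address summary; the struct and helper below are illustrative, not part of this file. Addr64 is chosen when the address has a variable ptr_add component, or when the remaining base is divergent.

struct AddrSummary {
  bool HasVariablePart; // corresponds to Addr.N2 being set
  bool BaseIsVGPR;      // corresponds to N0 living in the VGPR bank
};

static bool wantsAddr64(const AddrSummary &A) {
  return A.HasVariablePart || A.BaseIsVGPR;
}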
5142
5143/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5144/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5145/// component.
5146void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5147 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5148 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5149 return;
5150
5151 // Illegal offset, store it in soffset.
5152 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5153 B.buildInstr(AMDGPU::S_MOV_B32)
5154 .addDef(SOffset)
5155 .addImm(ImmOffset);
5156 ImmOffset = 0;
5157}
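A standalone sketch of the split performed above, assuming a hypothetical 4095-byte immediate limit (the real bound comes from TII.isLegalMUBUFImmOffset): a legal offset stays in the immediate field, otherwise the whole value moves into soffset.

#include <cstdint>
#include <optional>

struct MUBUFOffsetSplit {
  std::optional<int64_t> SOffsetVal; // would be materialized into an SGPR
  int64_t ImmOffset;                 // remains in the instruction encoding
};

static MUBUFOffsetSplit splitOffset(int64_t Offset, int64_t MaxImm = 4095) {
  if (Offset >= 0 && Offset <= MaxImm)
    return {std::nullopt, Offset}; // already legal: keep it as the immediate
  return {Offset, 0};              // illegal: move everything into soffset
}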
5158
5159bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5160 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5161 Register &SOffset, int64_t &Offset) const {
5162 // FIXME: Predicates should stop this from reaching here.
5163 // addr64 bit was removed for volcanic islands.
5164 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5165 return false;
5166
5167 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5168 if (!shouldUseAddr64(AddrData))
5169 return false;
5170
5171 Register N0 = AddrData.N0;
5172 Register N2 = AddrData.N2;
5173 Register N3 = AddrData.N3;
5174 Offset = AddrData.Offset;
5175
5176 // Base pointer for the SRD.
5177 Register SRDPtr;
5178
5179 if (N2) {
5180 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5181 assert(N3);
5182 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5183 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5184 // addr64, and construct the default resource from a 0 address.
5185 VAddr = N0;
5186 } else {
5187 SRDPtr = N3;
5188 VAddr = N2;
5189 }
5190 } else {
5191 // N2 is not divergent.
5192 SRDPtr = N2;
5193 VAddr = N3;
5194 }
5195 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5196 // Use the default null pointer in the resource
5197 VAddr = N0;
5198 } else {
5199 // N0 -> offset, or
5200 // (N0 + C1) -> offset
5201 SRDPtr = N0;
5202 }
5203
5204 MachineIRBuilder B(*Root.getParent());
5205 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5206 splitIllegalMUBUFOffset(B, SOffset, Offset);
5207 return true;
5208}
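The N2/N3 handling above is a small decision tree over register-bank divergence. Below is a standalone restatement with an illustrative helper that is not part of this file.

struct Addr64Roles {
  bool VAddrIsN0;  // both components divergent: use the add result itself
  bool SRDFromN2;  // N2 uniform: N2 seeds the SRD base, N3 becomes vaddr
  bool SRDFromN3;  // N2 divergent but N3 uniform: roles are swapped
};

static Addr64Roles pickAddr64Roles(bool N2IsVGPR, bool N3IsVGPR) {
  if (N2IsVGPR && N3IsVGPR)
    return {true, false, false};
  if (N2IsVGPR)
    return {false, false, true};
  return {false, true, false};
}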
5209
5210bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5211 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5212 int64_t &Offset) const {
5213
5214 // FIXME: Pattern should not reach here.
5215 if (STI.useFlatForGlobal())
5216 return false;
5217
5218 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5219 if (shouldUseAddr64(AddrData))
5220 return false;
5221
5222 // N0 -> offset, or
5223 // (N0 + C1) -> offset
5224 Register SRDPtr = AddrData.N0;
5225 Offset = AddrData.Offset;
5226
5227 // TODO: Look through extensions for 32-bit soffset.
5228 MachineIRBuilder B(*Root.getParent());
5229
5230 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5231 splitIllegalMUBUFOffset(B, SOffset, Offset);
5232 return true;
5233}
5234
5235InstructionSelector::ComplexRendererFns
5236AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5237 Register VAddr;
5238 Register RSrcReg;
5239 Register SOffset;
5240 int64_t Offset = 0;
5241
5242 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5243 return {};
5244
5245 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5246 // pattern.
5247 return {{
5248 [=](MachineInstrBuilder &MIB) { // rsrc
5249 MIB.addReg(RSrcReg);
5250 },
5251 [=](MachineInstrBuilder &MIB) { // vaddr
5252 MIB.addReg(VAddr);
5253 },
5254 [=](MachineInstrBuilder &MIB) { // soffset
5255 if (SOffset)
5256 MIB.addReg(SOffset);
5257 else if (STI.hasRestrictedSOffset())
5258 MIB.addReg(AMDGPU::SGPR_NULL);
5259 else
5260 MIB.addImm(0);
5261 },
5262 [=](MachineInstrBuilder &MIB) { // offset
5263 MIB.addImm(Offset);
5264 },
5265 addZeroImm, // cpol
5266 addZeroImm, // tfe
5267 addZeroImm // swz
5268 }};
5269}
5270
5271InstructionSelector::ComplexRendererFns
5272AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5273 Register RSrcReg;
5274 Register SOffset;
5275 int64_t Offset = 0;
5276
5277 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5278 return {};
5279
5280 return {{
5281 [=](MachineInstrBuilder &MIB) { // rsrc
5282 MIB.addReg(RSrcReg);
5283 },
5284 [=](MachineInstrBuilder &MIB) { // soffset
5285 if (SOffset)
5286 MIB.addReg(SOffset);
5287 else if (STI.hasRestrictedSOffset())
5288 MIB.addReg(AMDGPU::SGPR_NULL);
5289 else
5290 MIB.addImm(0);
5291 },
5292 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5293 addZeroImm, // cpol
5294 addZeroImm, // tfe
5295 addZeroImm, // swz
5296 }};
5297}
5298
5299InstructionSelector::ComplexRendererFns
5300AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5301
5302 Register SOffset = Root.getReg();
5303
5304 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5305 SOffset = AMDGPU::SGPR_NULL;
5306
5307 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5308}
5309
5310/// Get an immediate that must be 32-bits, and treated as zero extended.
5311static std::optional<uint64_t>
5312getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5313 // getIConstantVRegVal sexts any values, so see if that matters.
5314 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5315 if (!OffsetVal || !isInt<32>(*OffsetVal))
5316 return std::nullopt;
5317 return Lo_32(*OffsetVal);
5318}
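A standalone illustration of the sign-extension caveat noted above: a constant read back as a sign-extended 64-bit value only yields a usable zero-extended 32-bit immediate when it fits in 32 bits as a signed value. The helper below is a sketch, not part of this file.

#include <cstdint>
#include <optional>

static std::optional<uint64_t> toZext32(int64_t SExtVal) {
  if (SExtVal < INT32_MIN || SExtVal > INT32_MAX)
    return std::nullopt;
  return static_cast<uint32_t>(SExtVal); // keep only the low 32 bits
}
// e.g. toZext32(-1) yields 0xFFFFFFFF, while toZext32(1LL << 40) yields nullopt.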
5319
5320InstructionSelector::ComplexRendererFns
5321AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5322 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5323 if (!OffsetVal)
5324 return {};
5325
5326 std::optional<int64_t> EncodedImm =
5327 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5328 if (!EncodedImm)
5329 return {};
5330
5331 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5332}
5333
5334InstructionSelector::ComplexRendererFns
5335AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5336 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5337
5338 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5339 if (!OffsetVal)
5340 return {};
5341
5342 std::optional<int64_t> EncodedImm =
5343 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5344 if (!EncodedImm)
5345 return {};
5346
5347 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5348}
5349
5350InstructionSelector::ComplexRendererFns
5351AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5352 // Match the (soffset + offset) pair as a 32-bit register base and
5353 // an immediate offset.
5354 Register SOffset;
5355 unsigned Offset;
5356 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5357 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5358 if (!SOffset)
5359 return std::nullopt;
5360
5361 std::optional<int64_t> EncodedOffset =
5362 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5363 if (!EncodedOffset)
5364 return std::nullopt;
5365
5366 assert(MRI->getType(SOffset) == LLT::scalar(32));
5367 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5368 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5369}
5370
5371// Variant of stripBitCast that returns the instruction instead of a
5372// MachineOperand.
5373static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5374 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5375 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5376 return MI;
5377}
5378
5379// Figure out if this is really an extract of the high 16-bits of a dword,
5380// returns nullptr if it isn't.
5381static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5382 MachineRegisterInfo &MRI) {
5383 Inst = stripBitCast(Inst, MRI);
5384
5385 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5386 return nullptr;
5387
5388 MachineInstr *TruncOp =
5389 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5390 TruncOp = stripBitCast(TruncOp, MRI);
5391
5392 // G_LSHR x, (G_CONSTANT i32 16)
5393 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5394 auto SrlAmount = getIConstantVRegValWithLookThrough(
5395 TruncOp->getOperand(2).getReg(), MRI);
5396 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5397 MachineInstr *SrlOp =
5398 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5399 return stripBitCast(SrlOp, MRI);
5400 }
5401 }
5402
5403 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5404 // 1, 0 swaps the low/high 16 bits.
5405 // 1, 1 sets the high 16 bits to be the same as the low 16.
5406 // In either case, it selects the high 16-bit element.
5407 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5408 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5409 LLT::fixed_vector(2, 16));
5410
5411 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5412 assert(Mask.size() == 2);
5413
5414 if (Mask[0] == 1 && Mask[1] <= 1) {
5415 MachineInstr *LHS =
5416 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5417 return stripBitCast(LHS, MRI);
5418 }
5419 }
5420
5421 return nullptr;
5422}
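A standalone check of the shuffle-mask reasoning in the comments above, on a <2 x s16> value packed into 32 bits (names and values invented): whenever mask[0] == 1, lane 0 of the result is the high 16 bits of the first source, which is all the following trunc to s16 reads.

#include <cassert>
#include <cstdint>

static uint16_t lane(uint32_t Packed, int Idx) {
  return static_cast<uint16_t>(Packed >> (16 * Idx));
}

int main() {
  uint32_t V = 0xBEEF1234;      // lane 1 = 0xBEEF (high), lane 0 = 0x1234
  assert(lane(V, 1) == 0xBEEF); // mask (1,0) or (1,1): result lane 0 is high
  assert(lane(V, 0) == 0x1234); // a mask starting with 0 would not match
}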
5423
5424std::pair<Register, unsigned>
5425AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5426 bool &Matched) const {
5427 Matched = false;
5428
5429 Register Src;
5430 unsigned Mods;
5431 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5432
5433 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5434 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5435 MachineOperand *MO = &MI->getOperand(1);
5436 Src = MO->getReg();
5437 MI = getDefIgnoringCopies(Src, *MRI);
5438
5439 assert(MRI->getType(Src) == LLT::scalar(16));
5440
5441 // See through bitcasts.
5442 // FIXME: Would be nice to use stripBitCast here.
5443 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5444 MO = &MI->getOperand(1);
5445 Src = MO->getReg();
5446 MI = getDefIgnoringCopies(Src, *MRI);
5447 }
5448
5449 const auto CheckAbsNeg = [&]() {
5450 // Be careful about folding modifiers if we already have an abs. fneg is
5451 // applied last, so we don't want to apply an earlier fneg.
5452 if ((Mods & SISrcMods::ABS) == 0) {
5453 unsigned ModsTmp;
5454 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5455 MI = getDefIgnoringCopies(Src, *MRI);
5456
5457 if ((ModsTmp & SISrcMods::NEG) != 0)
5458 Mods ^= SISrcMods::NEG;
5459
5460 if ((ModsTmp & SISrcMods::ABS) != 0)
5461 Mods |= SISrcMods::ABS;
5462 }
5463 };
5464
5465 CheckAbsNeg();
5466
5467 // op_sel/op_sel_hi decide the source type and source.
5468 // If the source's op_sel_hi is set, it indicates to do a conversion from
5469 // fp16. If the source's op_sel is set, it picks the high half of the
5470 // source register.
5471
5472 Mods |= SISrcMods::OP_SEL_1;
5473
5474 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5475 Mods |= SISrcMods::OP_SEL_0;
5476 MI = ExtractHiEltMI;
5477 MO = &MI->getOperand(0);
5478 Src = MO->getReg();
5479
5480 CheckAbsNeg();
5481 }
5482
5483 Matched = true;
5484 }
5485
5486 return {Src, Mods};
5487}
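An illustrative sketch of how the modifier bits are composed above. The flag values here are hypothetical placeholders, not the real SISrcMods encoding: op_sel_hi marks an f16 source that needs conversion, op_sel additionally selects its high half, and a folded fneg toggles the neg bit.

namespace sketch {
enum : unsigned {
  NEG = 1u << 0,
  ABS = 1u << 1,
  OP_SEL_0 = 1u << 2,
  OP_SEL_1 = 1u << 3
};

static unsigned madMixModsForF16(bool IsHighHalf, bool FoldedFNeg) {
  unsigned Mods = OP_SEL_1; // f16 source: convert to f32
  if (IsHighHalf)
    Mods |= OP_SEL_0;       // read the high 16 bits of the register
  if (FoldedFNeg)
    Mods ^= NEG;            // fneg is applied last, so it toggles the sign bit
  return Mods;
}
} // namespace sketch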
5488
5489InstructionSelector::ComplexRendererFns
5490AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5491 MachineOperand &Root) const {
5492 Register Src;
5493 unsigned Mods;
5494 bool Matched;
5495 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5496 if (!Matched)
5497 return {};
5498
5499 return {{
5500 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5501 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5502 }};
5503}
5504
5505InstructionSelector::ComplexRendererFns
5506AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5507 Register Src;
5508 unsigned Mods;
5509 bool Matched;
5510 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5511
5512 return {{
5513 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5514 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5515 }};
5516}
5517
5518bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5519 MachineInstr &I, Intrinsic::ID IntrID) const {
5520 MachineBasicBlock *MBB = I.getParent();
5521 const DebugLoc &DL = I.getDebugLoc();
5522 Register CCReg = I.getOperand(0).getReg();
5523
5524 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5525
5526 if (HasM0) {
5527 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5528 .addReg(I.getOperand(2).getReg());
5529 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5530 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5531 return false;
5532 } else {
5533 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5534 .addImm(I.getOperand(2).getImm());
5535 }
5536
5537 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5538
5539 I.eraseFromParent();
5540 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5541 *MRI);
5542}
5543
5544unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5545 if (HasInlineConst) {
5546 switch (IntrID) {
5547 default:
5548 llvm_unreachable("not a named barrier op");
5549 case Intrinsic::amdgcn_s_barrier_init:
5550 return AMDGPU::S_BARRIER_INIT_IMM;
5551 case Intrinsic::amdgcn_s_barrier_join:
5552 return AMDGPU::S_BARRIER_JOIN_IMM;
5553 case Intrinsic::amdgcn_s_wakeup_barrier:
5554 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5555 case Intrinsic::amdgcn_s_get_barrier_state:
5556 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5557 };
5558 } else {
5559 switch (IntrID) {
5560 default:
5561 llvm_unreachable("not a named barrier op");
5562 case Intrinsic::amdgcn_s_barrier_init:
5563 return AMDGPU::S_BARRIER_INIT_M0;
5564 case Intrinsic::amdgcn_s_barrier_join:
5565 return AMDGPU::S_BARRIER_JOIN_M0;
5566 case Intrinsic::amdgcn_s_wakeup_barrier:
5567 return AMDGPU::S_WAKEUP_BARRIER_M0;
5568 case Intrinsic::amdgcn_s_get_barrier_state:
5569 return AMDGPU::S_GET_BARRIER_STATE_M0;
5570 };
5571 }
5572}
5573
5574bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5575 MachineInstr &I, Intrinsic::ID IntrID) const {
5576 MachineBasicBlock *MBB = I.getParent();
5577 const DebugLoc &DL = I.getDebugLoc();
5578 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5579 ? I.getOperand(2)
5580 : I.getOperand(1);
5581 std::optional<int64_t> BarValImm =
5582 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5583 Register M0Val;
5584 Register TmpReg0;
5585
5586 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5587 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5588 Register MemberCount = I.getOperand(2).getReg();
5589 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5590 // TODO: This should be expanded during legalization so that the S_LSHL
5591 // and S_OR can be constant-folded
5592 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5593 .addImm(16)
5594 .addReg(MemberCount);
5595 M0Val = TmpReg0;
5596 }
5597
5598 // If not inlinable, get reference to barrier depending on the instruction
5599 if (!BarValImm) {
5600 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5601 // If reference to barrier id is not an inlinable constant then it must be
5602 // referenced with M0[4:0]. Perform an OR with the member count to include
5603 // it in M0 for S_BARRIER_INIT.
5604 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5605 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5606 .addReg(BarOp.getReg())
5607 .addReg(TmpReg0);
5608 M0Val = TmpReg1;
5609 } else {
5610 M0Val = BarOp.getReg();
5611 }
5612 }
5613
5614 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5615 if (M0Val) {
5616 auto CopyMIB =
5617 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5618 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5619 }
5620
5621 MachineInstrBuilder MIB;
5622 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5623 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5624
5625 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5626 MIB.addDef(I.getOperand(0).getReg());
5627
5628 if (BarValImm)
5629 MIB.addImm(*BarValImm);
5630
5631 I.eraseFromParent();
5632 return true;
5633}
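A standalone sketch of the M0 packing described in the comments above for the non-inline-constant S_BARRIER_INIT path: the member count is shifted into bits [22:16] and the barrier id occupies the low bits of M0. The explicit mask below is illustrative.

#include <cassert>
#include <cstdint>

static uint32_t packBarrierInitM0(uint32_t MemberCount, uint32_t BarrierId) {
  return (MemberCount << 16) | (BarrierId & 0x1f);
}

int main() {
  assert(packBarrierInitM0(8, 3) == ((8u << 16) | 3u));
}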
5634
5635bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5636 MachineBasicBlock *BB = I.getParent();
5637 const DebugLoc &DL = I.getDebugLoc();
5638 Register CCReg = I.getOperand(0).getReg();
5639
5640 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5641 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5642
5643 I.eraseFromParent();
5644 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5645 *MRI);
5646}
5647
5648void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5649 const MachineInstr &MI,
5650 int OpIdx) const {
5651 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5652 "Expected G_CONSTANT");
5653 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5654}
5655
5656void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5657 const MachineInstr &MI,
5658 int OpIdx) const {
5659 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5660 "Expected G_CONSTANT");
5661 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5662}
5663
5664void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5665 const MachineInstr &MI,
5666 int OpIdx) const {
5667 assert(OpIdx == -1);
5668
5669 const MachineOperand &Op = MI.getOperand(1);
5670 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5671 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5672 else {
5673 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5674 MIB.addImm(Op.getCImm()->getSExtValue());
5675 }
5676}
5677
5678void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5679 const MachineInstr &MI,
5680 int OpIdx) const {
5681 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5682 "Expected G_CONSTANT");
5683 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5684}
5685
5686/// This only really exists to satisfy DAG type checking machinery, so is a
5687/// no-op here.
5688void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5689 const MachineInstr &MI,
5690 int OpIdx) const {
5691 MIB.addImm(MI.getOperand(OpIdx).getImm());
5692}
5693
5694void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5695 const MachineInstr &MI,
5696 int OpIdx) const {
5697 assert(OpIdx >= 0 && "expected to match an immediate operand");
5698 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5699}
5700
5701void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5702 const MachineInstr &MI,
5703 int OpIdx) const {
5704 assert(OpIdx >= 0 && "expected to match an immediate operand");
5705 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5706 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5707 : AMDGPU::CPol::ALL_pregfx12));
5708}
5709
5710void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5711 const MachineInstr &MI,
5712 int OpIdx) const {
5713 assert(OpIdx >= 0 && "expected to match an immediate operand");
5714 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5715 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5716 : AMDGPU::CPol::SWZ_pregfx12);
5717 MIB.addImm(Swizzle);
5718}
5719
5720void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5721 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5722 assert(OpIdx >= 0 && "expected to match an immediate operand");
5723 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5724 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5725 : AMDGPU::CPol::ALL_pregfx12);
5726 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5727}
5728
5729void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5730 const MachineInstr &MI,
5731 int OpIdx) const {
5732 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5733}
5734
5735void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5736 const MachineInstr &MI,
5737 int OpIdx) const {
5738 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5739 int ExpVal = APF.getExactLog2Abs();
5740 assert(ExpVal != INT_MIN);
5741 MIB.addImm(ExpVal);
5742}
5743
5744bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5745 return TII.isInlineConstant(Imm);
5746}
5747
5748bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5749 return TII.isInlineConstant(Imm);
5750}