AMDGPUInstructionSelector.cpp
1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
15#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 Subtarget->checkSubtargetFeatures(MF.getFunction());
67 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 bool IsSGPR = TRI.isSGPRClass(SrcRC);
165 unsigned AndOpc =
166 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168 .addImm(1)
169 .addReg(SrcReg);
170 if (IsSGPR)
171 And.setOperandDead(3); // Dead scc
172
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174 .addImm(0)
175 .addReg(MaskedReg);
176 }
177
178 if (!MRI->getRegClassOrNull(SrcReg))
179 MRI->setRegClass(SrcReg, SrcRC);
180 I.eraseFromParent();
181 return true;
182 }
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
186 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
187 return false;
188
189 return true;
190 }
191
192 for (const MachineOperand &MO : I.operands()) {
193 if (MO.getReg().isPhysical())
194 continue;
195
196 const TargetRegisterClass *RC =
197 TRI.getConstrainedRegClassForOperand(MO, *MRI);
198 if (!RC)
199 continue;
200 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
201 }
202 return true;
203}
204
205bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
206 const Register DefReg = I.getOperand(0).getReg();
207 const LLT DefTy = MRI->getType(DefReg);
208
209 // S1 G_PHIs should not be selected in instruction-select, instead:
210 // - divergent S1 G_PHI should go through lane mask merging algorithm
211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
213 if (DefTy == LLT::scalar(1))
214 return false;
215
216 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
217
218 const RegClassOrRegBank &RegClassOrBank =
219 MRI->getRegClassOrRegBank(DefReg);
220
221 const TargetRegisterClass *DefRC
222 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
223 if (!DefRC) {
224 if (!DefTy.isValid()) {
225 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
226 return false;
227 }
228
229 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
230 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
231 if (!DefRC) {
232 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
233 return false;
234 }
235 }
236
237 // TODO: Verify that all registers have the same bank
238 I.setDesc(TII.get(TargetOpcode::PHI));
239 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
240}
241
242MachineOperand
243AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
244 const TargetRegisterClass &SubRC,
245 unsigned SubIdx) const {
246
247 MachineInstr *MI = MO.getParent();
248 MachineBasicBlock *BB = MO.getParent()->getParent();
249 Register DstReg = MRI->createVirtualRegister(&SubRC);
250
251 if (MO.isReg()) {
252 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
253 Register Reg = MO.getReg();
254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
255 .addReg(Reg, 0, ComposedSubIdx);
256
257 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
258 MO.isKill(), MO.isDead(), MO.isUndef(),
259 MO.isEarlyClobber(), 0, MO.isDebug(),
260 MO.isInternalRead());
261 }
262
263 assert(MO.isImm());
264
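  // Illustrative example: for a 64-bit immediate 0x1111222233334444,
  // requesting AMDGPU::sub0 below produces an immediate operand holding the
  // low 32 bits (0x33334444) and AMDGPU::sub1 the high 32 bits (0x11112222).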
265 APInt Imm(64, MO.getImm());
266
267 switch (SubIdx) {
268 default:
269 llvm_unreachable("do not know how to split immediate with this sub index.");
270 case AMDGPU::sub0:
271 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
272 case AMDGPU::sub1:
273 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
274 }
275}
276
277static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
278 switch (Opc) {
279 case AMDGPU::G_AND:
280 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
281 case AMDGPU::G_OR:
282 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
283 case AMDGPU::G_XOR:
284 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
285 default:
286 llvm_unreachable("not a bit op");
287 }
288}
289
290bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
293
294 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
295 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296 DstRB->getID() != AMDGPU::VCCRegBankID)
297 return false;
298
299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
300 STI.isWave64());
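  // Illustrative example: an s1 G_AND on the VCC bank in wave64 takes the
  // 64-bit form (S_AND_B64 on the full lane mask), while an s32 G_AND on the
  // SGPR bank takes S_AND_B32.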
301 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
302
303 // Dead implicit-def of scc
304 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
305 true, // isImp
306 false, // isKill
307 true)); // isDead
308 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
309}
310
311bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
312 MachineBasicBlock *BB = I.getParent();
313 MachineFunction *MF = BB->getParent();
314 Register DstReg = I.getOperand(0).getReg();
315 const DebugLoc &DL = I.getDebugLoc();
316 LLT Ty = MRI->getType(DstReg);
317 if (Ty.isVector())
318 return false;
319
320 unsigned Size = Ty.getSizeInBits();
321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
323 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
324
325 if (Size == 32) {
326 if (IsSALU) {
327 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
328 MachineInstr *Add =
329 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
330 .add(I.getOperand(1))
331 .add(I.getOperand(2))
332 .setOperandDead(3); // Dead scc
333 I.eraseFromParent();
334 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
335 }
336
337 if (STI.hasAddNoCarry()) {
338 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
339 I.setDesc(TII.get(Opc));
340 I.addOperand(*MF, MachineOperand::CreateImm(0));
341 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
342 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
343 }
344
345 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
346
347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
348 MachineInstr *Add
349 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
350 .addDef(UnusedCarry, RegState::Dead)
351 .add(I.getOperand(1))
352 .add(I.getOperand(2))
353 .addImm(0);
354 I.eraseFromParent();
355 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
356 }
357
358 assert(!Sub && "illegal sub should not reach here");
359
360 const TargetRegisterClass &RC
361 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
362 const TargetRegisterClass &HalfRC
363 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
364
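  // Sketch of the 64-bit expansion emitted below (illustrative): the low
  // halves are added with S_ADD_U32 (SALU) or V_ADD_CO_U32_e64 (VALU), the
  // high halves with S_ADDC_U32 or V_ADDC_U32_e64 consuming the carry, and a
  // REG_SEQUENCE recombines the two 32-bit results into the 64-bit DstReg.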
365 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
366 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
367 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
368 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
369
370 Register DstLo = MRI->createVirtualRegister(&HalfRC);
371 Register DstHi = MRI->createVirtualRegister(&HalfRC);
372
373 if (IsSALU) {
374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
375 .add(Lo1)
376 .add(Lo2);
377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
378 .add(Hi1)
379 .add(Hi2)
380 .setOperandDead(3); // Dead scc
381 } else {
382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
383 Register CarryReg = MRI->createVirtualRegister(CarryRC);
384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
385 .addDef(CarryReg)
386 .add(Lo1)
387 .add(Lo2)
388 .addImm(0);
389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
391 .add(Hi1)
392 .add(Hi2)
393 .addReg(CarryReg, RegState::Kill)
394 .addImm(0);
395
396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
397 return false;
398 }
399
400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
401 .addReg(DstLo)
402 .addImm(AMDGPU::sub0)
403 .addReg(DstHi)
404 .addImm(AMDGPU::sub1);
405
406
407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
408 return false;
409
410 I.eraseFromParent();
411 return true;
412}
413
414bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
415 MachineInstr &I) const {
416 MachineBasicBlock *BB = I.getParent();
417 MachineFunction *MF = BB->getParent();
418 const DebugLoc &DL = I.getDebugLoc();
419 Register Dst0Reg = I.getOperand(0).getReg();
420 Register Dst1Reg = I.getOperand(1).getReg();
421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
422 I.getOpcode() == AMDGPU::G_UADDE;
423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
424 I.getOpcode() == AMDGPU::G_USUBE;
425
426 if (isVCC(Dst1Reg, *MRI)) {
427 unsigned NoCarryOpc =
428 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
429 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
430 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
431 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
432 I.addOperand(*MF, MachineOperand::CreateImm(0));
433 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
434 }
435
436 Register Src0Reg = I.getOperand(2).getReg();
437 Register Src1Reg = I.getOperand(3).getReg();
438
439 if (HasCarryIn) {
440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
441 .addReg(I.getOperand(4).getReg());
442 }
443
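  // Scalar path: the carry is modeled through SCC. S_ADD_U32/S_SUB_U32 define
  // SCC, and S_ADDC_U32/S_SUBB_U32 additionally consume it, which is why the
  // carry-in (if any) was copied into SCC above.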
444 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
445 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
446
447 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
448 .add(I.getOperand(2))
449 .add(I.getOperand(3));
450
451 if (MRI->use_nodbg_empty(Dst1Reg)) {
452 CarryInst.setOperandDead(3); // Dead scc
453 } else {
454 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
455 .addReg(AMDGPU::SCC);
456 if (!MRI->getRegClassOrNull(Dst1Reg))
457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
458 }
459
460 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
462 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
463 return false;
464
465 if (HasCarryIn &&
466 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
467 AMDGPU::SReg_32RegClass, *MRI))
468 return false;
469
470 I.eraseFromParent();
471 return true;
472}
473
474bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
475 MachineInstr &I) const {
476 MachineBasicBlock *BB = I.getParent();
477 MachineFunction *MF = BB->getParent();
478 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
479
480 unsigned Opc;
481 if (Subtarget->hasMADIntraFwdBug())
482 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
483 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
484 else
485 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
486 I.setDesc(TII.get(Opc));
487 I.addOperand(*MF, MachineOperand::CreateImm(0));
488 I.addImplicitDefUseOperands(*MF);
489 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
490}
491
492// TODO: We should probably legalize these to only using 32-bit results.
493bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
494 MachineBasicBlock *BB = I.getParent();
495 Register DstReg = I.getOperand(0).getReg();
496 Register SrcReg = I.getOperand(1).getReg();
497 LLT DstTy = MRI->getType(DstReg);
498 LLT SrcTy = MRI->getType(SrcReg);
499 const unsigned SrcSize = SrcTy.getSizeInBits();
500 unsigned DstSize = DstTy.getSizeInBits();
501
502 // TODO: Should handle any multiple of 32 offset.
503 unsigned Offset = I.getOperand(2).getImm();
504 if (Offset % 32 != 0 || DstSize > 128)
505 return false;
506
507 // 16-bit operations really use 32-bit registers.
508 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
509 if (DstSize == 16)
510 DstSize = 32;
511
512 const TargetRegisterClass *DstRC =
513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
515 return false;
516
517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
518 const TargetRegisterClass *SrcRC =
519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
520 if (!SrcRC)
521 return false;
522 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
523 DstSize / 32);
524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
525 if (!SrcRC)
526 return false;
527
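  // Illustrative example: extracting bits [64, 96) of a 128-bit source gives
  // Offset / 32 == 2 and DstSize / 32 == 1, so SubReg is AMDGPU::sub2 and the
  // extract is emitted as a plain COPY of that subregister.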
528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
529 *SrcRC, I.getOperand(1));
530 const DebugLoc &DL = I.getDebugLoc();
531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
532 .addReg(SrcReg, 0, SubReg);
533
534 I.eraseFromParent();
535 return true;
536}
537
538bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
539 MachineBasicBlock *BB = MI.getParent();
540 Register DstReg = MI.getOperand(0).getReg();
541 LLT DstTy = MRI->getType(DstReg);
542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
543
544 const unsigned SrcSize = SrcTy.getSizeInBits();
545 if (SrcSize < 32)
546 return selectImpl(MI, *CoverageInfo);
547
548 const DebugLoc &DL = MI.getDebugLoc();
549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
550 const unsigned DstSize = DstTy.getSizeInBits();
551 const TargetRegisterClass *DstRC =
552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
553 if (!DstRC)
554 return false;
555
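  // Illustrative example: merging two s32 sources into an s64 destination
  // asks getRegSplitParts for 4-byte pieces, yielding {sub0, sub1}; the
  // REG_SEQUENCE below then places source 0 in sub0 and source 1 in sub1.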
556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
557 MachineInstrBuilder MIB =
558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
560 MachineOperand &Src = MI.getOperand(I + 1);
561 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
562 MIB.addImm(SubRegs[I]);
563
564 const TargetRegisterClass *SrcRC
565 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
567 return false;
568 }
569
570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
571 return false;
572
573 MI.eraseFromParent();
574 return true;
575}
576
577bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
578 MachineBasicBlock *BB = MI.getParent();
579 const int NumDst = MI.getNumOperands() - 1;
580
581 MachineOperand &Src = MI.getOperand(NumDst);
582
583 Register SrcReg = Src.getReg();
584 Register DstReg0 = MI.getOperand(0).getReg();
585 LLT DstTy = MRI->getType(DstReg0);
586 LLT SrcTy = MRI->getType(SrcReg);
587
588 const unsigned DstSize = DstTy.getSizeInBits();
589 const unsigned SrcSize = SrcTy.getSizeInBits();
590 const DebugLoc &DL = MI.getDebugLoc();
591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
592
593 const TargetRegisterClass *SrcRC =
594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
596 return false;
597
598 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
599 // source, and this relies on the fact that the same subregister indices are
600 // used for both.
601 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
602 for (int I = 0, E = NumDst; I != E; ++I) {
603 MachineOperand &Dst = MI.getOperand(I);
604 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
605 .addReg(SrcReg, 0, SubRegs[I]);
606
607 // Make sure the subregister index is valid for the source register.
608 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
609 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
610 return false;
611
612 const TargetRegisterClass *DstRC =
613 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
614 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
615 return false;
616 }
617
618 MI.eraseFromParent();
619 return true;
620}
621
622bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
623 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
624 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
625
626 Register Src0 = MI.getOperand(1).getReg();
627 Register Src1 = MI.getOperand(2).getReg();
628 LLT SrcTy = MRI->getType(Src0);
629 const unsigned SrcSize = SrcTy.getSizeInBits();
630
631 // A G_BUILD_VECTOR with >= 32-bit sources is handled as a G_MERGE_VALUES.
632 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
633 return selectG_MERGE_VALUES(MI);
634 }
635
636 // Selection logic below is for V2S16 only.
637 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
638 Register Dst = MI.getOperand(0).getReg();
639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
640 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
641 SrcTy != LLT::scalar(32)))
642 return selectImpl(MI, *CoverageInfo);
643
644 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
645 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
646 return false;
647
648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649 DstBank->getID() == AMDGPU::VGPRRegBankID);
650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
651
652 const DebugLoc &DL = MI.getDebugLoc();
653 MachineBasicBlock *BB = MI.getParent();
654
655 // First, before trying TableGen patterns, check if both sources are
656 // constants. In those cases, we can trivially compute the final constant
657 // and emit a simple move.
658 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
659 if (ConstSrc1) {
660 auto ConstSrc0 =
661 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
662 if (ConstSrc0) {
663 const int64_t K0 = ConstSrc0->Value.getSExtValue();
664 const int64_t K1 = ConstSrc1->Value.getSExtValue();
665 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
666 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
667 uint32_t Imm = Lo16 | (Hi16 << 16);
668
669 // VALU
670 if (IsVector) {
671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
672 MI.eraseFromParent();
673 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
674 }
675
676 // SALU
677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
678 MI.eraseFromParent();
679 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
680 }
681 }
682
683 // Now try TableGen patterns.
684 if (selectImpl(MI, *CoverageInfo))
685 return true;
686
687 // TODO: This should probably be a combine somewhere
688 // (build_vector $src0, undef) -> copy $src0
689 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
691 MI.setDesc(TII.get(AMDGPU::COPY));
692 MI.removeOperand(2);
693 const auto &RC =
694 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
695 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
696 RBI.constrainGenericRegister(Src0, RC, *MRI);
697 }
698
699 // TODO: Can be improved?
700 if (IsVector) {
701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
702 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
703 .addImm(0xFFFF)
704 .addReg(Src0);
705 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
706 return false;
707
708 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
709 .addReg(Src1)
710 .addImm(16)
711 .addReg(TmpReg);
712 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
713 return false;
714
715 MI.eraseFromParent();
716 return true;
717 }
718
719 Register ShiftSrc0;
720 Register ShiftSrc1;
721
722 // With multiple uses of the shift, this will duplicate the shift and
723 // increase register pressure.
724 //
725 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
726 // => (S_PACK_HH_B32_B16 $src0, $src1)
727 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
728 // => (S_PACK_HL_B32_B16 $src0, $src1)
729 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
730 // => (S_PACK_LH_B32_B16 $src0, $src1)
731 // (build_vector $src0, $src1)
732 // => (S_PACK_LL_B32_B16 $src0, $src1)
733
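  // Illustrative example: if operand 2 is the sole use of a 16-bit G_LSHR of
  // some %b and operand 1 is some %a, the Shift1 case below matches and emits
  // S_PACK_LH_B32_B16 %a, %b, packing the low 16 bits of %a with the high 16
  // bits of %b.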
734 bool Shift0 = mi_match(
735 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
736
737 bool Shift1 = mi_match(
738 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
739
740 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
741 if (Shift0 && Shift1) {
742 Opc = AMDGPU::S_PACK_HH_B32_B16;
743 MI.getOperand(1).setReg(ShiftSrc0);
744 MI.getOperand(2).setReg(ShiftSrc1);
745 } else if (Shift1) {
746 Opc = AMDGPU::S_PACK_LH_B32_B16;
747 MI.getOperand(2).setReg(ShiftSrc1);
748 } else if (Shift0) {
749 auto ConstSrc1 =
750 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
751 if (ConstSrc1 && ConstSrc1->Value == 0) {
752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
753 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
754 .addReg(ShiftSrc0)
755 .addImm(16)
756 .setOperandDead(3); // Dead scc
757
758 MI.eraseFromParent();
759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
760 }
761 if (STI.hasSPackHL()) {
762 Opc = AMDGPU::S_PACK_HL_B32_B16;
763 MI.getOperand(1).setReg(ShiftSrc0);
764 }
765 }
766
767 MI.setDesc(TII.get(Opc));
768 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
769}
770
771bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
772 const MachineOperand &MO = I.getOperand(0);
773
774 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
775 // regbank check here is to know why getConstrainedRegClassForOperand failed.
776 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
777 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
778 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
779 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
780 return true;
781 }
782
783 return false;
784}
785
786bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
787 MachineBasicBlock *BB = I.getParent();
788
789 Register DstReg = I.getOperand(0).getReg();
790 Register Src0Reg = I.getOperand(1).getReg();
791 Register Src1Reg = I.getOperand(2).getReg();
792 LLT Src1Ty = MRI->getType(Src1Reg);
793
794 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
795 unsigned InsSize = Src1Ty.getSizeInBits();
796
797 int64_t Offset = I.getOperand(3).getImm();
798
799 // FIXME: These cases should have been illegal and unnecessary to check here.
800 if (Offset % 32 != 0 || InsSize % 32 != 0)
801 return false;
802
803 // Currently not handled by getSubRegFromChannel.
804 if (InsSize > 128)
805 return false;
806
807 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
808 if (SubReg == AMDGPU::NoSubRegister)
809 return false;
810
811 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
812 const TargetRegisterClass *DstRC =
813 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
814 if (!DstRC)
815 return false;
816
817 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
818 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
819 const TargetRegisterClass *Src0RC =
820 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
821 const TargetRegisterClass *Src1RC =
822 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
823
824 // Deal with weird cases where the class only partially supports the subreg
825 // index.
826 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
827 if (!Src0RC || !Src1RC)
828 return false;
829
830 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
831 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
832 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
833 return false;
834
835 const DebugLoc &DL = I.getDebugLoc();
836 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
837 .addReg(Src0Reg)
838 .addReg(Src1Reg)
839 .addImm(SubReg);
840
841 I.eraseFromParent();
842 return true;
843}
844
845bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
846 Register DstReg = MI.getOperand(0).getReg();
847 Register SrcReg = MI.getOperand(1).getReg();
848 Register OffsetReg = MI.getOperand(2).getReg();
849 Register WidthReg = MI.getOperand(3).getReg();
850
851 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
852 "scalar BFX instructions are expanded in regbankselect");
853 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
854 "64-bit vector BFX instructions are expanded in regbankselect");
855
856 const DebugLoc &DL = MI.getDebugLoc();
857 MachineBasicBlock *MBB = MI.getParent();
858
859 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
860 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
861 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
862 .addReg(SrcReg)
863 .addReg(OffsetReg)
864 .addReg(WidthReg);
865 MI.eraseFromParent();
866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
867}
868
869bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
870 if (STI.getLDSBankCount() != 16)
871 return selectImpl(MI, *CoverageInfo);
872
873 Register Dst = MI.getOperand(0).getReg();
874 Register Src0 = MI.getOperand(2).getReg();
875 Register M0Val = MI.getOperand(6).getReg();
876 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
877 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
878 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
879 return false;
880
881 // This requires 2 instructions. It is possible to write a pattern to support
882 // this, but the generated isel emitter doesn't correctly deal with multiple
883 // output instructions using the same physical register input. The copy to m0
884 // is incorrectly placed before the second instruction.
885 //
886 // TODO: Match source modifiers.
887
888 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
889 const DebugLoc &DL = MI.getDebugLoc();
890 MachineBasicBlock *MBB = MI.getParent();
891
892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
893 .addReg(M0Val);
894 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
895 .addImm(2)
896 .addImm(MI.getOperand(4).getImm()) // $attr
897 .addImm(MI.getOperand(3).getImm()); // $attrchan
898
899 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
900 .addImm(0) // $src0_modifiers
901 .addReg(Src0) // $src0
902 .addImm(MI.getOperand(4).getImm()) // $attr
903 .addImm(MI.getOperand(3).getImm()) // $attrchan
904 .addImm(0) // $src2_modifiers
905 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
906 .addImm(MI.getOperand(5).getImm()) // $high
907 .addImm(0) // $clamp
908 .addImm(0); // $omod
909
910 MI.eraseFromParent();
911 return true;
912}
913
914// Writelane is special in that it can use SGPR and M0 (which would normally
915// count as using the constant bus twice - but in this case it is allowed since
916// the lane selector doesn't count as a use of the constant bus). However, it is
917// still required to abide by the 1 SGPR rule. Fix this up if we might have
918// multiple SGPRs.
919bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
920 // With a constant bus limit of at least 2, there's no issue.
921 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
922 return selectImpl(MI, *CoverageInfo);
923
924 MachineBasicBlock *MBB = MI.getParent();
925 const DebugLoc &DL = MI.getDebugLoc();
926 Register VDst = MI.getOperand(0).getReg();
927 Register Val = MI.getOperand(2).getReg();
928 Register LaneSelect = MI.getOperand(3).getReg();
929 Register VDstIn = MI.getOperand(4).getReg();
930
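  // Illustrative note: with a constant bus limit of 1, an SGPR value plus an
  // SGPR lane select would need two constant bus reads, so unless one operand
  // can be encoded as an inline immediate the lane select is routed through
  // M0 below (per the comment above, M0 is not counted for this instruction).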
931 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
932
933 std::optional<ValueAndVReg> ConstSelect =
934 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
935 if (ConstSelect) {
936 // The selector has to be an inline immediate, so we can use whatever for
937 // the other operands.
938 MIB.addReg(Val);
939 MIB.addImm(ConstSelect->Value.getSExtValue() &
940 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
941 } else {
942 std::optional<ValueAndVReg> ConstVal =
943 getIConstantVRegValWithLookThrough(Val, *MRI);
944
945 // If the value written is an inline immediate, we can get away without a
946 // copy to m0.
947 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
948 STI.hasInv2PiInlineImm())) {
949 MIB.addImm(ConstVal->Value.getSExtValue());
950 MIB.addReg(LaneSelect);
951 } else {
952 MIB.addReg(Val);
953
954 // If the lane selector was originally in a VGPR and copied with
955 // readfirstlane, there's a hazard to read the same SGPR from the
956 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
957 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
958
959 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
960 .addReg(LaneSelect);
961 MIB.addReg(AMDGPU::M0);
962 }
963 }
964
965 MIB.addReg(VDstIn);
966
967 MI.eraseFromParent();
968 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
969}
970
971// We need to handle this here because tablegen doesn't support matching
972// instructions with multiple outputs.
973bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
974 Register Dst0 = MI.getOperand(0).getReg();
975 Register Dst1 = MI.getOperand(1).getReg();
976
977 LLT Ty = MRI->getType(Dst0);
978 unsigned Opc;
979 if (Ty == LLT::scalar(32))
980 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
981 else if (Ty == LLT::scalar(64))
982 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
983 else
984 return false;
985
986 // TODO: Match source modifiers.
987
988 const DebugLoc &DL = MI.getDebugLoc();
989 MachineBasicBlock *MBB = MI.getParent();
990
991 Register Numer = MI.getOperand(3).getReg();
992 Register Denom = MI.getOperand(4).getReg();
993 unsigned ChooseDenom = MI.getOperand(5).getImm();
994
995 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
996
997 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
998 .addDef(Dst1)
999 .addImm(0) // $src0_modifiers
1000 .addUse(Src0) // $src0
1001 .addImm(0) // $src1_modifiers
1002 .addUse(Denom) // $src1
1003 .addImm(0) // $src2_modifiers
1004 .addUse(Numer) // $src2
1005 .addImm(0) // $clamp
1006 .addImm(0); // $omod
1007
1008 MI.eraseFromParent();
1009 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1010}
1011
1012bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1013 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1014 switch (IntrinsicID) {
1015 case Intrinsic::amdgcn_if_break: {
1016 MachineBasicBlock *BB = I.getParent();
1017
1018 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1019 // SelectionDAG uses for wave32 vs wave64.
1020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1021 .add(I.getOperand(0))
1022 .add(I.getOperand(2))
1023 .add(I.getOperand(3));
1024
1025 Register DstReg = I.getOperand(0).getReg();
1026 Register Src0Reg = I.getOperand(2).getReg();
1027 Register Src1Reg = I.getOperand(3).getReg();
1028
1029 I.eraseFromParent();
1030
1031 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1032 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1033
1034 return true;
1035 }
1036 case Intrinsic::amdgcn_interp_p1_f16:
1037 return selectInterpP1F16(I);
1038 case Intrinsic::amdgcn_wqm:
1039 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1040 case Intrinsic::amdgcn_softwqm:
1041 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1042 case Intrinsic::amdgcn_strict_wwm:
1043 case Intrinsic::amdgcn_wwm:
1044 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1045 case Intrinsic::amdgcn_strict_wqm:
1046 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1047 case Intrinsic::amdgcn_writelane:
1048 return selectWritelane(I);
1049 case Intrinsic::amdgcn_div_scale:
1050 return selectDivScale(I);
1051 case Intrinsic::amdgcn_icmp:
1052 case Intrinsic::amdgcn_fcmp:
1053 if (selectImpl(I, *CoverageInfo))
1054 return true;
1055 return selectIntrinsicCmp(I);
1056 case Intrinsic::amdgcn_ballot:
1057 return selectBallot(I);
1058 case Intrinsic::amdgcn_reloc_constant:
1059 return selectRelocConstant(I);
1060 case Intrinsic::amdgcn_groupstaticsize:
1061 return selectGroupStaticSize(I);
1062 case Intrinsic::returnaddress:
1063 return selectReturnAddress(I);
1064 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1065 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1066 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1067 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1068 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1069 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1070 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1071 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1072 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1073 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1074 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1075 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1076 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1077 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1078 return selectSMFMACIntrin(I);
1079 default:
1080 return selectImpl(I, *CoverageInfo);
1081 }
1082}
1083
1084static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1085 const GCNSubtarget &ST) {
1086 if (Size != 16 && Size != 32 && Size != 64)
1087 return -1;
1088
1089 if (Size == 16 && !ST.has16BitInsts())
1090 return -1;
1091
1092 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1093 unsigned S64Opc) {
1094 if (Size == 16)
1095 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1096 if (Size == 32)
1097 return S32Opc;
1098 return S64Opc;
1099 };
1100
1101 switch (P) {
1102 default:
1103 llvm_unreachable("Unknown condition code!");
1104 case CmpInst::ICMP_NE:
1105 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1106 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1107 case CmpInst::ICMP_EQ:
1108 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1109 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1110 case CmpInst::ICMP_SGT:
1111 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1112 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1113 case CmpInst::ICMP_SGE:
1114 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1115 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1116 case CmpInst::ICMP_SLT:
1117 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1118 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1119 case CmpInst::ICMP_SLE:
1120 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1121 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1122 case CmpInst::ICMP_UGT:
1123 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1124 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1125 case CmpInst::ICMP_UGE:
1126 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1127 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1128 case CmpInst::ICMP_ULT:
1129 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1130 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1131 case CmpInst::ICMP_ULE:
1132 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1133 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1134
1135 case CmpInst::FCMP_OEQ:
1136 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1137 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1138 case CmpInst::FCMP_OGT:
1139 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1140 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1141 case CmpInst::FCMP_OGE:
1142 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1143 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1144 case CmpInst::FCMP_OLT:
1145 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1146 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1147 case CmpInst::FCMP_OLE:
1148 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1149 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1150 case CmpInst::FCMP_ONE:
1151 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1152 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1153 case CmpInst::FCMP_ORD:
1154 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1155 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1156 case CmpInst::FCMP_UNO:
1157 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1158 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1159 case CmpInst::FCMP_UEQ:
1160 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1161 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1162 case CmpInst::FCMP_UGT:
1163 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1164 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1165 case CmpInst::FCMP_UGE:
1166 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1167 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1168 case CmpInst::FCMP_ULT:
1169 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1170 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1171 case CmpInst::FCMP_ULE:
1172 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1173 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1174 case CmpInst::FCMP_UNE:
1175 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1176 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1177 case CmpInst::FCMP_TRUE:
1178 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1179 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1180 case CmpInst::FCMP_FALSE:
1181 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1182 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1183 }
1184}
1185
1186int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1187 unsigned Size) const {
1188 if (Size == 64) {
1189 if (!STI.hasScalarCompareEq64())
1190 return -1;
1191
1192 switch (P) {
1193 case CmpInst::ICMP_NE:
1194 return AMDGPU::S_CMP_LG_U64;
1195 case CmpInst::ICMP_EQ:
1196 return AMDGPU::S_CMP_EQ_U64;
1197 default:
1198 return -1;
1199 }
1200 }
1201
1202 if (Size == 32) {
1203 switch (P) {
1204 case CmpInst::ICMP_NE:
1205 return AMDGPU::S_CMP_LG_U32;
1206 case CmpInst::ICMP_EQ:
1207 return AMDGPU::S_CMP_EQ_U32;
1208 case CmpInst::ICMP_SGT:
1209 return AMDGPU::S_CMP_GT_I32;
1210 case CmpInst::ICMP_SGE:
1211 return AMDGPU::S_CMP_GE_I32;
1212 case CmpInst::ICMP_SLT:
1213 return AMDGPU::S_CMP_LT_I32;
1214 case CmpInst::ICMP_SLE:
1215 return AMDGPU::S_CMP_LE_I32;
1216 case CmpInst::ICMP_UGT:
1217 return AMDGPU::S_CMP_GT_U32;
1218 case CmpInst::ICMP_UGE:
1219 return AMDGPU::S_CMP_GE_U32;
1220 case CmpInst::ICMP_ULT:
1221 return AMDGPU::S_CMP_LT_U32;
1222 case CmpInst::ICMP_ULE:
1223 return AMDGPU::S_CMP_LE_U32;
1224 case CmpInst::FCMP_OEQ:
1225 return AMDGPU::S_CMP_EQ_F32;
1226 case CmpInst::FCMP_OGT:
1227 return AMDGPU::S_CMP_GT_F32;
1228 case CmpInst::FCMP_OGE:
1229 return AMDGPU::S_CMP_GE_F32;
1230 case CmpInst::FCMP_OLT:
1231 return AMDGPU::S_CMP_LT_F32;
1232 case CmpInst::FCMP_OLE:
1233 return AMDGPU::S_CMP_LE_F32;
1234 case CmpInst::FCMP_ONE:
1235 return AMDGPU::S_CMP_LG_F32;
1236 case CmpInst::FCMP_ORD:
1237 return AMDGPU::S_CMP_O_F32;
1238 case CmpInst::FCMP_UNO:
1239 return AMDGPU::S_CMP_U_F32;
1240 case CmpInst::FCMP_UEQ:
1241 return AMDGPU::S_CMP_NLG_F32;
1242 case CmpInst::FCMP_UGT:
1243 return AMDGPU::S_CMP_NLE_F32;
1244 case CmpInst::FCMP_UGE:
1245 return AMDGPU::S_CMP_NLT_F32;
1246 case CmpInst::FCMP_ULT:
1247 return AMDGPU::S_CMP_NGE_F32;
1248 case CmpInst::FCMP_ULE:
1249 return AMDGPU::S_CMP_NGT_F32;
1250 case CmpInst::FCMP_UNE:
1251 return AMDGPU::S_CMP_NEQ_F32;
1252 default:
1253 llvm_unreachable("Unknown condition code!");
1254 }
1255 }
1256
1257 if (Size == 16) {
1258 if (!STI.hasSALUFloatInsts())
1259 return -1;
1260
1261 switch (P) {
1262 case CmpInst::FCMP_OEQ:
1263 return AMDGPU::S_CMP_EQ_F16;
1264 case CmpInst::FCMP_OGT:
1265 return AMDGPU::S_CMP_GT_F16;
1266 case CmpInst::FCMP_OGE:
1267 return AMDGPU::S_CMP_GE_F16;
1268 case CmpInst::FCMP_OLT:
1269 return AMDGPU::S_CMP_LT_F16;
1270 case CmpInst::FCMP_OLE:
1271 return AMDGPU::S_CMP_LE_F16;
1272 case CmpInst::FCMP_ONE:
1273 return AMDGPU::S_CMP_LG_F16;
1274 case CmpInst::FCMP_ORD:
1275 return AMDGPU::S_CMP_O_F16;
1276 case CmpInst::FCMP_UNO:
1277 return AMDGPU::S_CMP_U_F16;
1278 case CmpInst::FCMP_UEQ:
1279 return AMDGPU::S_CMP_NLG_F16;
1280 case CmpInst::FCMP_UGT:
1281 return AMDGPU::S_CMP_NLE_F16;
1282 case CmpInst::FCMP_UGE:
1283 return AMDGPU::S_CMP_NLT_F16;
1284 case CmpInst::FCMP_ULT:
1285 return AMDGPU::S_CMP_NGE_F16;
1286 case CmpInst::FCMP_ULE:
1287 return AMDGPU::S_CMP_NGT_F16;
1288 case CmpInst::FCMP_UNE:
1289 return AMDGPU::S_CMP_NEQ_F16;
1290 default:
1291 llvm_unreachable("Unknown condition code!");
1292 }
1293 }
1294
1295 return -1;
1296}
1297
1298bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1299
1300 MachineBasicBlock *BB = I.getParent();
1301 const DebugLoc &DL = I.getDebugLoc();
1302
1303 Register SrcReg = I.getOperand(2).getReg();
1304 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1305
1306 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1307
1308 Register CCReg = I.getOperand(0).getReg();
1309 if (!isVCC(CCReg, *MRI)) {
1310 int Opcode = getS_CMPOpcode(Pred, Size);
1311 if (Opcode == -1)
1312 return false;
1313 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1314 .add(I.getOperand(2))
1315 .add(I.getOperand(3));
1316 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1317 .addReg(AMDGPU::SCC);
1318 bool Ret =
1319 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1320 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1321 I.eraseFromParent();
1322 return Ret;
1323 }
1324
1325 if (I.getOpcode() == AMDGPU::G_FCMP)
1326 return false;
1327
1328 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1329 if (Opcode == -1)
1330 return false;
1331
1332 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1333 I.getOperand(0).getReg())
1334 .add(I.getOperand(2))
1335 .add(I.getOperand(3));
1336 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1337 *TRI.getBoolRC(), *MRI);
1338 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1339 I.eraseFromParent();
1340 return Ret;
1341}
1342
1343bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1344 Register Dst = I.getOperand(0).getReg();
1345 if (isVCC(Dst, *MRI))
1346 return false;
1347
1348 LLT DstTy = MRI->getType(Dst);
1349 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1350 return false;
1351
1352 MachineBasicBlock *BB = I.getParent();
1353 const DebugLoc &DL = I.getDebugLoc();
1354 Register SrcReg = I.getOperand(2).getReg();
1355 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1356
1357 // i1 inputs are not supported in GlobalISel.
1358 if (Size == 1)
1359 return false;
1360
1361 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1362 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1363 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1364 I.eraseFromParent();
1365 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1366 }
1367
1368 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1369 if (Opcode == -1)
1370 return false;
1371
1372 MachineInstrBuilder SelectedMI;
1373 MachineOperand &LHS = I.getOperand(2);
1374 MachineOperand &RHS = I.getOperand(3);
1375 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1376 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1377 Register Src0Reg =
1378 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1379 Register Src1Reg =
1380 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1381 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1382 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1383 SelectedMI.addImm(Src0Mods);
1384 SelectedMI.addReg(Src0Reg);
1385 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1386 SelectedMI.addImm(Src1Mods);
1387 SelectedMI.addReg(Src1Reg);
1388 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1389 SelectedMI.addImm(0); // clamp
1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1391 SelectedMI.addImm(0); // op_sel
1392
1393 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1394 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1395 return false;
1396
1397 I.eraseFromParent();
1398 return true;
1399}
1400
1401bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1402 MachineBasicBlock *BB = I.getParent();
1403 const DebugLoc &DL = I.getDebugLoc();
1404 Register DstReg = I.getOperand(0).getReg();
1405 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1406 const bool Is64 = Size == 64;
1407 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1408
1409 // In the common case, the return type matches the wave size.
1410 // However, we also support emitting i64 ballots in wave32 mode.
1411 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1412 return false;
1413
1414 std::optional<ValueAndVReg> Arg =
1415 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1416
1417 const auto BuildCopy = [&](Register SrcReg) {
1418 if (Size == STI.getWavefrontSize()) {
1419 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1420 .addReg(SrcReg);
1421 return;
1422 }
1423
1424 // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
1425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1426 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1428 .addReg(SrcReg)
1429 .addImm(AMDGPU::sub0)
1430 .addReg(HiReg)
1431 .addImm(AMDGPU::sub1);
1432 };
1433
1434 if (Arg) {
1435 const int64_t Value = Arg->Value.getSExtValue();
1436 if (Value == 0) {
1437 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1438 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1439 } else if (Value == -1) // all ones
1440 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1441 else
1442 return false;
1443 } else
1444 BuildCopy(I.getOperand(2).getReg());
1445
1446 I.eraseFromParent();
1447 return true;
1448}
1449
1450bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1451 Register DstReg = I.getOperand(0).getReg();
1452 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1453 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1454 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1455 return false;
1456
1457 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1458
1459 Module *M = MF->getFunction().getParent();
1460 const MDNode *Metadata = I.getOperand(2).getMetadata();
1461 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1462 auto RelocSymbol = cast<GlobalVariable>(
1463 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1464
1465 MachineBasicBlock *BB = I.getParent();
1466 BuildMI(*BB, &I, I.getDebugLoc(),
1467 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1468 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1469
1470 I.eraseFromParent();
1471 return true;
1472}
1473
1474bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1475 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1476
1477 Register DstReg = I.getOperand(0).getReg();
1478 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1479 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1480 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1481
1482 MachineBasicBlock *MBB = I.getParent();
1483 const DebugLoc &DL = I.getDebugLoc();
1484
1485 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1486
1487 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1488 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1489 MIB.addImm(MFI->getLDSSize());
1490 } else {
1491 Module *M = MF->getFunction().getParent();
1492 const GlobalValue *GV
1493 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1494 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1495 }
1496
1497 I.eraseFromParent();
1498 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1499}
1500
1501bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1502 MachineBasicBlock *MBB = I.getParent();
1503 MachineFunction &MF = *MBB->getParent();
1504 const DebugLoc &DL = I.getDebugLoc();
1505
1506 MachineOperand &Dst = I.getOperand(0);
1507 Register DstReg = Dst.getReg();
1508 unsigned Depth = I.getOperand(2).getImm();
1509
1510 const TargetRegisterClass *RC
1511 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1512 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1513 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1514 return false;
1515
1516 // Check for kernel and shader functions
1517 if (Depth != 0 ||
1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1519 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1520 .addImm(0);
1521 I.eraseFromParent();
1522 return true;
1523 }
1524
1525 MachineFrameInfo &MFI = MF.getFrameInfo();
1526 // There is a call to @llvm.returnaddress in this function
1527 MFI.setReturnAddressIsTaken(true);
1528
1529 // Get the return address reg and mark it as an implicit live-in
1530 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1531 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1532 AMDGPU::SReg_64RegClass, DL);
1533 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1534 .addReg(LiveIn);
1535 I.eraseFromParent();
1536 return true;
1537}
1538
1539bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1540 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1541 // SelectionDAG uses for wave32 vs wave64.
1542 MachineBasicBlock *BB = MI.getParent();
1543 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1544 .add(MI.getOperand(1));
1545
1546 Register Reg = MI.getOperand(1).getReg();
1547 MI.eraseFromParent();
1548
1549 if (!MRI->getRegClassOrNull(Reg))
1550 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1551 return true;
1552}
1553
1554bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1555 MachineInstr &MI, Intrinsic::ID IntrID) const {
1556 MachineBasicBlock *MBB = MI.getParent();
1557 MachineFunction *MF = MBB->getParent();
1558 const DebugLoc &DL = MI.getDebugLoc();
1559
1560 unsigned IndexOperand = MI.getOperand(7).getImm();
1561 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1562 bool WaveDone = MI.getOperand(9).getImm() != 0;
1563
1564 if (WaveDone && !WaveRelease)
1565 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1566
1567 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1568 IndexOperand &= ~0x3f;
1569 unsigned CountDw = 0;
1570
1571 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1572 CountDw = (IndexOperand >> 24) & 0xf;
1573 IndexOperand &= ~(0xf << 24);
1574
1575 if (CountDw < 1 || CountDw > 4) {
1577 "ds_ordered_count: dword count must be between 1 and 4");
1578 }
1579 }
1580
1581 if (IndexOperand)
1582 report_fatal_error("ds_ordered_count: bad index operand");
1583
1584 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1585 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1586
1587 unsigned Offset0 = OrderedCountIndex << 2;
1588 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1589
1590 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1591 Offset1 |= (CountDw - 1) << 6;
1592
1593 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1594 Offset1 |= ShaderType << 2;
1595
1596 unsigned Offset = Offset0 | (Offset1 << 8);
1597
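  // Illustrative example (assuming a shader-type value of 0): ds_ordered_add
  // with index 0, wave_release = 1, wave_done = 0 on a pre-GFX10 target gives
  // Offset0 = 0 and Offset1 = 1, so the packed immediate is 1 << 8 = 0x100.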
1598 Register M0Val = MI.getOperand(2).getReg();
1599 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1600 .addReg(M0Val);
1601
1602 Register DstReg = MI.getOperand(0).getReg();
1603 Register ValReg = MI.getOperand(3).getReg();
1604 MachineInstrBuilder DS =
1605 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1606 .addReg(ValReg)
1607 .addImm(Offset)
1608 .cloneMemRefs(MI);
1609
1610 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1611 return false;
1612
1613 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1614 MI.eraseFromParent();
1615 return Ret;
1616}
1617
1618static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1619 switch (IntrID) {
1620 case Intrinsic::amdgcn_ds_gws_init:
1621 return AMDGPU::DS_GWS_INIT;
1622 case Intrinsic::amdgcn_ds_gws_barrier:
1623 return AMDGPU::DS_GWS_BARRIER;
1624 case Intrinsic::amdgcn_ds_gws_sema_v:
1625 return AMDGPU::DS_GWS_SEMA_V;
1626 case Intrinsic::amdgcn_ds_gws_sema_br:
1627 return AMDGPU::DS_GWS_SEMA_BR;
1628 case Intrinsic::amdgcn_ds_gws_sema_p:
1629 return AMDGPU::DS_GWS_SEMA_P;
1630 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1631 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1632 default:
1633 llvm_unreachable("not a gws intrinsic");
1634 }
1635}
1636
1637bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1638 Intrinsic::ID IID) const {
1639 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1640 !STI.hasGWSSemaReleaseAll()))
1641 return false;
1642
1643 // intrinsic ID, vsrc, offset
1644 const bool HasVSrc = MI.getNumOperands() == 3;
1645 assert(HasVSrc || MI.getNumOperands() == 2);
1646
1647 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1648 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1650 return false;
1651
1652 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1653 unsigned ImmOffset;
1654
1655 MachineBasicBlock *MBB = MI.getParent();
1656 const DebugLoc &DL = MI.getDebugLoc();
1657
1658 MachineInstr *Readfirstlane = nullptr;
1659
1660 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1661 // incoming offset, in case there's an add of a constant. We'll have to put it
1662 // back later.
1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1664 Readfirstlane = OffsetDef;
1665 BaseOffset = OffsetDef->getOperand(1).getReg();
1666 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1667 }
1668
1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1670 // If we have a constant offset, try to use the 0 in m0 as the base.
1671 // TODO: Look into changing the default m0 initialization value. If the
1672 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1673 // the immediate offset.
1674
1675 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1676 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1677 .addImm(0);
1678 } else {
1679 std::tie(BaseOffset, ImmOffset) =
1680 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1681
1682 if (Readfirstlane) {
1683 // We have the constant offset now, so put the readfirstlane back on the
1684 // variable component.
1685 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1686 return false;
1687
1688 Readfirstlane->getOperand(1).setReg(BaseOffset);
1689 BaseOffset = Readfirstlane->getOperand(0).getReg();
1690 } else {
1691 if (!RBI.constrainGenericRegister(BaseOffset,
1692 AMDGPU::SReg_32RegClass, *MRI))
1693 return false;
1694 }
1695
1696 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1697 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1698 .addReg(BaseOffset)
1699 .addImm(16)
1700 .setOperandDead(3); // Dead scc
1701
1702 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1703 .addReg(M0Base);
1704 }
1705
1706 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1707 // offset field) % 64. Some versions of the programming guide omit the m0
1708 // part, or claim it's from offset 0.
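 // For example, if the variable part of the offset is held in an SGPR B and the
 // folded constant part is 2, the code above sets m0 = B << 16 and uses 2 as the
 // instruction's offset field, so M0[21:16] carries (the low 6 bits of) B and the
 // hardware adds it to the offset field modulo 64.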
1709 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1710
1711 if (HasVSrc) {
1712 Register VSrc = MI.getOperand(1).getReg();
1713 MIB.addReg(VSrc);
1714
1715 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1716 return false;
1717 }
1718
1719 MIB.addImm(ImmOffset)
1720 .cloneMemRefs(MI);
1721
1722 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1723
1724 MI.eraseFromParent();
1725 return true;
1726}
1727
1728bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1729 bool IsAppend) const {
1730 Register PtrBase = MI.getOperand(2).getReg();
1731 LLT PtrTy = MRI->getType(PtrBase);
1732 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1733
1734 unsigned Offset;
1735 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1736
1737 // TODO: Should this try to look through readfirstlane like GWS?
1738 if (!isDSOffsetLegal(PtrBase, Offset)) {
1739 PtrBase = MI.getOperand(2).getReg();
1740 Offset = 0;
1741 }
1742
1743 MachineBasicBlock *MBB = MI.getParent();
1744 const DebugLoc &DL = MI.getDebugLoc();
1745 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1746
1747 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1748 .addReg(PtrBase);
1749 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1750 return false;
1751
1752 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1753 .addImm(Offset)
1754 .addImm(IsGDS ? -1 : 0)
1755 .cloneMemRefs(MI);
1756 MI.eraseFromParent();
1757 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1758}
1759
1760bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1761 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1762 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1763 if (WGSize <= STI.getWavefrontSize()) {
1764 MachineBasicBlock *MBB = MI.getParent();
1765 const DebugLoc &DL = MI.getDebugLoc();
1766 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1767 MI.eraseFromParent();
1768 return true;
1769 }
1770 }
1771
1772 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1773 if (STI.hasSplitBarriers()) {
1774 MachineBasicBlock *MBB = MI.getParent();
1775 const DebugLoc &DL = MI.getDebugLoc();
1776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1777 .addImm(AMDGPU::Barrier::WORKGROUP);
1778 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1779 .addImm(AMDGPU::Barrier::WORKGROUP);
1780 MI.eraseFromParent();
1781 return true;
1782 }
1783
1784 return selectImpl(MI, *CoverageInfo);
1785}
1786
1787static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1788 bool &IsTexFail) {
1789 if (TexFailCtrl)
1790 IsTexFail = true;
1791
1792 TFE = (TexFailCtrl & 0x1) ? true : false;
1793 TexFailCtrl &= ~(uint64_t)0x1;
1794 LWE = (TexFailCtrl & 0x2) ? true : false;
1795 TexFailCtrl &= ~(uint64_t)0x2;
1796
1797 return TexFailCtrl == 0;
1798}
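// For example, TexFailCtrl = 0x3 sets both TFE and LWE and returns true, while
// TexFailCtrl = 0x4 leaves an unknown bit set, so the function returns false and
// the caller rejects the intrinsic. Any nonzero input also marks IsTexFail.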
1799
1800bool AMDGPUInstructionSelector::selectImageIntrinsic(
1801 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1802 MachineBasicBlock *MBB = MI.getParent();
1803 const DebugLoc &DL = MI.getDebugLoc();
1804
1805 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1806 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1807
1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1809 unsigned IntrOpcode = Intr->BaseOpcode;
1810 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1811 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1812 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1813
1814 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1815
1816 Register VDataIn, VDataOut;
1817 LLT VDataTy;
1818 int NumVDataDwords = -1;
1819 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1820 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1821
1822 bool Unorm;
1823 if (!BaseOpcode->Sampler)
1824 Unorm = true;
1825 else
1826 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1827
1828 bool TFE;
1829 bool LWE;
1830 bool IsTexFail = false;
1831 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1832 TFE, LWE, IsTexFail))
1833 return false;
1834
1835 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1836 const bool IsA16 = (Flags & 1) != 0;
1837 const bool IsG16 = (Flags & 2) != 0;
1838
1839 // A16 implies 16 bit gradients if subtarget doesn't support G16
1840 if (IsA16 && !STI.hasG16() && !IsG16)
1841 return false;
1842
1843 unsigned DMask = 0;
1844 unsigned DMaskLanes = 0;
1845
1846 if (BaseOpcode->Atomic) {
1847 VDataOut = MI.getOperand(0).getReg();
1848 VDataIn = MI.getOperand(2).getReg();
1849 LLT Ty = MRI->getType(VDataIn);
1850
1851 // Be careful to allow atomic swap on 16-bit element vectors.
1852 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1853 Ty.getSizeInBits() == 128 :
1854 Ty.getSizeInBits() == 64;
1855
1856 if (BaseOpcode->AtomicX2) {
1857 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1858
1859 DMask = Is64Bit ? 0xf : 0x3;
1860 NumVDataDwords = Is64Bit ? 4 : 2;
1861 } else {
1862 DMask = Is64Bit ? 0x3 : 0x1;
1863 NumVDataDwords = Is64Bit ? 2 : 1;
1864 }
1865 } else {
1866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1868
1869 if (BaseOpcode->Store) {
1870 VDataIn = MI.getOperand(1).getReg();
1871 VDataTy = MRI->getType(VDataIn);
1872 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1873 } else if (BaseOpcode->NoReturn) {
1874 NumVDataDwords = 0;
1875 } else {
1876 VDataOut = MI.getOperand(0).getReg();
1877 VDataTy = MRI->getType(VDataOut);
1878 NumVDataDwords = DMaskLanes;
1879
1880 if (IsD16 && !STI.hasUnpackedD16VMem())
1881 NumVDataDwords = (DMaskLanes + 1) / 2;
1882 }
1883 }
1884
1885 // Set G16 opcode
1886 if (Subtarget->hasG16() && IsG16) {
1887 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1888 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1889 assert(G16MappingInfo);
1890 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1891 }
1892
1893 // TODO: Check this in verifier.
1894 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1895
1896 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1897 if (BaseOpcode->Atomic)
1898 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1899 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1900 AMDGPU::CPol::VOLATILE))
1901 return false;
1902
1903 int NumVAddrRegs = 0;
1904 int NumVAddrDwords = 0;
1905 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1906 // Skip the $noregs and 0s inserted during legalization.
1907 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1908 if (!AddrOp.isReg())
1909 continue; // XXX - Break?
1910
1911 Register Addr = AddrOp.getReg();
1912 if (!Addr)
1913 break;
1914
1915 ++NumVAddrRegs;
1916 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1917 }
1918
1919 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1920 // NSA, these should have been packed into a single value in the first
1921 // address register
1922 const bool UseNSA =
1923 NumVAddrRegs != 1 &&
1924 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1925 : NumVAddrDwords == NumVAddrRegs);
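 // For example, three separate 32-bit address registers (NumVAddrRegs == 3,
 // NumVAddrDwords == 3) select the NSA form, a single packed register never
 // does, and with partial NSA a wider trailing register (say NumVAddrRegs == 3,
 // NumVAddrDwords == 4) is still accepted.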
1926 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1927 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1928 return false;
1929 }
1930
1931 if (IsTexFail)
1932 ++NumVDataDwords;
1933
1934 int Opcode = -1;
1935 if (IsGFX12Plus) {
1936 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1937 NumVDataDwords, NumVAddrDwords);
1938 } else if (IsGFX11Plus) {
1939 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1940 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1941 : AMDGPU::MIMGEncGfx11Default,
1942 NumVDataDwords, NumVAddrDwords);
1943 } else if (IsGFX10Plus) {
1944 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1945 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1946 : AMDGPU::MIMGEncGfx10Default,
1947 NumVDataDwords, NumVAddrDwords);
1948 } else {
1949 if (Subtarget->hasGFX90AInsts()) {
1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1951 NumVDataDwords, NumVAddrDwords);
1952 if (Opcode == -1) {
1953 LLVM_DEBUG(
1954 dbgs()
1955 << "requested image instruction is not supported on this GPU\n");
1956 return false;
1957 }
1958 }
1959 if (Opcode == -1 &&
1960 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1961 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1962 NumVDataDwords, NumVAddrDwords);
1963 if (Opcode == -1)
1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1965 NumVDataDwords, NumVAddrDwords);
1966 }
1967 if (Opcode == -1)
1968 return false;
1969
1970 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1971 .cloneMemRefs(MI);
1972
1973 if (VDataOut) {
1974 if (BaseOpcode->AtomicX2) {
1975 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1976
1977 Register TmpReg = MRI->createVirtualRegister(
1978 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1979 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1980
1981 MIB.addDef(TmpReg);
1982 if (!MRI->use_empty(VDataOut)) {
1983 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1984 .addReg(TmpReg, RegState::Kill, SubReg);
1985 }
1986
1987 } else {
1988 MIB.addDef(VDataOut); // vdata output
1989 }
1990 }
1991
1992 if (VDataIn)
1993 MIB.addReg(VDataIn); // vdata input
1994
1995 for (int I = 0; I != NumVAddrRegs; ++I) {
1996 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1997 if (SrcOp.isReg()) {
1998 assert(SrcOp.getReg() != 0);
1999 MIB.addReg(SrcOp.getReg());
2000 }
2001 }
2002
2003 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2004 if (BaseOpcode->Sampler)
2005 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2006
2007 MIB.addImm(DMask); // dmask
2008
2009 if (IsGFX10Plus)
2010 MIB.addImm(DimInfo->Encoding);
2011 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2012 MIB.addImm(Unorm);
2013
2014 MIB.addImm(CPol);
2015 MIB.addImm(IsA16 && // a16 or r128
2016 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2017 if (IsGFX10Plus)
2018 MIB.addImm(IsA16 ? -1 : 0);
2019
2020 if (!Subtarget->hasGFX90AInsts()) {
2021 MIB.addImm(TFE); // tfe
2022 } else if (TFE) {
2023 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2024 return false;
2025 }
2026
2027 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2028 MIB.addImm(LWE); // lwe
2029 if (!IsGFX10Plus)
2030 MIB.addImm(DimInfo->DA ? -1 : 0);
2031 if (BaseOpcode->HasD16)
2032 MIB.addImm(IsD16 ? -1 : 0);
2033
2034 MI.eraseFromParent();
2035 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2036 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2037 return true;
2038}
2039
2040// We need to handle this here because tablegen doesn't support matching
2041// instructions with multiple outputs.
2042bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2043 MachineInstr &MI) const {
2044 Register Dst0 = MI.getOperand(0).getReg();
2045 Register Dst1 = MI.getOperand(1).getReg();
2046
2047 const DebugLoc &DL = MI.getDebugLoc();
2048 MachineBasicBlock *MBB = MI.getParent();
2049
2050 Register Addr = MI.getOperand(3).getReg();
2051 Register Data0 = MI.getOperand(4).getReg();
2052 Register Data1 = MI.getOperand(5).getReg();
2053 unsigned Offset = MI.getOperand(6).getImm();
2054
2055 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2056 .addDef(Dst1)
2057 .addUse(Addr)
2058 .addUse(Data0)
2059 .addUse(Data1)
2060 .addImm(Offset)
2061 .cloneMemRefs(MI);
2062
2063 MI.eraseFromParent();
2064 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2065}
2066
2067bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2068 MachineInstr &I) const {
2069 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2070 switch (IntrinsicID) {
2071 case Intrinsic::amdgcn_end_cf:
2072 return selectEndCfIntrinsic(I);
2073 case Intrinsic::amdgcn_ds_ordered_add:
2074 case Intrinsic::amdgcn_ds_ordered_swap:
2075 return selectDSOrderedIntrinsic(I, IntrinsicID);
2076 case Intrinsic::amdgcn_ds_gws_init:
2077 case Intrinsic::amdgcn_ds_gws_barrier:
2078 case Intrinsic::amdgcn_ds_gws_sema_v:
2079 case Intrinsic::amdgcn_ds_gws_sema_br:
2080 case Intrinsic::amdgcn_ds_gws_sema_p:
2081 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2082 return selectDSGWSIntrinsic(I, IntrinsicID);
2083 case Intrinsic::amdgcn_ds_append:
2084 return selectDSAppendConsume(I, true);
2085 case Intrinsic::amdgcn_ds_consume:
2086 return selectDSAppendConsume(I, false);
2087 case Intrinsic::amdgcn_s_barrier:
2088 return selectSBarrier(I);
2089 case Intrinsic::amdgcn_raw_buffer_load_lds:
2090 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2091 case Intrinsic::amdgcn_struct_buffer_load_lds:
2092 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2093 return selectBufferLoadLds(I);
2094 case Intrinsic::amdgcn_global_load_lds:
2095 return selectGlobalLoadLds(I);
2096 case Intrinsic::amdgcn_exp_compr:
2097 if (!STI.hasCompressedExport()) {
2098 Function &F = I.getMF()->getFunction();
2100 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2101 F.getContext().diagnose(NoFpRet);
2102 return false;
2103 }
2104 break;
2105 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2106 return selectDSBvhStackIntrinsic(I);
2107 case Intrinsic::amdgcn_s_barrier_init:
2108 case Intrinsic::amdgcn_s_barrier_join:
2109 case Intrinsic::amdgcn_s_wakeup_barrier:
2110 case Intrinsic::amdgcn_s_get_barrier_state:
2111 return selectNamedBarrierInst(I, IntrinsicID);
2112 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2113 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2114 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2115 case Intrinsic::amdgcn_s_barrier_leave:
2116 return selectSBarrierLeave(I);
2117 }
2118 return selectImpl(I, *CoverageInfo);
2119}
2120
2121bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2122 if (selectImpl(I, *CoverageInfo))
2123 return true;
2124
2125 MachineBasicBlock *BB = I.getParent();
2126 const DebugLoc &DL = I.getDebugLoc();
2127
2128 Register DstReg = I.getOperand(0).getReg();
2129 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2130 assert(Size <= 32 || Size == 64);
2131 const MachineOperand &CCOp = I.getOperand(1);
2132 Register CCReg = CCOp.getReg();
2133 if (!isVCC(CCReg, *MRI)) {
2134 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2135 AMDGPU::S_CSELECT_B32;
2136 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2137 .addReg(CCReg);
2138
2139 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2140 // bank, because it does not cover the register class that we use to represent
2141 // it. So we need to manually set the register class here.
2142 if (!MRI->getRegClassOrNull(CCReg))
2143 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2144 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2145 .add(I.getOperand(2))
2146 .add(I.getOperand(3));
2147
2148 bool Ret = false;
2149 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2150 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2151 I.eraseFromParent();
2152 return Ret;
2153 }
2154
2155 // Wide VGPR select should have been split in RegBankSelect.
2156 if (Size > 32)
2157 return false;
2158
2159 MachineInstr *Select =
2160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2161 .addImm(0)
2162 .add(I.getOperand(3))
2163 .addImm(0)
2164 .add(I.getOperand(2))
2165 .add(I.getOperand(1));
2166
2167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2168 I.eraseFromParent();
2169 return Ret;
2170}
2171
2172static int sizeToSubRegIndex(unsigned Size) {
2173 switch (Size) {
2174 case 32:
2175 return AMDGPU::sub0;
2176 case 64:
2177 return AMDGPU::sub0_sub1;
2178 case 96:
2179 return AMDGPU::sub0_sub1_sub2;
2180 case 128:
2181 return AMDGPU::sub0_sub1_sub2_sub3;
2182 case 256:
2183 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2184 default:
2185 if (Size < 32)
2186 return AMDGPU::sub0;
2187 if (Size > 256)
2188 return -1;
2189 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2190 }
2191}
2192
2193bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2194 Register DstReg = I.getOperand(0).getReg();
2195 Register SrcReg = I.getOperand(1).getReg();
2196 const LLT DstTy = MRI->getType(DstReg);
2197 const LLT SrcTy = MRI->getType(SrcReg);
2198 const LLT S1 = LLT::scalar(1);
2199
2200 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2201 const RegisterBank *DstRB;
2202 if (DstTy == S1) {
2203 // This is a special case. We don't treat s1 for legalization artifacts as
2204 // vcc booleans.
2205 DstRB = SrcRB;
2206 } else {
2207 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2208 if (SrcRB != DstRB)
2209 return false;
2210 }
2211
2212 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2213
2214 unsigned DstSize = DstTy.getSizeInBits();
2215 unsigned SrcSize = SrcTy.getSizeInBits();
2216
2217 const TargetRegisterClass *SrcRC =
2218 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2219 const TargetRegisterClass *DstRC =
2220 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2221 if (!SrcRC || !DstRC)
2222 return false;
2223
2224 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2225 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2226 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2227 return false;
2228 }
2229
2230 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2231 MachineBasicBlock *MBB = I.getParent();
2232 const DebugLoc &DL = I.getDebugLoc();
2233
2234 Register LoReg = MRI->createVirtualRegister(DstRC);
2235 Register HiReg = MRI->createVirtualRegister(DstRC);
2236 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2237 .addReg(SrcReg, 0, AMDGPU::sub0);
2238 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2239 .addReg(SrcReg, 0, AMDGPU::sub1);
2240
2241 if (IsVALU && STI.hasSDWA()) {
2242 // Write the low 16-bits of the high element into the high 16-bits of the
2243 // low element.
2244 MachineInstr *MovSDWA =
2245 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2246 .addImm(0) // $src0_modifiers
2247 .addReg(HiReg) // $src0
2248 .addImm(0) // $clamp
2249 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2250 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2251 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2252 .addReg(LoReg, RegState::Implicit);
2253 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2254 } else {
2255 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2256 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2257 Register ImmReg = MRI->createVirtualRegister(DstRC);
2258 if (IsVALU) {
2259 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2260 .addImm(16)
2261 .addReg(HiReg);
2262 } else {
2263 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2264 .addReg(HiReg)
2265 .addImm(16)
2266 .setOperandDead(3); // Dead scc
2267 }
2268
2269 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2270 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2271 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2272
2273 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2274 .addImm(0xffff);
2275 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2276 .addReg(LoReg)
2277 .addReg(ImmReg);
2278 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2279 .addReg(TmpReg0)
2280 .addReg(TmpReg1);
2281
2282 if (!IsVALU) {
2283 And.setOperandDead(3); // Dead scc
2284 Or.setOperandDead(3); // Dead scc
2285 }
2286 }
2287
2288 I.eraseFromParent();
2289 return true;
2290 }
2291
2292 if (!DstTy.isScalar())
2293 return false;
2294
2295 if (SrcSize > 32) {
2296 int SubRegIdx = sizeToSubRegIndex(DstSize);
2297 if (SubRegIdx == -1)
2298 return false;
2299
2300 // Deal with weird cases where the class only partially supports the subreg
2301 // index.
2302 const TargetRegisterClass *SrcWithSubRC
2303 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2304 if (!SrcWithSubRC)
2305 return false;
2306
2307 if (SrcWithSubRC != SrcRC) {
2308 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2309 return false;
2310 }
2311
2312 I.getOperand(1).setSubReg(SubRegIdx);
2313 }
2314
2315 I.setDesc(TII.get(TargetOpcode::COPY));
2316 return true;
2317}
2318
2319/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2320static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2321 Mask = maskTrailingOnes<unsigned>(Size);
2322 int SignedMask = static_cast<int>(Mask);
2323 return SignedMask >= -16 && SignedMask <= 64;
2324}
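// For example, Size = 4 gives Mask = 0xf (15, an inline immediate), so the
// callers below prefer a plain AND, while Size = 16 gives Mask = 0xffff (65535),
// which is not an inline immediate, so they fall back to a BFE.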
2325
2326// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2327const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2328 Register Reg, const MachineRegisterInfo &MRI,
2329 const TargetRegisterInfo &TRI) const {
2330 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2331 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2332 return RB;
2333
2334 // Ignore the type, since we don't use vcc in artifacts.
2335 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2336 return &RBI.getRegBankFromRegClass(*RC, LLT());
2337 return nullptr;
2338}
2339
2340bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2341 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2342 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2343 const DebugLoc &DL = I.getDebugLoc();
2344 MachineBasicBlock &MBB = *I.getParent();
2345 const Register DstReg = I.getOperand(0).getReg();
2346 const Register SrcReg = I.getOperand(1).getReg();
2347
2348 const LLT DstTy = MRI->getType(DstReg);
2349 const LLT SrcTy = MRI->getType(SrcReg);
2350 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2351 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2352 const unsigned DstSize = DstTy.getSizeInBits();
2353 if (!DstTy.isScalar())
2354 return false;
2355
2356 // Artifact casts should never use vcc.
2357 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2358
2359 // FIXME: This should probably be illegal and split earlier.
2360 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2361 if (DstSize <= 32)
2362 return selectCOPY(I);
2363
2364 const TargetRegisterClass *SrcRC =
2365 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2366 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2367 const TargetRegisterClass *DstRC =
2368 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2369
2370 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2371 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2372 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2373 .addReg(SrcReg)
2374 .addImm(AMDGPU::sub0)
2375 .addReg(UndefReg)
2376 .addImm(AMDGPU::sub1);
2377 I.eraseFromParent();
2378
2379 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2380 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2381 }
2382
2383 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2384 // 64-bit should have been split up in RegBankSelect
2385
2386 // Try to use an and with a mask if it will save code size.
2387 unsigned Mask;
2388 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2389 MachineInstr *ExtI =
2390 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2391 .addImm(Mask)
2392 .addReg(SrcReg);
2393 I.eraseFromParent();
2394 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2395 }
2396
2397 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2398 MachineInstr *ExtI =
2399 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2400 .addReg(SrcReg)
2401 .addImm(0) // Offset
2402 .addImm(SrcSize); // Width
2403 I.eraseFromParent();
2404 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2405 }
2406
2407 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2408 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2409 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2410 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2411 return false;
2412
2413 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2414 const unsigned SextOpc = SrcSize == 8 ?
2415 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2416 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2417 .addReg(SrcReg);
2418 I.eraseFromParent();
2419 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2420 }
2421
2422 // Using a single 32-bit SALU to calculate the high half is smaller than
2423 // S_BFE with a literal constant operand.
2424 if (DstSize > 32 && SrcSize == 32) {
2425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2426 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2427 if (Signed) {
2428 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2429 .addReg(SrcReg, 0, SubReg)
2430 .addImm(31)
2431 .setOperandDead(3); // Dead scc
2432 } else {
2433 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2434 .addImm(0);
2435 }
2436 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2437 .addReg(SrcReg, 0, SubReg)
2438 .addImm(AMDGPU::sub0)
2439 .addReg(HiReg)
2440 .addImm(AMDGPU::sub1);
2441 I.eraseFromParent();
2442 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2443 *MRI);
2444 }
2445
2446 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2447 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2448
2449 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
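 // For example, zero-extending from 8 bits uses the immediate 8 << 16 = 0x80000:
 // offset 0 in bits [5:0] and width 8 in bits [22:16].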
2450 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2451 // We need a 64-bit register source, but the high bits don't matter.
2452 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2453 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2454 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2455
2456 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2457 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2458 .addReg(SrcReg, 0, SubReg)
2459 .addImm(AMDGPU::sub0)
2460 .addReg(UndefReg)
2461 .addImm(AMDGPU::sub1);
2462
2463 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2464 .addReg(ExtReg)
2465 .addImm(SrcSize << 16);
2466
2467 I.eraseFromParent();
2468 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2469 }
2470
2471 unsigned Mask;
2472 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2473 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2474 .addReg(SrcReg)
2475 .addImm(Mask)
2476 .setOperandDead(3); // Dead scc
2477 } else {
2478 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2479 .addReg(SrcReg)
2480 .addImm(SrcSize << 16);
2481 }
2482
2483 I.eraseFromParent();
2484 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2485 }
2486
2487 return false;
2488}
2489
2490static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2491 Register &Out) {
2492 Register LShlSrc;
2493 if (mi_match(In, MRI,
2494 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2495 Out = LShlSrc;
2496 return true;
2497 }
2498 return false;
2499}
2500
2501bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2502 if (!Subtarget->hasSALUFloatInsts())
2503 return false;
2504
2505 Register Dst = I.getOperand(0).getReg();
2506 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2507 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2508 return false;
2509
2510 Register Src = I.getOperand(1).getReg();
2511
2512 if (MRI->getType(Dst) == LLT::scalar(32) &&
2513 MRI->getType(Src) == LLT::scalar(16)) {
2514 if (isExtractHiElt(*MRI, Src, Src)) {
2515 MachineBasicBlock *BB = I.getParent();
2516 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2517 .addUse(Src);
2518 I.eraseFromParent();
2519 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2520 }
2521 }
2522
2523 return false;
2524}
2525
2526bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2527 MachineBasicBlock *BB = I.getParent();
2528 MachineOperand &ImmOp = I.getOperand(1);
2529 Register DstReg = I.getOperand(0).getReg();
2530 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2531 bool IsFP = false;
2532
2533 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2534 if (ImmOp.isFPImm()) {
2535 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2536 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2537 IsFP = true;
2538 } else if (ImmOp.isCImm()) {
2539 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2540 } else {
2541 llvm_unreachable("Not supported by g_constants");
2542 }
2543
2544 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2545 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2546
2547 unsigned Opcode;
2548 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2549 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2550 } else if (Size == 64 &&
2551 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2552 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2553 I.setDesc(TII.get(Opcode));
2554 I.addImplicitDefUseOperands(*MF);
2555 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2556 } else {
2557 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2558
2559 // We should never produce s1 values on banks other than VCC. If the user of
2560 // this already constrained the register, we may incorrectly think it's VCC
2561 // if it wasn't originally.
2562 if (Size == 1)
2563 return false;
2564 }
2565
2566 if (Size != 64) {
2567 I.setDesc(TII.get(Opcode));
2568 I.addImplicitDefUseOperands(*MF);
2569 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2570 }
2571
2572 const DebugLoc &DL = I.getDebugLoc();
2573
2574 APInt Imm(Size, I.getOperand(1).getImm());
2575
2576 MachineInstr *ResInst;
2577 if (IsSgpr && TII.isInlineConstant(Imm)) {
2578 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2579 .addImm(I.getOperand(1).getImm());
2580 } else {
2581 const TargetRegisterClass *RC = IsSgpr ?
2582 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2583 Register LoReg = MRI->createVirtualRegister(RC);
2584 Register HiReg = MRI->createVirtualRegister(RC);
2585
2586 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2587 .addImm(Imm.trunc(32).getZExtValue());
2588
2589 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2590 .addImm(Imm.ashr(32).getZExtValue());
2591
2592 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2593 .addReg(LoReg)
2594 .addImm(AMDGPU::sub0)
2595 .addReg(HiReg)
2596 .addImm(AMDGPU::sub1);
2597 }
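 // For example, an SGPR G_CONSTANT of 0x123456789 (neither an inline constant
 // nor a valid 32-bit literal) reaches this path: the low half 0x23456789 and the
 // high half 0x1 are each materialized with S_MOV_B32 and recombined with a
 // REG_SEQUENCE.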
2598
2599 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2600 // work for target independent opcodes
2601 I.eraseFromParent();
2602 const TargetRegisterClass *DstRC =
2603 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2604 if (!DstRC)
2605 return true;
2606 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2607}
2608
2609bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2610 // Only manually handle the f64 SGPR case.
2611 //
2612 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2613 // the bit ops theoretically have a second result due to the implicit def of
2614 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2615 // that is easy by disabling the check. The result works, but uses a
2616 // nonsensical sreg32orlds_and_sreg_1 regclass.
2617 //
2618 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32
2619 // results to the variadic REG_SEQUENCE operands.
2620
2621 Register Dst = MI.getOperand(0).getReg();
2622 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2623 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2624 MRI->getType(Dst) != LLT::scalar(64))
2625 return false;
2626
2627 Register Src = MI.getOperand(1).getReg();
2628 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2629 if (Fabs)
2630 Src = Fabs->getOperand(1).getReg();
2631
2632 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2633 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2634 return false;
2635
2636 MachineBasicBlock *BB = MI.getParent();
2637 const DebugLoc &DL = MI.getDebugLoc();
2638 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2639 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2640 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2641 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2642
2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2644 .addReg(Src, 0, AMDGPU::sub0);
2645 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2646 .addReg(Src, 0, AMDGPU::sub1);
2647 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2648 .addImm(0x80000000);
2649
2650 // Set or toggle sign bit.
2651 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2652 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2653 .addReg(HiReg)
2654 .addReg(ConstReg)
2655 .setOperandDead(3); // Dead scc
2656 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2657 .addReg(LoReg)
2658 .addImm(AMDGPU::sub0)
2659 .addReg(OpReg)
2660 .addImm(AMDGPU::sub1);
2661 MI.eraseFromParent();
2662 return true;
2663}
2664
2665// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2666bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2667 Register Dst = MI.getOperand(0).getReg();
2668 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2669 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2670 MRI->getType(Dst) != LLT::scalar(64))
2671 return false;
2672
2673 Register Src = MI.getOperand(1).getReg();
2674 MachineBasicBlock *BB = MI.getParent();
2675 const DebugLoc &DL = MI.getDebugLoc();
2676 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2678 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2679 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2680
2681 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2682 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2683 return false;
2684
2685 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2686 .addReg(Src, 0, AMDGPU::sub0);
2687 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2688 .addReg(Src, 0, AMDGPU::sub1);
2689 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2690 .addImm(0x7fffffff);
2691
2692 // Clear sign bit.
2693 // TODO: Should this use S_BITSET0_*?
2694 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2695 .addReg(HiReg)
2696 .addReg(ConstReg)
2697 .setOperandDead(3); // Dead scc
2698 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2699 .addReg(LoReg)
2700 .addImm(AMDGPU::sub0)
2701 .addReg(OpReg)
2702 .addImm(AMDGPU::sub1);
2703
2704 MI.eraseFromParent();
2705 return true;
2706}
2707
2708static bool isConstant(const MachineInstr &MI) {
2709 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2710}
2711
2712void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2713 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2714
2715 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2716 const MachineInstr *PtrMI =
2717 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2718
2719 assert(PtrMI);
2720
2721 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2722 return;
2723
2724 GEPInfo GEPInfo;
2725
2726 for (unsigned i = 1; i != 3; ++i) {
2727 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2728 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2729 assert(OpDef);
2730 if (i == 2 && isConstant(*OpDef)) {
2731 // TODO: Could handle constant base + variable offset, but a combine
2732 // probably should have commuted it.
2733 assert(GEPInfo.Imm == 0);
2734 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2735 continue;
2736 }
2737 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2738 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2739 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2740 else
2741 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2742 }
2743
2744 AddrInfo.push_back(GEPInfo);
2745 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2746}
2747
2748bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2749 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2750}
2751
2752bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2753 if (!MI.hasOneMemOperand())
2754 return false;
2755
2756 const MachineMemOperand *MMO = *MI.memoperands_begin();
2757 const Value *Ptr = MMO->getValue();
2758
2759 // UndefValue means this is a load of a kernel input. These are uniform.
2760 // Sometimes LDS instructions have constant pointers.
2761 // If Ptr is null, then that means this mem operand contains a
2762 // PseudoSourceValue like GOT.
2763 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2764 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2765 return true;
2766
2767 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2768 return true;
2769
2770 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2771 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2772 AMDGPU::SGPRRegBankID;
2773
2774 const Instruction *I = dyn_cast<Instruction>(Ptr);
2775 return I && I->getMetadata("amdgpu.uniform");
2776}
2777
2778bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2779 for (const GEPInfo &GEPInfo : AddrInfo) {
2780 if (!GEPInfo.VgprParts.empty())
2781 return true;
2782 }
2783 return false;
2784}
2785
2786void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2787 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2788 unsigned AS = PtrTy.getAddressSpace();
2789 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2790 STI.ldsRequiresM0Init()) {
2791 MachineBasicBlock *BB = I.getParent();
2792
2793 // If DS instructions require M0 initialization, insert it before selecting.
2794 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2795 .addImm(-1);
2796 }
2797}
2798
2799bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2800 MachineInstr &I) const {
2801 initM0(I);
2802 return selectImpl(I, *CoverageInfo);
2803}
2804
2805static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2806 if (Reg.isPhysical())
2807 return false;
2808
2809 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2810 const unsigned Opcode = MI.getOpcode();
2811
2812 if (Opcode == AMDGPU::COPY)
2813 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2814
2815 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2816 Opcode == AMDGPU::G_XOR)
2817 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2818 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2819
2820 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2821 return GI->is(Intrinsic::amdgcn_class);
2822
2823 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2824}
2825
2826bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2827 MachineBasicBlock *BB = I.getParent();
2828 MachineOperand &CondOp = I.getOperand(0);
2829 Register CondReg = CondOp.getReg();
2830 const DebugLoc &DL = I.getDebugLoc();
2831
2832 unsigned BrOpcode;
2833 Register CondPhysReg;
2834 const TargetRegisterClass *ConstrainRC;
2835
2836 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2837 // whether the branch is uniform when selecting the instruction. In
2838 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2839 // RegBankSelect knows what it's doing if the branch condition is scc, even
2840 // though it currently does not.
2841 if (!isVCC(CondReg, *MRI)) {
2842 if (MRI->getType(CondReg) != LLT::scalar(32))
2843 return false;
2844
2845 CondPhysReg = AMDGPU::SCC;
2846 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2847 ConstrainRC = &AMDGPU::SReg_32RegClass;
2848 } else {
2849 // FIXME: Should scc->vcc copies be ANDed with exec?
2850
2851 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2852 // need to insert an and with exec.
2853 if (!isVCmpResult(CondReg, *MRI)) {
2854 const bool Is64 = STI.isWave64();
2855 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2856 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2857
2858 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2859 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2860 .addReg(CondReg)
2861 .addReg(Exec)
2862 .setOperandDead(3); // Dead scc
2863 CondReg = TmpReg;
2864 }
2865
2866 CondPhysReg = TRI.getVCC();
2867 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2868 ConstrainRC = TRI.getBoolRC();
2869 }
2870
2871 if (!MRI->getRegClassOrNull(CondReg))
2872 MRI->setRegClass(CondReg, ConstrainRC);
2873
2874 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2875 .addReg(CondReg);
2876 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2877 .addMBB(I.getOperand(1).getMBB());
2878
2879 I.eraseFromParent();
2880 return true;
2881}
2882
2883bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2884 MachineInstr &I) const {
2885 Register DstReg = I.getOperand(0).getReg();
2886 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2887 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2888 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2889 if (IsVGPR)
2890 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2891
2892 return RBI.constrainGenericRegister(
2893 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2894}
2895
2896bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2897 Register DstReg = I.getOperand(0).getReg();
2898 Register SrcReg = I.getOperand(1).getReg();
2899 Register MaskReg = I.getOperand(2).getReg();
2900 LLT Ty = MRI->getType(DstReg);
2901 LLT MaskTy = MRI->getType(MaskReg);
2902 MachineBasicBlock *BB = I.getParent();
2903 const DebugLoc &DL = I.getDebugLoc();
2904
2905 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2906 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2907 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2908 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2909 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2910 return false;
2911
2912 // Try to avoid emitting a bit operation when we only need to touch half of
2913 // the 64-bit pointer.
2914 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2915 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2916 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2917
2918 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2919 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
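 // For example, an alignment mask such as 0xffffffff'fffff000 has all high 32
 // bits known set, so CanCopyHi32 is true: the high half of the pointer is just
 // copied and only the low half needs the 32-bit AND emitted below.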
2920
2921 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2922 !CanCopyLow32 && !CanCopyHi32) {
2923 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2924 .addReg(SrcReg)
2925 .addReg(MaskReg)
2926 .setOperandDead(3); // Dead scc
2927 I.eraseFromParent();
2928 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2929 }
2930
2931 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2932 const TargetRegisterClass &RegRC
2933 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2934
2935 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2936 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2937 const TargetRegisterClass *MaskRC =
2938 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2939
2940 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2941 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2942 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2943 return false;
2944
2945 if (Ty.getSizeInBits() == 32) {
2946 assert(MaskTy.getSizeInBits() == 32 &&
2947 "ptrmask should have been narrowed during legalize");
2948
2949 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2950 .addReg(SrcReg)
2951 .addReg(MaskReg);
2952
2953 if (!IsVGPR)
2954 NewOp.setOperandDead(3); // Dead scc
2955 I.eraseFromParent();
2956 return true;
2957 }
2958
2959 Register HiReg = MRI->createVirtualRegister(&RegRC);
2960 Register LoReg = MRI->createVirtualRegister(&RegRC);
2961
2962 // Extract the subregisters from the source pointer.
2963 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2964 .addReg(SrcReg, 0, AMDGPU::sub0);
2965 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2966 .addReg(SrcReg, 0, AMDGPU::sub1);
2967
2968 Register MaskedLo, MaskedHi;
2969
2970 if (CanCopyLow32) {
2971 // If all the bits in the low half are 1, we only need a copy for it.
2972 MaskedLo = LoReg;
2973 } else {
2974 // Extract the mask subregister and apply the and.
2975 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2976 MaskedLo = MRI->createVirtualRegister(&RegRC);
2977
2978 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2979 .addReg(MaskReg, 0, AMDGPU::sub0);
2980 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2981 .addReg(LoReg)
2982 .addReg(MaskLo);
2983 }
2984
2985 if (CanCopyHi32) {
2986 // If all the bits in the high half are 1, we only need a copy for it.
2987 MaskedHi = HiReg;
2988 } else {
2989 Register MaskHi = MRI->createVirtualRegister(&RegRC);
2990 MaskedHi = MRI->createVirtualRegister(&RegRC);
2991
2992 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2993 .addReg(MaskReg, 0, AMDGPU::sub1);
2994 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2995 .addReg(HiReg)
2996 .addReg(MaskHi);
2997 }
2998
2999 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3000 .addReg(MaskedLo)
3001 .addImm(AMDGPU::sub0)
3002 .addReg(MaskedHi)
3003 .addImm(AMDGPU::sub1);
3004 I.eraseFromParent();
3005 return true;
3006}
3007
3008/// Return the register to use for the index value, and the subregister to use
3009/// for the indirectly accessed register.
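/// For example, indexing 32-bit elements of a 128-bit register with an index
/// whose known constant offset is 2 returns the stripped base index register
/// paired with AMDGPU::sub2.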
3010static std::pair<Register, unsigned>
3011computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3012 const TargetRegisterClass *SuperRC, Register IdxReg,
3013 unsigned EltSize, GISelKnownBits &KnownBits) {
3014 Register IdxBaseReg;
3015 int Offset;
3016
3017 std::tie(IdxBaseReg, Offset) =
3018 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3019 if (IdxBaseReg == AMDGPU::NoRegister) {
3020 // This will happen if the index is a known constant. This should ordinarily
3021 // be legalized out, but handle it as a register just in case.
3022 assert(Offset == 0);
3023 IdxBaseReg = IdxReg;
3024 }
3025
3026 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3027
3028 // Skip out of bounds offsets, or else we would end up using an undefined
3029 // register.
3030 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3031 return std::pair(IdxReg, SubRegs[0]);
3032 return std::pair(IdxBaseReg, SubRegs[Offset]);
3033}
3034
3035bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3036 MachineInstr &MI) const {
3037 Register DstReg = MI.getOperand(0).getReg();
3038 Register SrcReg = MI.getOperand(1).getReg();
3039 Register IdxReg = MI.getOperand(2).getReg();
3040
3041 LLT DstTy = MRI->getType(DstReg);
3042 LLT SrcTy = MRI->getType(SrcReg);
3043
3044 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3045 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3046 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3047
3048 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3049 // into a waterfall loop.
3050 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3051 return false;
3052
3053 const TargetRegisterClass *SrcRC =
3054 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3055 const TargetRegisterClass *DstRC =
3056 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3057 if (!SrcRC || !DstRC)
3058 return false;
3059 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3060 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3061 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3062 return false;
3063
3064 MachineBasicBlock *BB = MI.getParent();
3065 const DebugLoc &DL = MI.getDebugLoc();
3066 const bool Is64 = DstTy.getSizeInBits() == 64;
3067
3068 unsigned SubReg;
3069 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3070 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3071
3072 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3073 if (DstTy.getSizeInBits() != 32 && !Is64)
3074 return false;
3075
3076 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3077 .addReg(IdxReg);
3078
3079 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3080 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3081 .addReg(SrcReg, 0, SubReg)
3082 .addReg(SrcReg, RegState::Implicit);
3083 MI.eraseFromParent();
3084 return true;
3085 }
3086
3087 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3088 return false;
3089
3090 if (!STI.useVGPRIndexMode()) {
3091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3092 .addReg(IdxReg);
3093 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3094 .addReg(SrcReg, 0, SubReg)
3095 .addReg(SrcReg, RegState::Implicit);
3096 MI.eraseFromParent();
3097 return true;
3098 }
3099
3100 const MCInstrDesc &GPRIDXDesc =
3101 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3102 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3103 .addReg(SrcReg)
3104 .addReg(IdxReg)
3105 .addImm(SubReg);
3106
3107 MI.eraseFromParent();
3108 return true;
3109}
3110
3111// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3112bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3113 MachineInstr &MI) const {
3114 Register DstReg = MI.getOperand(0).getReg();
3115 Register VecReg = MI.getOperand(1).getReg();
3116 Register ValReg = MI.getOperand(2).getReg();
3117 Register IdxReg = MI.getOperand(3).getReg();
3118
3119 LLT VecTy = MRI->getType(DstReg);
3120 LLT ValTy = MRI->getType(ValReg);
3121 unsigned VecSize = VecTy.getSizeInBits();
3122 unsigned ValSize = ValTy.getSizeInBits();
3123
3124 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3125 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3126 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3127
3128 assert(VecTy.getElementType() == ValTy);
3129
3130 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3131 // into a waterfall loop.
3132 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3133 return false;
3134
3135 const TargetRegisterClass *VecRC =
3136 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3137 const TargetRegisterClass *ValRC =
3138 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3139
3140 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3141 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3142 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3143 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3144 return false;
3145
3146 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3147 return false;
3148
3149 unsigned SubReg;
3150 std::tie(IdxReg, SubReg) =
3151 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3152
3153 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3154 STI.useVGPRIndexMode();
3155
3156 MachineBasicBlock *BB = MI.getParent();
3157 const DebugLoc &DL = MI.getDebugLoc();
3158
3159 if (!IndexMode) {
3160 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3161 .addReg(IdxReg);
3162
3163 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3164 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3165 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3166 .addReg(VecReg)
3167 .addReg(ValReg)
3168 .addImm(SubReg);
3169 MI.eraseFromParent();
3170 return true;
3171 }
3172
3173 const MCInstrDesc &GPRIDXDesc =
3174 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3175 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3176 .addReg(VecReg)
3177 .addReg(ValReg)
3178 .addReg(IdxReg)
3179 .addImm(SubReg);
3180
3181 MI.eraseFromParent();
3182 return true;
3183}
3184
3185bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3187 unsigned Opc;
3188 unsigned Size = MI.getOperand(3).getImm();
3189
3190 // The struct intrinsic variants add one additional operand over raw.
3191 const bool HasVIndex = MI.getNumOperands() == 9;
3192 Register VIndex;
3193 int OpOffset = 0;
3194 if (HasVIndex) {
3195 VIndex = MI.getOperand(4).getReg();
3196 OpOffset = 1;
3197 }
3198
3199 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3200 std::optional<ValueAndVReg> MaybeVOffset =
3201 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3202 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3203
3204 switch (Size) {
3205 default:
3206 return false;
3207 case 1:
3208 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3209 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3210 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3211 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3212 break;
3213 case 2:
3214 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3215 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3216 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3217 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3218 break;
3219 case 4:
3220 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3221 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3222 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3223 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3224 break;
3225 }
3226
3227 MachineBasicBlock *MBB = MI.getParent();
3228 const DebugLoc &DL = MI.getDebugLoc();
3229 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3230 .add(MI.getOperand(2));
3231
3232 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3233
3234 if (HasVIndex && HasVOffset) {
3235 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3236 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3237 .addReg(VIndex)
3238 .addImm(AMDGPU::sub0)
3239 .addReg(VOffset)
3240 .addImm(AMDGPU::sub1);
3241
3242 MIB.addReg(IdxReg);
3243 } else if (HasVIndex) {
3244 MIB.addReg(VIndex);
3245 } else if (HasVOffset) {
3246 MIB.addReg(VOffset);
3247 }
3248
3249 MIB.add(MI.getOperand(1)); // rsrc
3250 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3251 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3252 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3253 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3254 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3255
3256 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3257 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3258 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3259 MachinePointerInfo StorePtrI = LoadPtrI;
3260 StorePtrI.V = nullptr;
3262
3263 auto F = LoadMMO->getFlags() &
3264 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3265 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3266 Size, LoadMMO->getBaseAlign());
3267
3268 MachineMemOperand *StoreMMO =
3269 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3270 sizeof(int32_t), LoadMMO->getBaseAlign());
3271
3272 MIB.setMemRefs({LoadMMO, StoreMMO});
3273
3274 MI.eraseFromParent();
3275 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3276}
3277
3278/// Match a zero extend from a 32-bit value to 64-bits.
3279static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3280 Register ZExtSrc;
3281 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3282 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3283
3284 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3285 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3286 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3287 return Register();
3288
3289 assert(Def->getNumOperands() == 3 &&
3290 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3291 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3292 return Def->getOperand(1).getReg();
3293 }
3294
3295 return Register();
3296}
3297
3298bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3299 unsigned Opc;
3300 unsigned Size = MI.getOperand(3).getImm();
3301
3302 switch (Size) {
3303 default:
3304 return false;
3305 case 1:
3306 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3307 break;
3308 case 2:
3309 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3310 break;
3311 case 4:
3312 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3313 break;
3314 }
3315
3316 MachineBasicBlock *MBB = MI.getParent();
3317 const DebugLoc &DL = MI.getDebugLoc();
3318 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3319 .add(MI.getOperand(2));
3320
3321 Register Addr = MI.getOperand(1).getReg();
3322 Register VOffset;
3323 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3324 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
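// For example, a G_PTR_ADD of an SGPR base with a zero-extended 32-bit value
// can still be selected as the SADDR form, using that 32-bit value as voffset.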
3325 if (!isSGPR(Addr)) {
3326 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3327 if (isSGPR(AddrDef->Reg)) {
3328 Addr = AddrDef->Reg;
3329 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3330 Register SAddr =
3331 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3332 if (isSGPR(SAddr)) {
3333 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3334 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3335 Addr = SAddr;
3336 VOffset = Off;
3337 }
3338 }
3339 }
3340 }
3341
3342 if (isSGPR(Addr)) {
3343 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3344 if (!VOffset) {
3345 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3346 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3347 .addImm(0);
3348 }
3349 }
3350
3351 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3352 .addReg(Addr);
3353
3354 if (isSGPR(Addr))
3355 MIB.addReg(VOffset);
3356
3357 MIB.add(MI.getOperand(4)) // offset
3358 .add(MI.getOperand(5)); // cpol
3359
3360 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3361 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3362 LoadPtrI.Offset = MI.getOperand(4).getImm();
3363 MachinePointerInfo StorePtrI = LoadPtrI;
3366 auto F = LoadMMO->getFlags() &
3367 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3368 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3369 Size, LoadMMO->getBaseAlign());
3370 MachineMemOperand *StoreMMO =
3371 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3372 sizeof(int32_t), Align(4));
3373
3374 MIB.setMemRefs({LoadMMO, StoreMMO});
3375
3376 MI.eraseFromParent();
3377 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3378}
3379
3380bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3381 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3382 MI.removeOperand(1);
3383 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3384 return true;
3385}
3386
3387bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3388 unsigned Opc;
3389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3390 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3391 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3392 break;
3393 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3394 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3395 break;
3396 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3397 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3398 break;
3399 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3400 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3401 break;
3402 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3403 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3404 break;
3405 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3406 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3407 break;
3408 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3409 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3410 break;
3411 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3412 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3413 break;
3414 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3415 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3416 break;
3417 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3418 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3419 break;
3420 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3421 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3422 break;
3423 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3424 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3425 break;
3426 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3427 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3428 break;
3429 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3430 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3431 break;
3432 default:
3433 llvm_unreachable("unhandled smfmac intrinsic");
3434 }
3435
3436 auto VDst_In = MI.getOperand(4);
3437
3438 MI.setDesc(TII.get(Opc));
3439 MI.removeOperand(4); // VDst_In
3440 MI.removeOperand(1); // Intrinsic ID
3441 MI.addOperand(VDst_In); // Readd VDst_In to the end
3442 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3443 return true;
3444}
3445
3446bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3447 Register DstReg = MI.getOperand(0).getReg();
3448 Register SrcReg = MI.getOperand(1).getReg();
3449 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3450 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3451 MachineBasicBlock *MBB = MI.getParent();
3452 const DebugLoc &DL = MI.getDebugLoc();
3453
3454 if (IsVALU) {
3455 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3456 .addImm(Subtarget->getWavefrontSizeLog2())
3457 .addReg(SrcReg);
3458 } else {
3459 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3460 .addReg(SrcReg)
3461 .addImm(Subtarget->getWavefrontSizeLog2())
3462 .setOperandDead(3); // Dead scc
3463 }
3464
3465 const TargetRegisterClass &RC =
3466 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3467 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3468 return false;
3469
3470 MI.eraseFromParent();
3471 return true;
3472}
3473
3474bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3475 Register SrcReg = MI.getOperand(0).getReg();
3476 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3477 return false;
3478
3479 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3480 Register SP =
3481 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3482 Register WaveAddr = getWaveAddress(DefMI);
3483 MachineBasicBlock *MBB = MI.getParent();
3484 const DebugLoc &DL = MI.getDebugLoc();
3485
3486 if (!WaveAddr) {
3487 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3488 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3489 .addReg(SrcReg)
3490 .addImm(Subtarget->getWavefrontSizeLog2())
3491 .setOperandDead(3); // Dead scc
3492 }
3493
3494 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3495 .addReg(WaveAddr);
3496
3497 MI.eraseFromParent();
3498 return true;
3499}
3500
3501bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3502
3503 if (!I.isPreISelOpcode()) {
3504 if (I.isCopy())
3505 return selectCOPY(I);
3506 return true;
3507 }
3508
3509 switch (I.getOpcode()) {
3510 case TargetOpcode::G_AND:
3511 case TargetOpcode::G_OR:
3512 case TargetOpcode::G_XOR:
3513 if (selectImpl(I, *CoverageInfo))
3514 return true;
3515 return selectG_AND_OR_XOR(I);
3516 case TargetOpcode::G_ADD:
3517 case TargetOpcode::G_SUB:
3518 case TargetOpcode::G_PTR_ADD:
3519 if (selectImpl(I, *CoverageInfo))
3520 return true;
3521 return selectG_ADD_SUB(I);
3522 case TargetOpcode::G_UADDO:
3523 case TargetOpcode::G_USUBO:
3524 case TargetOpcode::G_UADDE:
3525 case TargetOpcode::G_USUBE:
3526 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3527 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3528 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3529 return selectG_AMDGPU_MAD_64_32(I);
3530 case TargetOpcode::G_INTTOPTR:
3531 case TargetOpcode::G_BITCAST:
3532 case TargetOpcode::G_PTRTOINT:
3533 case TargetOpcode::G_FREEZE:
3534 return selectCOPY(I);
3535 case TargetOpcode::G_CONSTANT:
3536 case TargetOpcode::G_FCONSTANT:
3537 return selectG_CONSTANT(I);
3538 case TargetOpcode::G_FNEG:
3539 if (selectImpl(I, *CoverageInfo))
3540 return true;
3541 return selectG_FNEG(I);
3542 case TargetOpcode::G_FABS:
3543 if (selectImpl(I, *CoverageInfo))
3544 return true;
3545 return selectG_FABS(I);
3546 case TargetOpcode::G_EXTRACT:
3547 return selectG_EXTRACT(I);
3548 case TargetOpcode::G_MERGE_VALUES:
3549 case TargetOpcode::G_CONCAT_VECTORS:
3550 return selectG_MERGE_VALUES(I);
3551 case TargetOpcode::G_UNMERGE_VALUES:
3552 return selectG_UNMERGE_VALUES(I);
3553 case TargetOpcode::G_BUILD_VECTOR:
3554 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3555 return selectG_BUILD_VECTOR(I);
3556 case TargetOpcode::G_IMPLICIT_DEF:
3557 return selectG_IMPLICIT_DEF(I);
3558 case TargetOpcode::G_INSERT:
3559 return selectG_INSERT(I);
3560 case TargetOpcode::G_INTRINSIC:
3561 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3562 return selectG_INTRINSIC(I);
3563 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3564 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3565 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3566 case TargetOpcode::G_ICMP:
3567 case TargetOpcode::G_FCMP:
3568 if (selectG_ICMP_or_FCMP(I))
3569 return true;
3570 return selectImpl(I, *CoverageInfo);
3571 case TargetOpcode::G_LOAD:
3572 case TargetOpcode::G_STORE:
3573 case TargetOpcode::G_ATOMIC_CMPXCHG:
3574 case TargetOpcode::G_ATOMICRMW_XCHG:
3575 case TargetOpcode::G_ATOMICRMW_ADD:
3576 case TargetOpcode::G_ATOMICRMW_SUB:
3577 case TargetOpcode::G_ATOMICRMW_AND:
3578 case TargetOpcode::G_ATOMICRMW_OR:
3579 case TargetOpcode::G_ATOMICRMW_XOR:
3580 case TargetOpcode::G_ATOMICRMW_MIN:
3581 case TargetOpcode::G_ATOMICRMW_MAX:
3582 case TargetOpcode::G_ATOMICRMW_UMIN:
3583 case TargetOpcode::G_ATOMICRMW_UMAX:
3584 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3585 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3586 case TargetOpcode::G_ATOMICRMW_FADD:
3587 case TargetOpcode::G_ATOMICRMW_FMIN:
3588 case TargetOpcode::G_ATOMICRMW_FMAX:
3589 return selectG_LOAD_STORE_ATOMICRMW(I);
3590 case TargetOpcode::G_SELECT:
3591 return selectG_SELECT(I);
3592 case TargetOpcode::G_TRUNC:
3593 return selectG_TRUNC(I);
3594 case TargetOpcode::G_SEXT:
3595 case TargetOpcode::G_ZEXT:
3596 case TargetOpcode::G_ANYEXT:
3597 case TargetOpcode::G_SEXT_INREG:
3598 // This is a workaround. For extension from type i1, `selectImpl()` uses
3599 // patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
3600 // type i1 can only be held in an SGPR class.
3601 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3602 selectImpl(I, *CoverageInfo))
3603 return true;
3604 return selectG_SZA_EXT(I);
3605 case TargetOpcode::G_FPEXT:
3606 if (selectG_FPEXT(I))
3607 return true;
3608 return selectImpl(I, *CoverageInfo);
3609 case TargetOpcode::G_BRCOND:
3610 return selectG_BRCOND(I);
3611 case TargetOpcode::G_GLOBAL_VALUE:
3612 return selectG_GLOBAL_VALUE(I);
3613 case TargetOpcode::G_PTRMASK:
3614 return selectG_PTRMASK(I);
3615 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3616 return selectG_EXTRACT_VECTOR_ELT(I);
3617 case TargetOpcode::G_INSERT_VECTOR_ELT:
3618 return selectG_INSERT_VECTOR_ELT(I);
3619 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3620 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3621 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3622 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3623 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3624 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3625 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3626 assert(Intr && "not an image intrinsic with image pseudo");
3627 return selectImageIntrinsic(I, Intr);
3628 }
3629 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3630 return selectBVHIntrinsic(I);
3631 case AMDGPU::G_SBFX:
3632 case AMDGPU::G_UBFX:
3633 return selectG_SBFX_UBFX(I);
3634 case AMDGPU::G_SI_CALL:
3635 I.setDesc(TII.get(AMDGPU::SI_CALL));
3636 return true;
3637 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3638 return selectWaveAddress(I);
3639 case AMDGPU::G_STACKRESTORE:
3640 return selectStackRestore(I);
3641 case AMDGPU::G_PHI:
3642 return selectPHI(I);
3643 default:
3644 return selectImpl(I, *CoverageInfo);
3645 }
3646 return false;
3647}
3648
3650AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3651 return {{
3652 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3653 }};
3654
3655}
3656
3657std::pair<Register, unsigned>
3658AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3659 bool IsCanonicalizing,
3660 bool AllowAbs, bool OpSel) const {
3661 Register Src = Root.getReg();
3662 unsigned Mods = 0;
3663 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3664
3665 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3666 Src = MI->getOperand(1).getReg();
3667 Mods |= SISrcMods::NEG;
3668 MI = getDefIgnoringCopies(Src, *MRI);
3669 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3670 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3671 // denormal mode, but we're implicitly canonicalizing in a source operand.
3672 const ConstantFP *LHS =
3673 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3674 if (LHS && LHS->isZero()) {
3675 Mods |= SISrcMods::NEG;
3676 Src = MI->getOperand(2).getReg();
3677 }
3678 }
3679
3680 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3681 Src = MI->getOperand(1).getReg();
3682 Mods |= SISrcMods::ABS;
3683 }
3684
3685 if (OpSel)
3686 Mods |= SISrcMods::OP_SEL_0;
3687
3688 return std::pair(Src, Mods);
3689}
3690
3691Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3692 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3693 bool ForceVGPR) const {
3694 if ((Mods != 0 || ForceVGPR) &&
3695 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3696
3697 // If we looked through copies to find source modifiers on an SGPR operand,
3698 // we now have an SGPR register source. To avoid potentially violating the
3699 // constant bus restriction, we need to insert a copy to a VGPR.
3700 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3701 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3702 TII.get(AMDGPU::COPY), VGPRSrc)
3703 .addReg(Src);
3704 Src = VGPRSrc;
3705 }
3706
3707 return Src;
3708}
3709
3710///
3711/// This will select either an SGPR or VGPR operand and will save us from
3712/// having to write an extra tablegen pattern.
3714AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3715 return {{
3716 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3717 }};
3718}
3719
3721AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3722 Register Src;
3723 unsigned Mods;
3724 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3725
3726 return {{
3727 [=](MachineInstrBuilder &MIB) {
3728 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3729 },
3730 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3731 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3732 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3733 }};
3734}
3735
3737AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3738 Register Src;
3739 unsigned Mods;
3740 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3741 /*IsCanonicalizing=*/true,
3742 /*AllowAbs=*/false);
3743
3744 return {{
3745 [=](MachineInstrBuilder &MIB) {
3746 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3747 },
3748 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3749 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3750 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3751 }};
3752}
3753
3755AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3756 return {{
3757 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3758 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3759 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3760 }};
3761}
3762
3764AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3765 Register Src;
3766 unsigned Mods;
3767 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3768
3769 return {{
3770 [=](MachineInstrBuilder &MIB) {
3771 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3772 },
3773 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3774 }};
3775}
3776
3778AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3779 MachineOperand &Root) const {
3780 Register Src;
3781 unsigned Mods;
3782 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3783
3784 return {{
3785 [=](MachineInstrBuilder &MIB) {
3786 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3787 },
3788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3789 }};
3790}
3791
3793AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3794 Register Src;
3795 unsigned Mods;
3796 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3797 /*AllowAbs=*/false);
3798
3799 return {{
3800 [=](MachineInstrBuilder &MIB) {
3801 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3802 },
3803 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3804 }};
3805}
3806
3808AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3809 Register Reg = Root.getReg();
3810 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3811 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3812 return {};
3813 return {{
3814 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3815 }};
3816}
3817
3818std::pair<Register, unsigned>
3819AMDGPUInstructionSelector::selectVOP3PModsImpl(
3820 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3821 unsigned Mods = 0;
3822 MachineInstr *MI = MRI.getVRegDef(Src);
3823
3824 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3825 // It's possible to see an f32 fneg here, but unlikely.
3826 // TODO: Treat f32 fneg as only high bit.
3827 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3828 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3829 Src = MI->getOperand(1).getReg();
3830 MI = MRI.getVRegDef(Src);
3831 }
3832
3833 // TODO: Handle G_FSUB 0 as fneg
3834
3835 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3836 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3837
3838 // Packed instructions do not have abs modifiers.
3839 Mods |= SISrcMods::OP_SEL_1;
3840
3841 return std::pair(Src, Mods);
3842}
3843
3845AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3846 MachineRegisterInfo &MRI
3847 = Root.getParent()->getParent()->getParent()->getRegInfo();
3848
3849 Register Src;
3850 unsigned Mods;
3851 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3852
3853 return {{
3854 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3855 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3856 }};
3857}
3858
3860AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3861 MachineRegisterInfo &MRI
3862 = Root.getParent()->getParent()->getParent()->getRegInfo();
3863
3864 Register Src;
3865 unsigned Mods;
3866 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3867
3868 return {{
3869 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3871 }};
3872}
3873
3875AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3876 // A literal i1 value set in the intrinsic represents the SrcMods of the next
3877 // operand. The value arrives in the Imm operand as an i1 sign-extended to
3878 // int64_t: 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
3879 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3880 "expected i1 value");
3881 unsigned Mods = SISrcMods::OP_SEL_1;
3882 if (Root.getImm() == -1)
3883 Mods ^= SISrcMods::NEG;
3884 return {{
3885 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3886 }};
3887}
3888
3890AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3891 MachineOperand &Root) const {
3892 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3893 "expected i1 value");
3894 unsigned Mods = SISrcMods::OP_SEL_1;
3895 if (Root.getImm() != 0)
3896 Mods |= SISrcMods::OP_SEL_0;
3897
3898 return {{
3899 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3900 }};
3901}
3902
3903static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3904 MachineInstr *InsertPt,
3905 MachineRegisterInfo &MRI) {
3906 const TargetRegisterClass *DstRegClass;
3907 switch (Elts.size()) {
3908 case 8:
3909 DstRegClass = &AMDGPU::VReg_256RegClass;
3910 break;
3911 case 4:
3912 DstRegClass = &AMDGPU::VReg_128RegClass;
3913 break;
3914 case 2:
3915 DstRegClass = &AMDGPU::VReg_64RegClass;
3916 break;
3917 default:
3918 llvm_unreachable("unhandled Reg sequence size");
3919 }
3920
3921 MachineIRBuilder B(*InsertPt);
3922 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3923 .addDef(MRI.createVirtualRegister(DstRegClass));
3924 for (unsigned i = 0; i < Elts.size(); ++i) {
3925 MIB.addReg(Elts[i]);
3926 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3927 }
3928 return MIB->getOperand(0).getReg();
3929}
3930
3931static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3932 SmallVectorImpl<Register> &Elts, Register &Src,
3933 MachineInstr *InsertPt,
3934 MachineRegisterInfo &MRI) {
3935 if (ModOpcode == TargetOpcode::G_FNEG) {
3936 Mods |= SISrcMods::NEG;
3937 // Check if all elements also have abs modifier
3938 SmallVector<Register, 8> NegAbsElts;
3939 for (auto El : Elts) {
3940 Register FabsSrc;
3941 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3942 break;
3943 NegAbsElts.push_back(FabsSrc);
3944 }
3945 if (Elts.size() != NegAbsElts.size()) {
3946 // Neg
3947 Src = buildRegSequence(Elts, InsertPt, MRI);
3948 } else {
3949 // Neg and Abs
3950 Mods |= SISrcMods::NEG_HI;
3951 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3952 }
3953 } else {
3954 assert(ModOpcode == TargetOpcode::G_FABS);
3955 // Abs
3956 Mods |= SISrcMods::NEG_HI;
3957 Src = buildRegSequence(Elts, InsertPt, MRI);
3958 }
3959}
3960
3962AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3963 Register Src = Root.getReg();
3964 unsigned Mods = SISrcMods::OP_SEL_1;
3965 SmallVector<Register, 8> EltsF32;
3966
3967 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3968 assert(BV->getNumSources() > 0);
3969 // Based on first element decide which mod we match, neg or abs
3970 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3971 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3972 ? AMDGPU::G_FNEG
3973 : AMDGPU::G_FABS;
3974 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3975 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3976 if (ElF32->getOpcode() != ModOpcode)
3977 break;
3978 EltsF32.push_back(ElF32->getOperand(1).getReg());
3979 }
3980
3981 // All elements had ModOpcode modifier
3982 if (BV->getNumSources() == EltsF32.size()) {
3983 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
3984 *MRI);
3985 }
3986 }
3987
3988 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3989 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
3990}
3991
3993AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
3994 Register Src = Root.getReg();
3995 unsigned Mods = SISrcMods::OP_SEL_1;
3996 SmallVector<Register, 8> EltsV2F16;
3997
3998 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
3999 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4000 Register FNegSrc;
4001 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4002 break;
4003 EltsV2F16.push_back(FNegSrc);
4004 }
4005
4006 // All elements had ModOpcode modifier
4007 if (CV->getNumSources() == EltsV2F16.size()) {
4008 Mods |= SISrcMods::NEG;
4009 Mods |= SISrcMods::NEG_HI;
4010 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4011 }
4012 }
4013
4014 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4015 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4016}
4017
4019AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4020 Register Src = Root.getReg();
4021 unsigned Mods = SISrcMods::OP_SEL_1;
4022 SmallVector<Register, 8> EltsV2F16;
4023
4024 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4025 assert(CV->getNumSources() > 0);
4026 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4027 // Based on first element decide which mod we match, neg or abs
4028 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4029 ? AMDGPU::G_FNEG
4030 : AMDGPU::G_FABS;
4031
4032 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4033 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4034 if (ElV2F16->getOpcode() != ModOpcode)
4035 break;
4036 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4037 }
4038
4039 // All elements had ModOpcode modifier
4040 if (CV->getNumSources() == EltsV2F16.size()) {
4041 MachineIRBuilder B(*Root.getParent());
4042 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4043 *MRI);
4044 }
4045 }
4046
4047 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4049}
4050
4052AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4053 std::optional<FPValueAndVReg> FPValReg;
4054 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4055 if (TII.isInlineConstant(FPValReg->Value)) {
4056 return {{[=](MachineInstrBuilder &MIB) {
4057 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4058 }}};
4059 }
4060 // Non-inlineable splat floats should not fall through to the integer
4061 // immediate checks.
4062 return {};
4063 }
4064
4065 APInt ICst;
4066 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4067 if (TII.isInlineConstant(ICst)) {
4068 return {
4069 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4070 }
4071 }
4072
4073 return {};
4074}
4075
4077AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4078 Register Src =
4079 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4080 unsigned Key = 0;
4081
4082 Register ShiftSrc;
4083 std::optional<ValueAndVReg> ShiftAmt;
4084 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4085 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4086 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4087 Key = ShiftAmt->Value.getZExtValue() / 8;
4088 Src = ShiftSrc;
4089 }
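// For example, a source defined as (lshr %x, 16) selects byte 2 of %x, so %x
// is used directly with index_key = 2.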
4090
4091 return {{
4092 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4093 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4094 }};
4095}
4096
4098AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4099
4100 Register Src =
4101 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4102 unsigned Key = 0;
4103
4104 Register ShiftSrc;
4105 std::optional<ValueAndVReg> ShiftAmt;
4106 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4107 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4108 ShiftAmt->Value.getZExtValue() == 16) {
4109 Src = ShiftSrc;
4110 Key = 1;
4111 }
4112
4113 return {{
4114 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4115 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4116 }};
4117}
4118
4120AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4121 Register Src;
4122 unsigned Mods;
4123 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4124
4125 // FIXME: Handle op_sel
4126 return {{
4127 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4128 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4129 }};
4130}
4131
4133AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4134 Register Src;
4135 unsigned Mods;
4136 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4137 /*IsCanonicalizing=*/true,
4138 /*AllowAbs=*/false,
4139 /*OpSel=*/false);
4140
4141 return {{
4142 [=](MachineInstrBuilder &MIB) {
4143 MIB.addReg(
4144 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4145 },
4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4147 }};
4148}
4149
4151AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4152 Register Src;
4153 unsigned Mods;
4154 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4155 /*IsCanonicalizing=*/true,
4156 /*AllowAbs=*/false,
4157 /*OpSel=*/true);
4158
4159 return {{
4160 [=](MachineInstrBuilder &MIB) {
4161 MIB.addReg(
4162 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4163 },
4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4165 }};
4166}
4167
4168bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4169 Register &Base,
4170 Register *SOffset,
4171 int64_t *Offset) const {
4172 MachineInstr *MI = Root.getParent();
4173 MachineBasicBlock *MBB = MI->getParent();
4174
4175 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4176 // then we can select all ptr + 32-bit offsets.
4177 SmallVector<GEPInfo, 4> AddrInfo;
4178 getAddrModeInfo(*MI, *MRI, AddrInfo);
4179
4180 if (AddrInfo.empty())
4181 return false;
4182
4183 const GEPInfo &GEPI = AddrInfo[0];
4184 std::optional<int64_t> EncodedImm;
4185
4186 if (SOffset && Offset) {
4187 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4188 /*HasSOffset=*/true);
4189 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4190 AddrInfo.size() > 1) {
4191 const GEPInfo &GEPI2 = AddrInfo[1];
4192 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4193 if (Register OffsetReg =
4194 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4195 Base = GEPI2.SgprParts[0];
4196 *SOffset = OffsetReg;
4197 *Offset = *EncodedImm;
4198 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4199 return true;
4200
4201 // For unbuffered smem loads, it is illegal for the Immediate Offset
4202 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4203 // is negative. Handle the case where the Immediate Offset + SOffset
4204 // is negative.
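// That is, give up on this fold if the encoded immediate is negative and even
// the smallest value SOffset can hold cannot bring the sum back to >= 0.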
4205 auto SKnown = KB->getKnownBits(*SOffset);
4206 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4207 return false;
4208
4209 return true;
4210 }
4211 }
4212 }
4213 return false;
4214 }
4215
4216 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4217 /*HasSOffset=*/false);
4218 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4219 Base = GEPI.SgprParts[0];
4220 *Offset = *EncodedImm;
4221 return true;
4222 }
4223
4224 // SGPR offset is unsigned.
4225 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4226 GEPI.Imm != 0) {
4227 // If we make it this far we have a load with a 32-bit immediate offset.
4228 // It is OK to select this using an SGPR offset, because we have already
4229 // failed trying to select this load into one of the _IMM variants since
4230 // the _IMM patterns are considered before the _SGPR patterns.
4231 Base = GEPI.SgprParts[0];
4232 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4233 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4234 .addImm(GEPI.Imm);
4235 return true;
4236 }
4237
4238 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4239 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4240 Base = GEPI.SgprParts[0];
4241 *SOffset = OffsetReg;
4242 return true;
4243 }
4244 }
4245
4246 return false;
4247}
4248
4250AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4251 Register Base;
4252 int64_t Offset;
4253 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4254 return std::nullopt;
4255
4256 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4258}
4259
4261AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4262 SmallVector<GEPInfo, 4> AddrInfo;
4263 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4264
4265 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4266 return std::nullopt;
4267
4268 const GEPInfo &GEPInfo = AddrInfo[0];
4269 Register PtrReg = GEPInfo.SgprParts[0];
4270 std::optional<int64_t> EncodedImm =
4271 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4272 if (!EncodedImm)
4273 return std::nullopt;
4274
4275 return {{
4276 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4277 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4278 }};
4279}
4280
4282AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4283 Register Base, SOffset;
4284 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4285 return std::nullopt;
4286
4287 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4288 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4289}
4290
4292AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4293 Register Base, SOffset;
4294 int64_t Offset;
4295 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4296 return std::nullopt;
4297
4298 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4299 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4300 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4301}
4302
4303std::pair<Register, int>
4304AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4305 uint64_t FlatVariant) const {
4306 MachineInstr *MI = Root.getParent();
4307
4308 auto Default = std::pair(Root.getReg(), 0);
4309
4310 if (!STI.hasFlatInstOffsets())
4311 return Default;
4312
4313 Register PtrBase;
4314 int64_t ConstOffset;
4315 std::tie(PtrBase, ConstOffset) =
4316 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4317
4318 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4319 !isFlatScratchBaseLegal(Root.getReg())))
4320 return Default;
4321
4322 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4323 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4324 return Default;
4325
4326 return std::pair(PtrBase, ConstOffset);
4327}
4328
4330AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4331 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4332
4333 return {{
4334 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4335 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4336 }};
4337}
4338
4340AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4341 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4342
4343 return {{
4344 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4346 }};
4347}
4348
4350AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4351 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4352
4353 return {{
4354 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4355 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4356 }};
4357}
4358
4359// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4361AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4362 Register Addr = Root.getReg();
4363 Register PtrBase;
4364 int64_t ConstOffset;
4365 int64_t ImmOffset = 0;
4366
4367 // Match the immediate offset first, which canonically is moved as low as
4368 // possible.
4369 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4370
4371 if (ConstOffset != 0) {
4372 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4373 SIInstrFlags::FlatGlobal)) {
4374 Addr = PtrBase;
4375 ImmOffset = ConstOffset;
4376 } else {
4377 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4378 if (isSGPR(PtrBaseDef->Reg)) {
4379 if (ConstOffset > 0) {
4380 // Offset is too large.
4381 //
4382 // saddr + large_offset -> saddr +
4383 // (voffset = large_offset & ~MaxOffset) +
4384 // (large_offset & MaxOffset);
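// For illustration, assuming a 13-bit maximum immediate of 8191: a constant
// offset of 10000 would split into voffset = 8192 and an immediate of 1808.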
4385 int64_t SplitImmOffset, RemainderOffset;
4386 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4387 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4388
4389 if (isUInt<32>(RemainderOffset)) {
4390 MachineInstr *MI = Root.getParent();
4391 MachineBasicBlock *MBB = MI->getParent();
4392 Register HighBits =
4393 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4394
4395 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4396 HighBits)
4397 .addImm(RemainderOffset);
4398
4399 return {{
4400 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4401 [=](MachineInstrBuilder &MIB) {
4402 MIB.addReg(HighBits);
4403 }, // voffset
4404 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4405 }};
4406 }
4407 }
4408
4409 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4410 // is 1 we would need to perform 1 or 2 extra moves for each half of
4411 // the constant, and it is better to do a scalar add and then issue a
4412 // single VALU instruction to materialize zero. Otherwise it takes fewer
4413 // instructions to perform VALU adds with immediates or inline literals.
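// (NumLiterals below counts how many 32-bit halves of the constant cannot be
// encoded as inline constants and would therefore need a literal.)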
4414 unsigned NumLiterals =
4415 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4416 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4417 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4418 return std::nullopt;
4419 }
4420 }
4421 }
4422
4423 // Match the variable offset.
4424 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4425 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4426 // Look through the SGPR->VGPR copy.
4427 Register SAddr =
4428 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4429
4430 if (isSGPR(SAddr)) {
4431 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4432
4433 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4434 // inserted later.
4435 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4436 return {{[=](MachineInstrBuilder &MIB) { // saddr
4437 MIB.addReg(SAddr);
4438 },
4439 [=](MachineInstrBuilder &MIB) { // voffset
4440 MIB.addReg(VOffset);
4441 },
4442 [=](MachineInstrBuilder &MIB) { // offset
4443 MIB.addImm(ImmOffset);
4444 }}};
4445 }
4446 }
4447 }
4448
4449 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4450 // drop this.
4451 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4452 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4453 return std::nullopt;
4454
4455 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4456 // moves required to copy a 64-bit SGPR to VGPR.
4457 MachineInstr *MI = Root.getParent();
4458 MachineBasicBlock *MBB = MI->getParent();
4459 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4460
4461 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4462 .addImm(0);
4463
4464 return {{
4465 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4466 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4467 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4468 }};
4469}
4470
4472AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4473 Register Addr = Root.getReg();
4474 Register PtrBase;
4475 int64_t ConstOffset;
4476 int64_t ImmOffset = 0;
4477
4478 // Match the immediate offset first, which canonically is moved as low as
4479 // possible.
4480 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4481
4482 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4483 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4484 SIInstrFlags::FlatScratch)) {
4485 Addr = PtrBase;
4486 ImmOffset = ConstOffset;
4487 }
4488
4489 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4490 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4491 int FI = AddrDef->MI->getOperand(1).getIndex();
4492 return {{
4493 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4494 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4495 }};
4496 }
4497
4498 Register SAddr = AddrDef->Reg;
4499
4500 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4501 Register LHS = AddrDef->MI->getOperand(1).getReg();
4502 Register RHS = AddrDef->MI->getOperand(2).getReg();
4503 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4504 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4505
4506 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4507 isSGPR(RHSDef->Reg)) {
4508 int FI = LHSDef->MI->getOperand(1).getIndex();
4509 MachineInstr &I = *Root.getParent();
4510 MachineBasicBlock *BB = I.getParent();
4511 const DebugLoc &DL = I.getDebugLoc();
4512 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4513
4514 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4515 .addFrameIndex(FI)
4516 .addReg(RHSDef->Reg)
4517 .setOperandDead(3); // Dead scc
4518 }
4519 }
4520
4521 if (!isSGPR(SAddr))
4522 return std::nullopt;
4523
4524 return {{
4525 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4526 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4527 }};
4528}
4529
4530// Check whether the flat scratch SVS swizzle bug affects this access.
4531bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4532 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4533 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4534 return false;
4535
4536 // The bug affects the swizzling of SVS accesses if there is any carry out
4537 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4538 // voffset to (soffset + inst_offset).
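// For example, if voffset may have both low bits set (3) and soffset +
// inst_offset may have bit 1 set (2), then 3 + 2 >= 4, so a carry into bit 2
// is possible and the SVS form must be avoided.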
4539 auto VKnown = KB->getKnownBits(VAddr);
4540 auto SKnown = KnownBits::computeForAddSub(
4541 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4542 KnownBits::makeConstant(APInt(32, ImmOffset)));
4543 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4544 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4545 return (VMax & 3) + (SMax & 3) >= 4;
4546}
4547
4549AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4550 Register Addr = Root.getReg();
4551 Register PtrBase;
4552 int64_t ConstOffset;
4553 int64_t ImmOffset = 0;
4554
4555 // Match the immediate offset first, which canonically is moved as low as
4556 // possible.
4557 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4558
4559 Register OrigAddr = Addr;
4560 if (ConstOffset != 0 &&
4561 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4562 Addr = PtrBase;
4563 ImmOffset = ConstOffset;
4564 }
4565
4566 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4567 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4568 return std::nullopt;
4569
4570 Register RHS = AddrDef->MI->getOperand(2).getReg();
4571 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4572 return std::nullopt;
4573
4574 Register LHS = AddrDef->MI->getOperand(1).getReg();
4575 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4576
4577 if (OrigAddr != Addr) {
4578 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4579 return std::nullopt;
4580 } else {
4581 if (!isFlatScratchBaseLegalSV(OrigAddr))
4582 return std::nullopt;
4583 }
4584
4585 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4586 return std::nullopt;
4587
4588 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4589 int FI = LHSDef->MI->getOperand(1).getIndex();
4590 return {{
4591 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4592 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4593 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4594 }};
4595 }
4596
4597 if (!isSGPR(LHS))
4598 return std::nullopt;
4599
4600 return {{
4601 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4602 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4603 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4604 }};
4605}
4606
4608AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4609 MachineInstr *MI = Root.getParent();
4610 MachineBasicBlock *MBB = MI->getParent();
4611 const SIMachineFunctionInfo *Info
4612 = MF->getInfo<SIMachineFunctionInfo>();
4613
4614 int64_t Offset = 0;
4615 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4616 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4617 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4618
4619 // TODO: Should this be inside the render function? The iterator seems to
4620 // move.
4621 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4622 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4623 HighBits)
4624 .addImm(Offset & ~MaxOffset);
4625
4626 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4627 MIB.addReg(Info->getScratchRSrcReg());
4628 },
4629 [=](MachineInstrBuilder &MIB) { // vaddr
4630 MIB.addReg(HighBits);
4631 },
4632 [=](MachineInstrBuilder &MIB) { // soffset
4633 // Use constant zero for soffset and rely on eliminateFrameIndex
4634 // to choose the appropriate frame register if need be.
4635 MIB.addImm(0);
4636 },
4637 [=](MachineInstrBuilder &MIB) { // offset
4638 MIB.addImm(Offset & MaxOffset);
4639 }}};
4640 }
4641
4642 assert(Offset == 0 || Offset == -1);
4643
4644 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4645 // offsets.
4646 std::optional<int> FI;
4647 Register VAddr = Root.getReg();
4648 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4649 Register PtrBase;
4650 int64_t ConstOffset;
4651 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4652 if (ConstOffset != 0) {
4653 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4654 (!STI.privateMemoryResourceIsRangeChecked() ||
4655 KB->signBitIsZero(PtrBase))) {
4656 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4657 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4658 FI = PtrBaseDef->getOperand(1).getIndex();
4659 else
4660 VAddr = PtrBase;
4661 Offset = ConstOffset;
4662 }
4663 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4664 FI = RootDef->getOperand(1).getIndex();
4665 }
4666 }
4667
4668 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4669 MIB.addReg(Info->getScratchRSrcReg());
4670 },
4671 [=](MachineInstrBuilder &MIB) { // vaddr
4672 if (FI)
4673 MIB.addFrameIndex(*FI);
4674 else
4675 MIB.addReg(VAddr);
4676 },
4677 [=](MachineInstrBuilder &MIB) { // soffset
4678 // Use constant zero for soffset and rely on eliminateFrameIndex
4679 // to choose the appropriate frame register if need be.
4680 MIB.addImm(0);
4681 },
4682 [=](MachineInstrBuilder &MIB) { // offset
4683 MIB.addImm(Offset);
4684 }}};
4685}
4686
4687bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4688 int64_t Offset) const {
4689 if (!isUInt<16>(Offset))
4690 return false;
4691
4692 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4693 return true;
4694
4695 // On Southern Islands, instructions with a negative base value and an
4696 // offset don't seem to work.
4697 return KB->signBitIsZero(Base);
4698}
4699
4700bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4701 int64_t Offset1,
4702 unsigned Size) const {
4703 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4704 return false;
4705 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4706 return false;
4707
4708 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4709 return true;
4710
4711 // On Southern Islands, instructions with a negative base value and an
4712 // offset don't seem to work.
4713 return KB->signBitIsZero(Base);
4714}
4715
4716// Return whether the operation has NoUnsignedWrap property.
4717static bool isNoUnsignedWrap(MachineInstr *Addr) {
4718 return Addr->getOpcode() == TargetOpcode::G_OR ||
4719 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4720 Addr->getFlag(MachineInstr::NoUWrap));
4721}
4722
4723// Check that the base address of flat scratch load/store in the form of `base +
4724// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
4725// requirement). We always treat the first operand as the base address here.
4726bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4727 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4728
4729 if (isNoUnsignedWrap(AddrMI))
4730 return true;
4731
4732 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4733 // values.
4734 if (STI.hasSignedScratchOffsets())
4735 return true;
4736
4737 Register LHS = AddrMI->getOperand(1).getReg();
4738 Register RHS = AddrMI->getOperand(2).getReg();
4739
4740 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4741 std::optional<ValueAndVReg> RhsValReg =
4742 getIConstantVRegValWithLookThrough(RHS, *MRI);
4743 // If the immediate offset is negative and within a certain range, the base
4744 // address cannot also be negative. If the base is also negative, the sum
4745 // would be either negative or much larger than the valid range of scratch
4746 // memory a thread can access.
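// (In other words, only fold offsets in the open range (-0x40000000, 0); a
// more negative offset could not be compensated by any valid base.)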
4747 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4748 RhsValReg->Value.getSExtValue() > -0x40000000)
4749 return true;
4750 }
4751
4752 return KB->signBitIsZero(LHS);
4753}
4754
4755 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4756 // the form of: SGPR + VGPR.
4757bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4758 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4759
4760 if (isNoUnsignedWrap(AddrMI))
4761 return true;
4762
4763 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4764 // values.
4765 if (STI.hasSignedScratchOffsets())
4766 return true;
4767
4768 Register LHS = AddrMI->getOperand(1).getReg();
4769 Register RHS = AddrMI->getOperand(2).getReg();
4770 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4771}
4772
4773 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4774 // the form of: SGPR + VGPR + Imm.
4775bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4776 Register Addr) const {
4777 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4778 // values.
4779 if (STI.hasSignedScratchOffsets())
4780 return true;
4781
4782 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4783 Register Base = AddrMI->getOperand(1).getReg();
4784 std::optional<DefinitionAndSourceRegister> BaseDef =
4785 getDefSrcRegIgnoringCopies(Base, *MRI);
4786 std::optional<ValueAndVReg> RHSOffset =
4787 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4788 assert(RHSOffset);
4789
4790 // If the immediate offset is negative and within a certain range, the base
4791 // address cannot also be negative. If the base is also negative, the sum
4792 // would be either negative or much larger than the valid range of scratch
4793 // memory a thread can access.
4794 if (isNoUnsignedWrap(BaseDef->MI) &&
4795 (isNoUnsignedWrap(AddrMI) ||
4796 (RHSOffset->Value.getSExtValue() < 0 &&
4797 RHSOffset->Value.getSExtValue() > -0x40000000)))
4798 return true;
4799
4800 Register LHS = BaseDef->MI->getOperand(1).getReg();
4801 Register RHS = BaseDef->MI->getOperand(2).getReg();
4802 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4803}
4804
4805bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4806 unsigned ShAmtBits) const {
4807 assert(MI.getOpcode() == TargetOpcode::G_AND);
4808
4809 std::optional<APInt> RHS =
4810 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4811 if (!RHS)
4812 return false;
4813
4814 if (RHS->countr_one() >= ShAmtBits)
4815 return true;
4816
4817 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4818 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4819}
4820
4822AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4823 MachineOperand &Root) const {
4824 Register Reg = Root.getReg();
4825 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4826
4827 std::optional<DefinitionAndSourceRegister> Def =
4828 getDefSrcRegIgnoringCopies(Reg, *MRI);
4829 assert(Def && "this shouldn't be an optional result");
4830 Reg = Def->Reg;
4831
4832 if (Register WaveBase = getWaveAddress(Def->MI)) {
4833 return {{
4834 [=](MachineInstrBuilder &MIB) { // rsrc
4835 MIB.addReg(Info->getScratchRSrcReg());
4836 },
4837 [=](MachineInstrBuilder &MIB) { // soffset
4838 MIB.addReg(WaveBase);
4839 },
4840 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4841 }};
4842 }
4843
4844 int64_t Offset = 0;
4845
4846 // FIXME: Copy check is a hack
4847 Register BasePtr;
4848 if (mi_match(Reg, *MRI,
4849 m_GPtrAdd(m_Reg(BasePtr),
4850 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4851 if (!TII.isLegalMUBUFImmOffset(Offset))
4852 return {};
4853 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4854 Register WaveBase = getWaveAddress(BasePtrDef);
4855 if (!WaveBase)
4856 return {};
4857
4858 return {{
4859 [=](MachineInstrBuilder &MIB) { // rsrc
4860 MIB.addReg(Info->getScratchRSrcReg());
4861 },
4862 [=](MachineInstrBuilder &MIB) { // soffset
4863 MIB.addReg(WaveBase);
4864 },
4865 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4866 }};
4867 }
4868
4869 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4870 !TII.isLegalMUBUFImmOffset(Offset))
4871 return {};
4872
4873 return {{
4874 [=](MachineInstrBuilder &MIB) { // rsrc
4875 MIB.addReg(Info->getScratchRSrcReg());
4876 },
4877 [=](MachineInstrBuilder &MIB) { // soffset
4878 MIB.addImm(0);
4879 },
4880 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4881 }};
4882}
4883
4884std::pair<Register, unsigned>
4885AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4886 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4887 if (!RootDef)
4888 return std::pair(Root.getReg(), 0);
4889
4890 int64_t ConstAddr = 0;
4891
4892 Register PtrBase;
4893 int64_t Offset;
4894 std::tie(PtrBase, Offset) =
4895 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4896
4897 if (Offset) {
4898 if (isDSOffsetLegal(PtrBase, Offset)) {
4899 // (add n0, c0)
4900 return std::pair(PtrBase, Offset);
4901 }
4902 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4903 // TODO
4904
4905
4906 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4907 // TODO
4908
4909 }
4910
4911 return std::pair(Root.getReg(), 0);
4912}
4913
4915AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4916 Register Reg;
4917 unsigned Offset;
4918 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4919 return {{
4920 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4921 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4922 }};
4923}
4924
4926AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4927 return selectDSReadWrite2(Root, 4);
4928}
4929
4931AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4932 return selectDSReadWrite2(Root, 8);
4933}
4934
4936AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4937 unsigned Size) const {
4938 Register Reg;
4939 unsigned Offset;
4940 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4941 return {{
4942 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4943 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4944 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4945 }};
4946}
4947
4948std::pair<Register, unsigned>
4949AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4950 unsigned Size) const {
4951 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4952 if (!RootDef)
4953 return std::pair(Root.getReg(), 0);
4954
4955 int64_t ConstAddr = 0;
4956
4957 Register PtrBase;
4958 int64_t Offset;
4959 std::tie(PtrBase, Offset) =
4960 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4961
4962 if (Offset) {
4963 int64_t OffsetValue0 = Offset;
4964 int64_t OffsetValue1 = Offset + Size;
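// For example, with Size = 4 a pair starting at byte offset 40 is encoded as
// element offsets 10 and 11 (the caller renders Offset and Offset + 1).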
4965 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4966 // (add n0, c0)
4967 return std::pair(PtrBase, OffsetValue0 / Size);
4968 }
4969 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4970 // TODO
4971
4972 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4973 // TODO
4974
4975 }
4976
4977 return std::pair(Root.getReg(), 0);
4978}
4979
4980/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4981/// the base value with the constant offset. There may be intervening copies
4982/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4983/// not match the pattern.
4984std::pair<Register, int64_t>
4985AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4986 Register Root, const MachineRegisterInfo &MRI) const {
4987 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4988 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4989 return {Root, 0};
4990
4991 MachineOperand &RHS = RootI->getOperand(2);
4992 std::optional<ValueAndVReg> MaybeOffset =
4993 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4994 if (!MaybeOffset)
4995 return {Root, 0};
4996 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4997}
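// Illustrative MIR shape matched above (register names are made up for the
// example):
//   %c:_(s64) = G_CONSTANT i64 16
//   %p:_(p1) = G_PTR_ADD %base, %c
// getPtrBaseWithConstantOffset(%p) returns {%base, 16}; copies between Root and
// the G_PTR_ADD, and between the right operand and its G_CONSTANT, are looked
// through.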
4998
4999static void addZeroImm(MachineInstrBuilder &MIB) {
5000 MIB.addImm(0);
5001}
5002
5003/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5004/// BasePtr is not valid, a null base pointer will be used.
5005static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5006 uint32_t FormatLo, uint32_t FormatHi,
5007 Register BasePtr) {
5008 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5009 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5010 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5011 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5012
5013 B.buildInstr(AMDGPU::S_MOV_B32)
5014 .addDef(RSrc2)
5015 .addImm(FormatLo);
5016 B.buildInstr(AMDGPU::S_MOV_B32)
5017 .addDef(RSrc3)
5018 .addImm(FormatHi);
5019
5020 // Build the subregister half that holds the constants before building the
5021 // full 128-bit register. If we are building multiple resource descriptors,
5022 // this will allow CSEing of the 2-component register.
5023 B.buildInstr(AMDGPU::REG_SEQUENCE)
5024 .addDef(RSrcHi)
5025 .addReg(RSrc2)
5026 .addImm(AMDGPU::sub0)
5027 .addReg(RSrc3)
5028 .addImm(AMDGPU::sub1);
5029
5030 Register RSrcLo = BasePtr;
5031 if (!BasePtr) {
5032 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5033 B.buildInstr(AMDGPU::S_MOV_B64)
5034 .addDef(RSrcLo)
5035 .addImm(0);
5036 }
5037
5038 B.buildInstr(AMDGPU::REG_SEQUENCE)
5039 .addDef(RSrc)
5040 .addReg(RSrcLo)
5041 .addImm(AMDGPU::sub0_sub1)
5042 .addReg(RSrcHi)
5043 .addImm(AMDGPU::sub2_sub3);
5044
5045 return RSrc;
5046}
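// Layout of the descriptor assembled above: sub0_sub1 holds the 64-bit base
// pointer (or an S_MOV_B64 0 when no BasePtr is given) and sub2_sub3 holds the
// two constant dwords {FormatLo, FormatHi}. Keeping the constant pair in its
// own REG_SEQUENCE is what allows it to be CSE'd between descriptors that
// differ only in the base pointer.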
5047
5048static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5049 const SIInstrInfo &TII, Register BasePtr) {
5050 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5051
5052 // FIXME: Why are half the "default" bits ignored based on the addressing
5053 // mode?
5054 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5055}
5056
5057static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5058 const SIInstrInfo &TII, Register BasePtr) {
5059 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5060
5061 // FIXME: Why are half the "default" bits ignored based on the addressing
5062 // mode?
5063 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5064}
5065
5066AMDGPUInstructionSelector::MUBUFAddressData
5067AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5068 MUBUFAddressData Data;
5069 Data.N0 = Src;
5070
5071 Register PtrBase;
5072 int64_t Offset;
5073
5074 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5075 if (isUInt<32>(Offset)) {
5076 Data.N0 = PtrBase;
5077 Data.Offset = Offset;
5078 }
5079
5080 if (MachineInstr *InputAdd
5081 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5082 Data.N2 = InputAdd->getOperand(1).getReg();
5083 Data.N3 = InputAdd->getOperand(2).getReg();
5084
5085 // FIXME: Need to fix extra SGPR->VGPR copies inserted
5086 // FIXME: Don't know if this was defined by operand 0
5087 //
5088 // TODO: Remove this when we have copy folding optimizations after
5089 // RegBankSelect.
5090 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5091 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5092 }
5093
5094 return Data;
5095}
5096
5097/// Return true if the addr64 MUBUF addressing mode should be used for the given address.
5098bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5099 // (ptr_add N2, N3) -> addr64, or
5100 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5101 if (Addr.N2)
5102 return true;
5103
5104 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5105 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5106}
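// In short: addr64 is used when the address is itself a two-register ptr_add
// (N2/N3 present) or when the single pointer N0 is in the VGPR bank; a uniform
// single pointer is instead folded into the SRD base by selectMUBUFOffsetImpl
// below.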
5107
5108/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5109/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5110/// component.
5111void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5112 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5113 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5114 return;
5115
5116 // Illegal offset, store it in soffset.
5117 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5118 B.buildInstr(AMDGPU::S_MOV_B32)
5119 .addDef(SOffset)
5120 .addImm(ImmOffset);
5121 ImmOffset = 0;
5122}
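// Note that this does not split the value into a legal immediate plus a
// remainder: if the constant does not fit the MUBUF immediate field, the whole
// offset is materialized into an SGPR and passed through soffset, and the
// immediate is reset to 0.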
5123
5124bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5125 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5126 Register &SOffset, int64_t &Offset) const {
5127 // FIXME: Predicates should stop this from reaching here.
5128 // addr64 bit was removed for volcanic islands.
5129 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5130 return false;
5131
5132 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5133 if (!shouldUseAddr64(AddrData))
5134 return false;
5135
5136 Register N0 = AddrData.N0;
5137 Register N2 = AddrData.N2;
5138 Register N3 = AddrData.N3;
5139 Offset = AddrData.Offset;
5140
5141 // Base pointer for the SRD.
5142 Register SRDPtr;
5143
5144 if (N2) {
5145 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5146 assert(N3);
5147 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5148 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5149 // addr64, and construct the default resource from a 0 address.
5150 VAddr = N0;
5151 } else {
5152 SRDPtr = N3;
5153 VAddr = N2;
5154 }
5155 } else {
5156 // N2 is not divergent.
5157 SRDPtr = N2;
5158 VAddr = N3;
5159 }
5160 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5161 // Use the default null pointer in the resource
5162 VAddr = N0;
5163 } else {
5164 // N0 -> offset, or
5165 // (N0 + C1) -> offset
5166 SRDPtr = N0;
5167 }
5168
5169 MachineIRBuilder B(*Root.getParent());
5170 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5171 splitIllegalMUBUFOffset(B, SOffset, Offset);
5172 return true;
5173}
5174
5175bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5176 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5177 int64_t &Offset) const {
5178
5179 // FIXME: Pattern should not reach here.
5180 if (STI.useFlatForGlobal())
5181 return false;
5182
5183 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5184 if (shouldUseAddr64(AddrData))
5185 return false;
5186
5187 // N0 -> offset, or
5188 // (N0 + C1) -> offset
5189 Register SRDPtr = AddrData.N0;
5190 Offset = AddrData.Offset;
5191
5192 // TODO: Look through extensions for 32-bit soffset.
5193 MachineIRBuilder B(*Root.getParent());
5194
5195 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5196 splitIllegalMUBUFOffset(B, SOffset, Offset);
5197 return true;
5198}
5199
5200InstructionSelector::ComplexRendererFns
5201AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5202 Register VAddr;
5203 Register RSrcReg;
5204 Register SOffset;
5205 int64_t Offset = 0;
5206
5207 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5208 return {};
5209
5210 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5211 // pattern.
5212 return {{
5213 [=](MachineInstrBuilder &MIB) { // rsrc
5214 MIB.addReg(RSrcReg);
5215 },
5216 [=](MachineInstrBuilder &MIB) { // vaddr
5217 MIB.addReg(VAddr);
5218 },
5219 [=](MachineInstrBuilder &MIB) { // soffset
5220 if (SOffset)
5221 MIB.addReg(SOffset);
5222 else if (STI.hasRestrictedSOffset())
5223 MIB.addReg(AMDGPU::SGPR_NULL);
5224 else
5225 MIB.addImm(0);
5226 },
5227 [=](MachineInstrBuilder &MIB) { // offset
5228 MIB.addImm(Offset);
5229 },
5230 addZeroImm, // cpol
5231 addZeroImm, // tfe
5232 addZeroImm // swz
5233 }};
5234}
5235
5236InstructionSelector::ComplexRendererFns
5237AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5238 Register RSrcReg;
5239 Register SOffset;
5240 int64_t Offset = 0;
5241
5242 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5243 return {};
5244
5245 return {{
5246 [=](MachineInstrBuilder &MIB) { // rsrc
5247 MIB.addReg(RSrcReg);
5248 },
5249 [=](MachineInstrBuilder &MIB) { // soffset
5250 if (SOffset)
5251 MIB.addReg(SOffset);
5252 else if (STI.hasRestrictedSOffset())
5253 MIB.addReg(AMDGPU::SGPR_NULL);
5254 else
5255 MIB.addImm(0);
5256 },
5257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5258 addZeroImm, // cpol
5259 addZeroImm, // tfe
5260 addZeroImm, // swz
5261 }};
5262}
5263
5264InstructionSelector::ComplexRendererFns
5265AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5266
5267 Register SOffset = Root.getReg();
5268
5269 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5270 SOffset = AMDGPU::SGPR_NULL;
5271
5272 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5273}
5274
5275/// Get an immediate that must be 32-bits, and treated as zero extended.
5276static std::optional<uint64_t>
5277getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5278 // getIConstantVRegVal sexts any values, so see if that matters.
5279 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5280 if (!OffsetVal || !isInt<32>(*OffsetVal))
5281 return std::nullopt;
5282 return Lo_32(*OffsetVal);
5283}
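// Example of why the zero-extension matters: a G_CONSTANT i32 -16 comes back
// from getIConstantVRegSExtVal as -16; it passes isInt<32>, and Lo_32 produces
// the zero-extended encoding 0xFFFFFFF0 expected by the 32-bit buffer offsets
// below.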
5284
5285InstructionSelector::ComplexRendererFns
5286AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5287 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5288 if (!OffsetVal)
5289 return {};
5290
5291 std::optional<int64_t> EncodedImm =
5292 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5293 if (!EncodedImm)
5294 return {};
5295
5296 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5297}
5298
5299InstructionSelector::ComplexRendererFns
5300AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5302
5303 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5304 if (!OffsetVal)
5305 return {};
5306
5307 std::optional<int64_t> EncodedImm =
5308 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5309 if (!EncodedImm)
5310 return {};
5311
5312 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5313}
5314
5315InstructionSelector::ComplexRendererFns
5316AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5317 // Match the (soffset + offset) pair as a 32-bit register base and
5318 // an immediate offset.
5319 Register SOffset;
5320 unsigned Offset;
5321 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5322 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5323 if (!SOffset)
5324 return std::nullopt;
5325
5326 std::optional<int64_t> EncodedOffset =
5327 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5328 if (!EncodedOffset)
5329 return std::nullopt;
5330
5331 assert(MRI->getType(SOffset) == LLT::scalar(32));
5332 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5333 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5334}
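// The CheckNUW flag passed to getBaseWithConstantOffset above only accepts
// additions carrying the no-unsigned-wrap flag, so folding the constant into
// the SMRD immediate field cannot change the address the hardware computes.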
5335
5336// Variant of stripBitCast that returns the instruction instead of a
5337// MachineOperand.
5338static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5339 if (MI->getOpcode() == AMDGPU::G_BITCAST)
5340 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5341 return MI;
5342}
5343
5344// Figure out if this is really an extract of the high 16-bits of a dword,
5345// returns nullptr if it isn't.
5346static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5347 MachineRegisterInfo &MRI) {
5348 Inst = stripBitCast(Inst, MRI);
5349
5350 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5351 return nullptr;
5352
5353 MachineInstr *TruncOp =
5354 getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5355 TruncOp = stripBitCast(TruncOp, MRI);
5356
5357 // G_LSHR x, (G_CONSTANT i32 16)
5358 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5359 auto SrlAmount = getIConstantVRegValWithLookThrough(
5360 TruncOp->getOperand(2).getReg(), MRI);
5361 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5362 MachineInstr *SrlOp =
5363 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5364 return stripBitCast(SrlOp, MRI);
5365 }
5366 }
5367
5368 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5369 // 1, 0 swaps the low/high 16 bits.
5370 // 1, 1 sets the high 16 bits to be the same as the low 16.
5371 // in any case, it selects the high elts.
5372 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5373 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5374 LLT::fixed_vector(2, 16));
5375
5376 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5377 assert(Mask.size() == 2);
5378
5379 if (Mask[0] == 1 && Mask[1] <= 1) {
5380 MachineInstr *LHS =
5381 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5382 return stripBitCast(LHS, MRI);
5383 }
5384 }
5385
5386 return nullptr;
5387}
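// Typical shape matched above (illustrative MIR):
//   %srl:_(s32) = G_LSHR %x:_(s32), 16
//   %hi:_(s16) = G_TRUNC %srl
// returns the (bitcast-stripped) definition of %x. The shuffle form covers
// <2 x s16> values whose mask re-selects the high half.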
5388
5389std::pair<Register, unsigned>
5390AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5391 bool &Matched) const {
5392 Matched = false;
5393
5394 Register Src;
5395 unsigned Mods;
5396 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5397
5398 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5399 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5400 MachineOperand *MO = &MI->getOperand(1);
5401 Src = MO->getReg();
5402 MI = getDefIgnoringCopies(Src, *MRI);
5403
5404 assert(MRI->getType(Src) == LLT::scalar(16));
5405
5406 // See through bitcasts.
5407 // FIXME: Would be nice to use stripBitCast here.
5408 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5409 MO = &MI->getOperand(1);
5410 Src = MO->getReg();
5411 MI = getDefIgnoringCopies(Src, *MRI);
5412 }
5413
5414 const auto CheckAbsNeg = [&]() {
5415 // Be careful about folding modifiers if we already have an abs. fneg is
5416 // applied last, so we don't want to apply an earlier fneg.
5417 if ((Mods & SISrcMods::ABS) == 0) {
5418 unsigned ModsTmp;
5419 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5420 MI = getDefIgnoringCopies(Src, *MRI);
5421
5422 if ((ModsTmp & SISrcMods::NEG) != 0)
5423 Mods ^= SISrcMods::NEG;
5424
5425 if ((ModsTmp & SISrcMods::ABS) != 0)
5426 Mods |= SISrcMods::ABS;
5427 }
5428 };
5429
5430 CheckAbsNeg();
5431
5432 // op_sel/op_sel_hi decide the source type and source.
5433 // If the source's op_sel_hi is set, it indicates to do a conversion from
5434 // fp16. If the source's op_sel is set, it picks the high half of the
5435 // source register.
5436
5437 Mods |= SISrcMods::OP_SEL_1;
5438
5439 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5440 Mods |= SISrcMods::OP_SEL_0;
5441 MI = ExtractHiEltMI;
5442 MO = &MI->getOperand(0);
5443 Src = MO->getReg();
5444
5445 CheckAbsNeg();
5446 }
5447
5448 Matched = true;
5449 }
5450
5451 return {Src, Mods};
5452}
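// Net effect for the mixed-precision instructions this feeds (e.g.
// v_fma_mix_f32): a matched G_FPEXT marks the operand as f16, so OP_SEL_1 is
// always set, and OP_SEL_0 is added only when the f16 value lives in the high
// half of its 32-bit register; any abs/neg found along the way is folded into
// Mods.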
5453
5454InstructionSelector::ComplexRendererFns
5455AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5456 MachineOperand &Root) const {
5457 Register Src;
5458 unsigned Mods;
5459 bool Matched;
5460 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5461 if (!Matched)
5462 return {};
5463
5464 return {{
5465 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5466 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5467 }};
5468}
5469
5470InstructionSelector::ComplexRendererFns
5471AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5472 Register Src;
5473 unsigned Mods;
5474 bool Matched;
5475 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5476
5477 return {{
5478 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5479 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5480 }};
5481}
5482
5483bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5484 MachineInstr &I, Intrinsic::ID IntrID) const {
5485 MachineBasicBlock *MBB = I.getParent();
5486 const DebugLoc &DL = I.getDebugLoc();
5487 Register CCReg = I.getOperand(0).getReg();
5488
5489 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5490
5491 if (HasM0) {
5492 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5493 .addReg(I.getOperand(2).getReg());
5494 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5495 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5496 return false;
5497 } else {
5498 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5499 .addImm(I.getOperand(2).getImm());
5500 }
5501
5502 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5503
5504 I.eraseFromParent();
5505 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5506 *MRI);
5507}
5508
5509unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5510 if (HasInlineConst) {
5511 switch (IntrID) {
5512 default:
5513 llvm_unreachable("not a named barrier op");
5514 case Intrinsic::amdgcn_s_barrier_init:
5515 return AMDGPU::S_BARRIER_INIT_IMM;
5516 case Intrinsic::amdgcn_s_barrier_join:
5517 return AMDGPU::S_BARRIER_JOIN_IMM;
5518 case Intrinsic::amdgcn_s_wakeup_barrier:
5519 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5520 case Intrinsic::amdgcn_s_get_barrier_state:
5521 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5522 };
5523 } else {
5524 switch (IntrID) {
5525 default:
5526 llvm_unreachable("not a named barrier op");
5527 case Intrinsic::amdgcn_s_barrier_init:
5528 return AMDGPU::S_BARRIER_INIT_M0;
5529 case Intrinsic::amdgcn_s_barrier_join:
5530 return AMDGPU::S_BARRIER_JOIN_M0;
5531 case Intrinsic::amdgcn_s_wakeup_barrier:
5532 return AMDGPU::S_WAKEUP_BARRIER_M0;
5533 case Intrinsic::amdgcn_s_get_barrier_state:
5534 return AMDGPU::S_GET_BARRIER_STATE_M0;
5535 };
5536 }
5537}
5538
5539bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5540 MachineInstr &I, Intrinsic::ID IntrID) const {
5541 MachineBasicBlock *MBB = I.getParent();
5542 const DebugLoc &DL = I.getDebugLoc();
5543 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5544 ? I.getOperand(2)
5545 : I.getOperand(1);
5546 std::optional<int64_t> BarValImm =
5547 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5548 Register M0Val;
5549 Register TmpReg0;
5550
5551 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5552 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5553 Register MemberCount = I.getOperand(2).getReg();
5554 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5555 // TODO: This should be expanded during legalization so that the S_LSHL
5556 // and S_OR can be constant-folded
5557 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5558 .addImm(16)
5559 .addReg(MemberCount);
5560 M0Val = TmpReg0;
5561 }
5562
5563 // If not inlinable, get reference to barrier depending on the instruction
5564 if (!BarValImm) {
5565 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5566 // If the barrier id is not an inlinable constant, it must be referenced
5567 // via M0[4:0]. Perform an OR with the member count to include it in M0
5568 // for S_BARRIER_INIT.
5569 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5570 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5571 .addReg(BarOp.getReg())
5572 .addReg(TmpReg0);
5573 M0Val = TmpReg1;
5574 } else {
5575 M0Val = BarOp.getReg();
5576 }
5577 }
5578
5579 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5580 if (M0Val) {
5581 auto CopyMIB =
5582 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5583 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5584 }
5585
5586 MachineInstrBuilder MIB;
5587 unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5588 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5589
5590 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5591 MIB.addDef(I.getOperand(0).getReg());
5592
5593 if (BarValImm)
5594 MIB.addImm(*BarValImm);
5595
5596 I.eraseFromParent();
5597 return true;
5598}
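// M0 layout assembled above when the barrier id is not an inlinable constant:
// the id goes in M0[4:0], and for S_BARRIER_INIT the member count is shifted
// into M0[16:22] with S_LSHL_B32 and merged in with S_OR_B32 before the COPY
// to M0.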
5599
5600bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5601 MachineBasicBlock *BB = I.getParent();
5602 const DebugLoc &DL = I.getDebugLoc();
5603 Register CCReg = I.getOperand(0).getReg();
5604
5605 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5606 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5607
5608 I.eraseFromParent();
5609 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5610 *MRI);
5611}
5612
5613void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5614 const MachineInstr &MI,
5615 int OpIdx) const {
5616 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5617 "Expected G_CONSTANT");
5618 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5619}
5620
5621void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5622 const MachineInstr &MI,
5623 int OpIdx) const {
5624 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5625 "Expected G_CONSTANT");
5626 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5627}
5628
5629void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5630 const MachineInstr &MI,
5631 int OpIdx) const {
5632 assert(OpIdx == -1);
5633
5634 const MachineOperand &Op = MI.getOperand(1);
5635 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5636 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5637 else {
5638 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5639 MIB.addImm(Op.getCImm()->getSExtValue());
5640 }
5641}
5642
5643void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5644 const MachineInstr &MI,
5645 int OpIdx) const {
5646 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5647 "Expected G_CONSTANT");
5648 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5649}
5650
5651/// This only really exists to satisfy the DAG type-checking machinery, so it
5652/// is a no-op here.
5653void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5654 const MachineInstr &MI,
5655 int OpIdx) const {
5656 MIB.addImm(MI.getOperand(OpIdx).getImm());
5657}
5658
5659void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5660 const MachineInstr &MI,
5661 int OpIdx) const {
5662 assert(OpIdx >= 0 && "expected to match an immediate operand");
5663 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5664}
5665
5666void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5667 const MachineInstr &MI,
5668 int OpIdx) const {
5669 assert(OpIdx >= 0 && "expected to match an immediate operand");
5670 MIB.addImm(MI.getOperand(OpIdx).getImm() &
5671 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5672 : AMDGPU::CPol::ALL_pregfx12));
5673}
5674
5675void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5676 const MachineInstr &MI,
5677 int OpIdx) const {
5678 assert(OpIdx >= 0 && "expected to match an immediate operand");
5679 const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5680 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5681 : AMDGPU::CPol::SWZ_pregfx12);
5682 MIB.addImm(Swizzle);
5683}
5684
5685void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5686 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5687 assert(OpIdx >= 0 && "expected to match an immediate operand");
5688 const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5689 (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5690 : AMDGPU::CPol::ALL_pregfx12);
5691 MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5692}
5693
5694void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5695 const MachineInstr &MI,
5696 int OpIdx) const {
5697 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5698}
5699
5700void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5701 const MachineInstr &MI,
5702 int OpIdx) const {
5703 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5704 int ExpVal = APF.getExactLog2Abs();
5705 assert(ExpVal != INT_MIN);
5706 MIB.addImm(ExpVal);
5707}
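// getExactLog2Abs yields the exponent only for exact powers of two (ignoring
// sign), e.g. 4.0 -> 2 and 0.5 -> -1; anything else returns INT_MIN, which the
// assert above rules out for immediates reaching this renderer.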
5708
5709bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5710 return TII.isInlineConstant(Imm);
5711}
5712
5713bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5714 return TII.isInlineConstant(Imm);
5715}