1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-isel"
31 
32 using namespace llvm;
33 using namespace MIPatternMatch;
34 
35 static cl::opt<bool> AllowRiskySelect(
36   "amdgpu-global-isel-risky-select",
37   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
38   cl::init(false),
39   cl::ReallyHidden);
40 
41 #define GET_GLOBALISEL_IMPL
42 #define AMDGPUSubtarget GCNSubtarget
43 #include "AMDGPUGenGlobalISel.inc"
44 #undef GET_GLOBALISEL_IMPL
45 #undef AMDGPUSubtarget
46 
47 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
48     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
49     const AMDGPUTargetMachine &TM)
50     : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
51       STI(STI),
52       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
53 #define GET_GLOBALISEL_PREDICATES_INIT
54 #include "AMDGPUGenGlobalISel.inc"
55 #undef GET_GLOBALISEL_PREDICATES_INIT
56 #define GET_GLOBALISEL_TEMPORARIES_INIT
57 #include "AMDGPUGenGlobalISel.inc"
58 #undef GET_GLOBALISEL_TEMPORARIES_INIT
59 {
60 }
61 
62 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
63 
64 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
65                                         CodeGenCoverage &CoverageInfo,
66                                         ProfileSummaryInfo *PSI,
67                                         BlockFrequencyInfo *BFI) {
68   MRI = &MF.getRegInfo();
69   Subtarget = &MF.getSubtarget<GCNSubtarget>();
70   InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
71 }
72 
73 bool AMDGPUInstructionSelector::isVCC(Register Reg,
74  const MachineRegisterInfo &MRI) const {
75  // The verifier is oblivious to s1 being a valid value for wavesize registers.
76  if (Reg.isPhysical())
77  return false;
78 
79  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
80  const TargetRegisterClass *RC =
81  RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
82  if (RC) {
83  const LLT Ty = MRI.getType(Reg);
84  if (!Ty.isValid() || Ty.getSizeInBits() != 1)
85  return false;
86  // G_TRUNC s1 result is never vcc.
87  return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
88  RC->hasSuperClassEq(TRI.getBoolRC());
89  }
90 
91  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
92  return RB->getID() == AMDGPU::VCCRegBankID;
93 }
94 
95 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
96  unsigned NewOpc) const {
97  MI.setDesc(TII.get(NewOpc));
98  MI.removeOperand(1); // Remove intrinsic ID.
99  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
100 
101  MachineOperand &Dst = MI.getOperand(0);
102  MachineOperand &Src = MI.getOperand(1);
103 
104  // TODO: This should be legalized to s32 if needed
105  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
106  return false;
107 
108  const TargetRegisterClass *DstRC
109  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
110  const TargetRegisterClass *SrcRC
111  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
112  if (!DstRC || DstRC != SrcRC)
113  return false;
114 
115  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
116  RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
117 }
118 
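// Select a generic COPY. Copies producing a wave-size boolean (VCC bank) need
// extra handling: a constant source is materialized as a full mask with S_MOV
// of 0/-1, and any other non-VCC source is normalized with an AND of bit 0
// followed by V_CMP_NE so only the low bit is trusted.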
119 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
120  const DebugLoc &DL = I.getDebugLoc();
121  MachineBasicBlock *BB = I.getParent();
122  I.setDesc(TII.get(TargetOpcode::COPY));
123 
124  const MachineOperand &Src = I.getOperand(1);
125  MachineOperand &Dst = I.getOperand(0);
126  Register DstReg = Dst.getReg();
127  Register SrcReg = Src.getReg();
128 
129  if (isVCC(DstReg, *MRI)) {
130  if (SrcReg == AMDGPU::SCC) {
131  const TargetRegisterClass *RC
132  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
133  if (!RC)
134  return true;
135  return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
136  }
137 
138  if (!isVCC(SrcReg, *MRI)) {
139  // TODO: Should probably leave the copy and let copyPhysReg expand it.
140  if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
141  return false;
142 
143  const TargetRegisterClass *SrcRC
144  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
145 
146  Optional<ValueAndVReg> ConstVal =
147  getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
148  if (ConstVal) {
149  unsigned MovOpc =
150  STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
151  BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
152  .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
153  } else {
154  Register MaskedReg = MRI->createVirtualRegister(SrcRC);
155 
156  // We can't trust the high bits at this point, so clear them.
157 
158  // TODO: Skip masking high bits if def is known boolean.
159 
160  unsigned AndOpc =
161  TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
162  BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
163  .addImm(1)
164  .addReg(SrcReg);
165  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
166  .addImm(0)
167  .addReg(MaskedReg);
168  }
169 
170  if (!MRI->getRegClassOrNull(SrcReg))
171  MRI->setRegClass(SrcReg, SrcRC);
172  I.eraseFromParent();
173  return true;
174  }
175 
176  const TargetRegisterClass *RC =
177  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
178  if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
179  return false;
180 
181  return true;
182  }
183 
184  for (const MachineOperand &MO : I.operands()) {
185  if (MO.getReg().isPhysical())
186  continue;
187 
188  const TargetRegisterClass *RC =
189  TRI.getConstrainedRegClassForOperand(MO, *MRI);
190  if (!RC)
191  continue;
192  RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
193  }
194  return true;
195 }
196 
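// Select G_PHI to a target PHI, constraining the result to a register class
// derived from its type and register bank. Boolean (s1) phis are only
// selected when -amdgpu-global-isel-risky-select is enabled.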
197 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
198  const Register DefReg = I.getOperand(0).getReg();
199  const LLT DefTy = MRI->getType(DefReg);
200  if (DefTy == LLT::scalar(1)) {
201  if (!AllowRiskySelect) {
202  LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
203  return false;
204  }
205 
206  LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
207  }
208 
209  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
210 
211  const RegClassOrRegBank &RegClassOrBank =
212  MRI->getRegClassOrRegBank(DefReg);
213 
214  const TargetRegisterClass *DefRC
215  = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
216  if (!DefRC) {
217  if (!DefTy.isValid()) {
218  LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
219  return false;
220  }
221 
222  const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
223  DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
224  if (!DefRC) {
225  LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
226  return false;
227  }
228  }
229 
230  // TODO: Verify that all registers have the same bank
231  I.setDesc(TII.get(TargetOpcode::PHI));
232  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
233 }
234 
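// Return a 32-bit half (sub0 or sub1) of a 64-bit operand: a subregister copy
// for register operands, or the low/high 32 bits for immediates.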
235 MachineOperand
236 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
237  const TargetRegisterClass &SubRC,
238  unsigned SubIdx) const {
239 
240  MachineInstr *MI = MO.getParent();
241   MachineBasicBlock *BB = MO.getParent()->getParent();
242   Register DstReg = MRI->createVirtualRegister(&SubRC);
243 
244  if (MO.isReg()) {
245  unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
246  Register Reg = MO.getReg();
247  BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
248  .addReg(Reg, 0, ComposedSubIdx);
249 
250  return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
251  MO.isKill(), MO.isDead(), MO.isUndef(),
252  MO.isEarlyClobber(), 0, MO.isDebug(),
253  MO.isInternalRead());
254  }
255 
256  assert(MO.isImm());
257 
258  APInt Imm(64, MO.getImm());
259 
260  switch (SubIdx) {
261  default:
262  llvm_unreachable("do not know to split immediate with this sub index.");
263  case AMDGPU::sub0:
264  return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
265  case AMDGPU::sub1:
266  return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
267  }
268 }
269 
270 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
271  switch (Opc) {
272  case AMDGPU::G_AND:
273  return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
274  case AMDGPU::G_OR:
275  return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
276  case AMDGPU::G_XOR:
277  return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
278  default:
279  llvm_unreachable("not a bit op");
280  }
281 }
282 
283 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
284  Register DstReg = I.getOperand(0).getReg();
285  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
286 
287  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
288  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
289  DstRB->getID() != AMDGPU::VCCRegBankID)
290  return false;
291 
292  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
293  STI.isWave64());
294  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
295 
296  // Dead implicit-def of scc
297  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
298  true, // isImp
299  false, // isKill
300  true)); // isDead
301  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
302 }
303 
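// Select G_ADD/G_SUB. 32-bit cases map directly to SALU or VALU add/sub
// opcodes; 64-bit adds are split into low and high halves (add plus
// add-with-carry) and recombined with a REG_SEQUENCE.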
304 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
305  MachineBasicBlock *BB = I.getParent();
306  MachineFunction *MF = BB->getParent();
307  Register DstReg = I.getOperand(0).getReg();
308  const DebugLoc &DL = I.getDebugLoc();
309  LLT Ty = MRI->getType(DstReg);
310  if (Ty.isVector())
311  return false;
312 
313  unsigned Size = Ty.getSizeInBits();
314  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
315  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
316  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
317 
318  if (Size == 32) {
319  if (IsSALU) {
320  const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
321  MachineInstr *Add =
322  BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
323  .add(I.getOperand(1))
324  .add(I.getOperand(2));
325  I.eraseFromParent();
326  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
327  }
328 
329  if (STI.hasAddNoCarry()) {
330  const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
331  I.setDesc(TII.get(Opc));
332  I.addOperand(*MF, MachineOperand::CreateImm(0));
333  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
334  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
335  }
336 
337  const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
338 
339   Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
340   MachineInstr *Add
341     = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
342  .addDef(UnusedCarry, RegState::Dead)
343  .add(I.getOperand(1))
344  .add(I.getOperand(2))
345  .addImm(0);
346  I.eraseFromParent();
347  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
348  }
349 
350  assert(!Sub && "illegal sub should not reach here");
351 
352  const TargetRegisterClass &RC
353  = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
354  const TargetRegisterClass &HalfRC
355  = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
356 
357  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
358  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
359  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
360  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
361 
362  Register DstLo = MRI->createVirtualRegister(&HalfRC);
363  Register DstHi = MRI->createVirtualRegister(&HalfRC);
364 
365  if (IsSALU) {
366  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
367  .add(Lo1)
368  .add(Lo2);
369  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
370  .add(Hi1)
371  .add(Hi2);
372  } else {
373  const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
374  Register CarryReg = MRI->createVirtualRegister(CarryRC);
375  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
376  .addDef(CarryReg)
377  .add(Lo1)
378  .add(Lo2)
379  .addImm(0);
380  MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
381     .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
382     .add(Hi1)
383  .add(Hi2)
384  .addReg(CarryReg, RegState::Kill)
385  .addImm(0);
386 
387  if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
388  return false;
389  }
390 
391  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
392  .addReg(DstLo)
393  .addImm(AMDGPU::sub0)
394  .addReg(DstHi)
395  .addImm(AMDGPU::sub1);
396 
397 
398  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
399  return false;
400 
401  I.eraseFromParent();
402  return true;
403 }
404 
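// Select the carry-producing/consuming add and sub opcodes. A VCC carry-out
// selects the VALU V_ADD_CO/V_ADDC/V_SUB_CO/V_SUBB forms; otherwise the SALU
// forms are used, with the carry routed through copies to and from SCC.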
405 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
406  MachineInstr &I) const {
407  MachineBasicBlock *BB = I.getParent();
408  MachineFunction *MF = BB->getParent();
409  const DebugLoc &DL = I.getDebugLoc();
410  Register Dst0Reg = I.getOperand(0).getReg();
411  Register Dst1Reg = I.getOperand(1).getReg();
412  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
413  I.getOpcode() == AMDGPU::G_UADDE;
414  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
415  I.getOpcode() == AMDGPU::G_USUBE;
416 
417  if (isVCC(Dst1Reg, *MRI)) {
418  unsigned NoCarryOpc =
419  IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
420  unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
421  I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
422  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
423  I.addOperand(*MF, MachineOperand::CreateImm(0));
424  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
425  }
426 
427  Register Src0Reg = I.getOperand(2).getReg();
428  Register Src1Reg = I.getOperand(3).getReg();
429 
430  if (HasCarryIn) {
431  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
432  .addReg(I.getOperand(4).getReg());
433  }
434 
435  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
436  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
437 
438  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
439  .add(I.getOperand(2))
440  .add(I.getOperand(3));
441  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
442     .addReg(AMDGPU::SCC);
443 
444  if (!MRI->getRegClassOrNull(Dst1Reg))
445  MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
446 
447  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
448  !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
449  !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
450  return false;
451 
452  if (HasCarryIn &&
453  !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
454  AMDGPU::SReg_32RegClass, *MRI))
455  return false;
456 
457  I.eraseFromParent();
458  return true;
459 }
460 
461 bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
462  MachineInstr &I) const {
463  MachineBasicBlock *BB = I.getParent();
464  MachineFunction *MF = BB->getParent();
465  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
466 
467  unsigned Opc;
468  if (Subtarget->hasMADIntraFwdBug())
469  Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
470  : AMDGPU::V_MAD_I64_I32_gfx11_e64;
471  else
472  Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
473  I.setDesc(TII.get(Opc));
474  I.addOperand(*MF, MachineOperand::CreateImm(0));
475  I.addImplicitDefUseOperands(*MF);
476  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
477 }
478 
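// Select G_EXTRACT of a 32-bit-aligned chunk as a plain subregister copy.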
479 // TODO: We should probably legalize these to only using 32-bit results.
480 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
481  MachineBasicBlock *BB = I.getParent();
482  Register DstReg = I.getOperand(0).getReg();
483  Register SrcReg = I.getOperand(1).getReg();
484  LLT DstTy = MRI->getType(DstReg);
485  LLT SrcTy = MRI->getType(SrcReg);
486  const unsigned SrcSize = SrcTy.getSizeInBits();
487  unsigned DstSize = DstTy.getSizeInBits();
488 
489  // TODO: Should handle any multiple of 32 offset.
490  unsigned Offset = I.getOperand(2).getImm();
491  if (Offset % 32 != 0 || DstSize > 128)
492  return false;
493 
494  // 16-bit operations really use 32-bit registers.
495  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
496  if (DstSize == 16)
497  DstSize = 32;
498 
499  const TargetRegisterClass *DstRC =
500  TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
501  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
502  return false;
503 
504  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
505  const TargetRegisterClass *SrcRC =
506  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
507  if (!SrcRC)
508  return false;
509  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
510  DstSize / 32);
511  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
512  if (!SrcRC)
513  return false;
514 
515  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
516  *SrcRC, I.getOperand(1));
517  const DebugLoc &DL = I.getDebugLoc();
518  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
519  .addReg(SrcReg, 0, SubReg);
520 
521  I.eraseFromParent();
522  return true;
523 }
524 
525 bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &I) const {
526  assert(I.getOpcode() == AMDGPU::G_FMA || I.getOpcode() == AMDGPU::G_FMAD);
527 
528  // Try to manually select MAD_MIX/FMA_MIX.
529  Register Dst = I.getOperand(0).getReg();
530  LLT ResultTy = MRI->getType(Dst);
531  bool IsFMA = I.getOpcode() == AMDGPU::G_FMA;
532  if (ResultTy != LLT::scalar(32) ||
533  (IsFMA ? !Subtarget->hasFmaMixInsts() : !Subtarget->hasMadMixInsts()))
534  return false;
535 
536  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
537  // using the conversion from f16.
538  bool MatchedSrc0, MatchedSrc1, MatchedSrc2;
539  auto [Src0, Src0Mods] =
540  selectVOP3PMadMixModsImpl(I.getOperand(1), MatchedSrc0);
541  auto [Src1, Src1Mods] =
542  selectVOP3PMadMixModsImpl(I.getOperand(2), MatchedSrc1);
543  auto [Src2, Src2Mods] =
544  selectVOP3PMadMixModsImpl(I.getOperand(3), MatchedSrc2);
545 
546 #ifndef NDEBUG
547  const SIMachineFunctionInfo *MFI =
548  I.getMF()->getInfo<SIMachineFunctionInfo>();
549  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
550  assert((IsFMA || !Mode.allFP32Denormals()) &&
551  "fmad selected with denormals enabled");
552 #endif
553 
554  // TODO: We can select this with f32 denormals enabled if all the sources are
555  // converted from f16 (in which case fmad isn't legal).
556  if (!MatchedSrc0 && !MatchedSrc1 && !MatchedSrc2)
557  return false;
558 
559  const unsigned OpC = IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32;
560  MachineInstr *MixInst =
561  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpC), Dst)
562  .addImm(Src0Mods)
563  .addReg(copyToVGPRIfSrcFolded(Src0, Src0Mods, I.getOperand(1), &I))
564  .addImm(Src1Mods)
565  .addReg(copyToVGPRIfSrcFolded(Src1, Src1Mods, I.getOperand(2), &I))
566  .addImm(Src2Mods)
567  .addReg(copyToVGPRIfSrcFolded(Src2, Src2Mods, I.getOperand(3), &I))
568  .addImm(0)
569  .addImm(0)
570  .addImm(0);
571 
572  if (!constrainSelectedInstRegOperands(*MixInst, TII, TRI, RBI))
573  return false;
574 
575  I.eraseFromParent();
576  return true;
577 }
578 
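// Select G_MERGE_VALUES (and wide G_BUILD_VECTOR) as a REG_SEQUENCE over the
// 32-bit-or-wider source pieces.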
579 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
580  MachineBasicBlock *BB = MI.getParent();
581  Register DstReg = MI.getOperand(0).getReg();
582  LLT DstTy = MRI->getType(DstReg);
583  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
584 
585  const unsigned SrcSize = SrcTy.getSizeInBits();
586  if (SrcSize < 32)
587  return selectImpl(MI, *CoverageInfo);
588 
589  const DebugLoc &DL = MI.getDebugLoc();
590  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
591  const unsigned DstSize = DstTy.getSizeInBits();
592  const TargetRegisterClass *DstRC =
593  TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
594  if (!DstRC)
595  return false;
596 
597  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
598  MachineInstrBuilder MIB =
599  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
600  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
601  MachineOperand &Src = MI.getOperand(I + 1);
602  MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
603  MIB.addImm(SubRegs[I]);
604 
605  const TargetRegisterClass *SrcRC
606  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
607  if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
608  return false;
609  }
610 
611  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
612  return false;
613 
614  MI.eraseFromParent();
615  return true;
616 }
617 
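// Select G_UNMERGE_VALUES as a series of subregister copies out of the wide
// source register.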
618 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
619  MachineBasicBlock *BB = MI.getParent();
620  const int NumDst = MI.getNumOperands() - 1;
621 
622  MachineOperand &Src = MI.getOperand(NumDst);
623 
624  Register SrcReg = Src.getReg();
625  Register DstReg0 = MI.getOperand(0).getReg();
626  LLT DstTy = MRI->getType(DstReg0);
627  LLT SrcTy = MRI->getType(SrcReg);
628 
629  const unsigned DstSize = DstTy.getSizeInBits();
630  const unsigned SrcSize = SrcTy.getSizeInBits();
631  const DebugLoc &DL = MI.getDebugLoc();
632  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
633 
634  const TargetRegisterClass *SrcRC =
635  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
636  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
637  return false;
638 
639  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
640  // source, and this relies on the fact that the same subregister indices are
641  // used for both.
642  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
643  for (int I = 0, E = NumDst; I != E; ++I) {
644  MachineOperand &Dst = MI.getOperand(I);
645  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
646  .addReg(SrcReg, 0, SubRegs[I]);
647 
648  // Make sure the subregister index is valid for the source register.
649  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
650  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
651  return false;
652 
653  const TargetRegisterClass *DstRC =
654  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
655  if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
656  return false;
657  }
658 
659  MI.eraseFromParent();
660  return true;
661 }
662 
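// Select v2s16 G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Two constant operands
// fold into a single move; otherwise the halves are packed with an
// S_PACK_*_B32_B16 on SALU or an AND + V_LSHL_OR sequence on VALU.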
663 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
664  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
665  MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
666 
667  Register Src0 = MI.getOperand(1).getReg();
668  Register Src1 = MI.getOperand(2).getReg();
669  LLT SrcTy = MRI->getType(Src0);
670  const unsigned SrcSize = SrcTy.getSizeInBits();
671 
672  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
673  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
674  return selectG_MERGE_VALUES(MI);
675  }
676 
677  // Selection logic below is for V2S16 only.
678  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
679  Register Dst = MI.getOperand(0).getReg();
680  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
681  (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
682  SrcTy != LLT::scalar(32)))
683  return selectImpl(MI, *CoverageInfo);
684 
685  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
686  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
687  return false;
688 
689  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
690  DstBank->getID() == AMDGPU::VGPRRegBankID);
691  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
692 
693  const DebugLoc &DL = MI.getDebugLoc();
694  MachineBasicBlock *BB = MI.getParent();
695 
696  // First, before trying TableGen patterns, check if both sources are
697  // constants. In those cases, we can trivially compute the final constant
698  // and emit a simple move.
699  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
700  if (ConstSrc1) {
701  auto ConstSrc0 =
702  getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
703  if (ConstSrc0) {
704  const int64_t K0 = ConstSrc0->Value.getSExtValue();
705  const int64_t K1 = ConstSrc1->Value.getSExtValue();
706  uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
707  uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
708  uint32_t Imm = Lo16 | (Hi16 << 16);
709 
710  // VALU
711  if (IsVector) {
712  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
713  MI.eraseFromParent();
714  return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
715  }
716 
717  // SALU
718  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
719  MI.eraseFromParent();
720  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
721  }
722  }
723 
724  // Now try TableGen patterns.
725  if (selectImpl(MI, *CoverageInfo))
726  return true;
727 
728  // TODO: This should probably be a combine somewhere
729  // (build_vector $src0, undef) -> copy $src0
730  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
731  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
732  MI.setDesc(TII.get(AMDGPU::COPY));
733  MI.removeOperand(2);
734  const auto &RC =
735  IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
736  return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
737  RBI.constrainGenericRegister(Src0, RC, *MRI);
738  }
739 
740  // TODO: Can be improved?
741  if (IsVector) {
742  Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
743  auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
744  .addImm(0xFFFF)
745  .addReg(Src0);
746  if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
747  return false;
748 
749  MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
750  .addReg(Src1)
751  .addImm(16)
752  .addReg(TmpReg);
753  if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
754  return false;
755 
756  MI.eraseFromParent();
757  return true;
758  }
759 
760  Register ShiftSrc0;
761  Register ShiftSrc1;
762 
763  // With multiple uses of the shift, this will duplicate the shift and
764  // increase register pressure.
765  //
766  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
767  // => (S_PACK_HH_B32_B16 $src0, $src1)
768  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
769  // => (S_PACK_HL_B32_B16 $src0, $src1)
770  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
771  // => (S_PACK_LH_B32_B16 $src0, $src1)
772  // (build_vector $src0, $src1)
773  // => (S_PACK_LL_B32_B16 $src0, $src1)
774 
775  bool Shift0 = mi_match(
776  Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
777 
778  bool Shift1 = mi_match(
779  Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
780 
781  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
782  if (Shift0 && Shift1) {
783  Opc = AMDGPU::S_PACK_HH_B32_B16;
784  MI.getOperand(1).setReg(ShiftSrc0);
785  MI.getOperand(2).setReg(ShiftSrc1);
786  } else if (Shift1) {
787  Opc = AMDGPU::S_PACK_LH_B32_B16;
788  MI.getOperand(2).setReg(ShiftSrc1);
789  } else if (Shift0) {
790  auto ConstSrc1 =
791  getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
792  if (ConstSrc1 && ConstSrc1->Value == 0) {
793  // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
794  auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
795  .addReg(ShiftSrc0)
796  .addImm(16);
797 
798  MI.eraseFromParent();
799  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
800  }
801  if (STI.hasSPackHL()) {
802  Opc = AMDGPU::S_PACK_HL_B32_B16;
803  MI.getOperand(1).setReg(ShiftSrc0);
804  }
805  }
806 
807  MI.setDesc(TII.get(Opc));
808  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
809 }
810 
811 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
812  return selectG_ADD_SUB(I);
813 }
814 
815 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
816  const MachineOperand &MO = I.getOperand(0);
817 
818  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
819  // regbank check here is to know why getConstrainedRegClassForOperand failed.
820  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
821  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
822  (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
823  I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
824  return true;
825  }
826 
827  return false;
828 }
829 
830 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
831  MachineBasicBlock *BB = I.getParent();
832 
833  Register DstReg = I.getOperand(0).getReg();
834  Register Src0Reg = I.getOperand(1).getReg();
835  Register Src1Reg = I.getOperand(2).getReg();
836  LLT Src1Ty = MRI->getType(Src1Reg);
837 
838  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
839  unsigned InsSize = Src1Ty.getSizeInBits();
840 
841  int64_t Offset = I.getOperand(3).getImm();
842 
843  // FIXME: These cases should have been illegal and unnecessary to check here.
844  if (Offset % 32 != 0 || InsSize % 32 != 0)
845  return false;
846 
847  // Currently not handled by getSubRegFromChannel.
848  if (InsSize > 128)
849  return false;
850 
851  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
852  if (SubReg == AMDGPU::NoSubRegister)
853  return false;
854 
855  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
856  const TargetRegisterClass *DstRC =
857  TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
858  if (!DstRC)
859  return false;
860 
861  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
862  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
863  const TargetRegisterClass *Src0RC =
864  TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
865  const TargetRegisterClass *Src1RC =
866  TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
867 
868  // Deal with weird cases where the class only partially supports the subreg
869  // index.
870  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
871  if (!Src0RC || !Src1RC)
872  return false;
873 
874  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
875  !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
876  !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
877  return false;
878 
879  const DebugLoc &DL = I.getDebugLoc();
880  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
881  .addReg(Src0Reg)
882  .addReg(Src1Reg)
883  .addImm(SubReg);
884 
885  I.eraseFromParent();
886  return true;
887 }
888 
889 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
890  Register DstReg = MI.getOperand(0).getReg();
891  Register SrcReg = MI.getOperand(1).getReg();
892  Register OffsetReg = MI.getOperand(2).getReg();
893  Register WidthReg = MI.getOperand(3).getReg();
894 
895  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
896  "scalar BFX instructions are expanded in regbankselect");
897  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
898  "64-bit vector BFX instructions are expanded in regbankselect");
899 
900  const DebugLoc &DL = MI.getDebugLoc();
901  MachineBasicBlock *MBB = MI.getParent();
902 
903  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
904  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
905  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
906  .addReg(SrcReg)
907  .addReg(OffsetReg)
908  .addReg(WidthReg);
909  MI.eraseFromParent();
910  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
911 }
912 
913 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
914  if (STI.getLDSBankCount() != 16)
915  return selectImpl(MI, *CoverageInfo);
916 
917  Register Dst = MI.getOperand(0).getReg();
918  Register Src0 = MI.getOperand(2).getReg();
919  Register M0Val = MI.getOperand(6).getReg();
920  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
921  !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
922  !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
923  return false;
924 
925  // This requires 2 instructions. It is possible to write a pattern to support
926  // this, but the generated isel emitter doesn't correctly deal with multiple
927  // output instructions using the same physical register input. The copy to m0
928  // is incorrectly placed before the second instruction.
929  //
930  // TODO: Match source modifiers.
931 
932  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
933  const DebugLoc &DL = MI.getDebugLoc();
934  MachineBasicBlock *MBB = MI.getParent();
935 
936  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
937  .addReg(M0Val);
938  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
939  .addImm(2)
940  .addImm(MI.getOperand(4).getImm()) // $attr
941  .addImm(MI.getOperand(3).getImm()); // $attrchan
942 
943  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
944  .addImm(0) // $src0_modifiers
945  .addReg(Src0) // $src0
946  .addImm(MI.getOperand(4).getImm()) // $attr
947  .addImm(MI.getOperand(3).getImm()) // $attrchan
948  .addImm(0) // $src2_modifiers
949  .addReg(InterpMov) // $src2 - 2 f16 values selected by high
950  .addImm(MI.getOperand(5).getImm()) // $high
951  .addImm(0) // $clamp
952  .addImm(0); // $omod
953 
954  MI.eraseFromParent();
955  return true;
956 }
957 
958 // Writelane is special in that it can use SGPR and M0 (which would normally
959 // count as using the constant bus twice - but in this case it is allowed since
960 // the lane selector doesn't count as a use of the constant bus). However, it is
961 // still required to abide by the 1 SGPR rule. Fix this up if we might have
962 // multiple SGPRs.
963 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
964  // With a constant bus limit of at least 2, there's no issue.
965  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
966  return selectImpl(MI, *CoverageInfo);
967 
968  MachineBasicBlock *MBB = MI.getParent();
969  const DebugLoc &DL = MI.getDebugLoc();
970  Register VDst = MI.getOperand(0).getReg();
971  Register Val = MI.getOperand(2).getReg();
972  Register LaneSelect = MI.getOperand(3).getReg();
973  Register VDstIn = MI.getOperand(4).getReg();
974 
975  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
976 
977  Optional<ValueAndVReg> ConstSelect =
978  getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
979  if (ConstSelect) {
980  // The selector has to be an inline immediate, so we can use whatever for
981  // the other operands.
982  MIB.addReg(Val);
983  MIB.addImm(ConstSelect->Value.getSExtValue() &
984  maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
985  } else {
986  Optional<ValueAndVReg> ConstVal =
987       getIConstantVRegValWithLookThrough(Val, *MRI);
988 
989  // If the value written is an inline immediate, we can get away without a
990  // copy to m0.
991  if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
992  STI.hasInv2PiInlineImm())) {
993  MIB.addImm(ConstVal->Value.getSExtValue());
994  MIB.addReg(LaneSelect);
995  } else {
996  MIB.addReg(Val);
997 
998  // If the lane selector was originally in a VGPR and copied with
999  // readfirstlane, there's a hazard to read the same SGPR from the
1000  // VALU. Constrain to a different SGPR to help avoid needing a nop later.
1001  RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1002 
1003  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1004  .addReg(LaneSelect);
1005  MIB.addReg(AMDGPU::M0);
1006  }
1007  }
1008 
1009  MIB.addReg(VDstIn);
1010 
1011  MI.eraseFromParent();
1012  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1013 }
1014 
1015 // We need to handle this here because tablegen doesn't support matching
1016 // instructions with multiple outputs.
1017 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
1018  Register Dst0 = MI.getOperand(0).getReg();
1019  Register Dst1 = MI.getOperand(1).getReg();
1020 
1021  LLT Ty = MRI->getType(Dst0);
1022  unsigned Opc;
1023  if (Ty == LLT::scalar(32))
1024  Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1025  else if (Ty == LLT::scalar(64))
1026  Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1027  else
1028  return false;
1029 
1030  // TODO: Match source modifiers.
1031 
1032  const DebugLoc &DL = MI.getDebugLoc();
1033  MachineBasicBlock *MBB = MI.getParent();
1034 
1035  Register Numer = MI.getOperand(3).getReg();
1036  Register Denom = MI.getOperand(4).getReg();
1037  unsigned ChooseDenom = MI.getOperand(5).getImm();
1038 
1039  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1040 
1041  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1042  .addDef(Dst1)
1043  .addImm(0) // $src0_modifiers
1044  .addUse(Src0) // $src0
1045  .addImm(0) // $src1_modifiers
1046  .addUse(Denom) // $src1
1047  .addImm(0) // $src2_modifiers
1048  .addUse(Numer) // $src2
1049  .addImm(0) // $clamp
1050  .addImm(0); // $omod
1051 
1052  MI.eraseFromParent();
1053  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1054 }
1055 
1056 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1057  unsigned IntrinsicID = I.getIntrinsicID();
1058  switch (IntrinsicID) {
1059  case Intrinsic::amdgcn_if_break: {
1060  MachineBasicBlock *BB = I.getParent();
1061 
1062  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1063  // SelectionDAG uses for wave32 vs wave64.
1064  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1065  .add(I.getOperand(0))
1066  .add(I.getOperand(2))
1067  .add(I.getOperand(3));
1068 
1069  Register DstReg = I.getOperand(0).getReg();
1070  Register Src0Reg = I.getOperand(2).getReg();
1071  Register Src1Reg = I.getOperand(3).getReg();
1072 
1073  I.eraseFromParent();
1074 
1075  for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1076     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1077 
1078  return true;
1079  }
1080  case Intrinsic::amdgcn_interp_p1_f16:
1081  return selectInterpP1F16(I);
1082  case Intrinsic::amdgcn_wqm:
1083  return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1084  case Intrinsic::amdgcn_softwqm:
1085  return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1086  case Intrinsic::amdgcn_strict_wwm:
1087  case Intrinsic::amdgcn_wwm:
1088  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1089  case Intrinsic::amdgcn_strict_wqm:
1090  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1091  case Intrinsic::amdgcn_writelane:
1092  return selectWritelane(I);
1093  case Intrinsic::amdgcn_div_scale:
1094  return selectDivScale(I);
1095  case Intrinsic::amdgcn_icmp:
1096  case Intrinsic::amdgcn_fcmp:
1097  if (selectImpl(I, *CoverageInfo))
1098  return true;
1099  return selectIntrinsicCmp(I);
1100  case Intrinsic::amdgcn_ballot:
1101  return selectBallot(I);
1102  case Intrinsic::amdgcn_reloc_constant:
1103  return selectRelocConstant(I);
1104  case Intrinsic::amdgcn_groupstaticsize:
1105  return selectGroupStaticSize(I);
1106  case Intrinsic::returnaddress:
1107  return selectReturnAddress(I);
1108  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1109  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1110  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1111  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1112  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1113  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1114  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1115  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1116  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1117  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1118  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1119  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1120  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1121  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1122  return selectSMFMACIntrin(I);
1123  default:
1124  return selectImpl(I, *CoverageInfo);
1125  }
1126 }
1127 
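// Map a compare predicate and operand size (16/32/64 bits) to the matching
// VALU V_CMP opcode, preferring the true16 variants when available.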
1128 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1129  const GCNSubtarget &ST) {
1130  if (Size != 16 && Size != 32 && Size != 64)
1131  return -1;
1132 
1133  if (Size == 16 && !ST.has16BitInsts())
1134  return -1;
1135 
1136  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1137  unsigned S64Opc) {
1138  if (Size == 16)
1139  return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1140  if (Size == 32)
1141  return S32Opc;
1142  return S64Opc;
1143  };
1144 
1145  switch (P) {
1146  default:
1147  llvm_unreachable("Unknown condition code!");
1148  case CmpInst::ICMP_NE:
1149  return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1150  AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1151  case CmpInst::ICMP_EQ:
1152  return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1153  AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1154  case CmpInst::ICMP_SGT:
1155  return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1156  AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1157  case CmpInst::ICMP_SGE:
1158  return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1159  AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1160  case CmpInst::ICMP_SLT:
1161  return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1162  AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1163  case CmpInst::ICMP_SLE:
1164  return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1165  AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1166  case CmpInst::ICMP_UGT:
1167  return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1168  AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1169  case CmpInst::ICMP_UGE:
1170  return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1171  AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1172  case CmpInst::ICMP_ULT:
1173  return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1174  AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1175  case CmpInst::ICMP_ULE:
1176  return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1177  AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1178 
1179  case CmpInst::FCMP_OEQ:
1180  return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1181  AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1182  case CmpInst::FCMP_OGT:
1183  return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1184  AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1185  case CmpInst::FCMP_OGE:
1186  return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1187  AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1188  case CmpInst::FCMP_OLT:
1189  return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1190  AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1191  case CmpInst::FCMP_OLE:
1192  return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1193  AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1194  case CmpInst::FCMP_ONE:
1195  return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1196  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1197  case CmpInst::FCMP_ORD:
1198  return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1199  AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1200  case CmpInst::FCMP_UNO:
1201  return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1202  AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1203  case CmpInst::FCMP_UEQ:
1204  return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1205  AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1206  case CmpInst::FCMP_UGT:
1207  return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1208  AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1209  case CmpInst::FCMP_UGE:
1210  return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1211  AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1212  case CmpInst::FCMP_ULT:
1213  return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1214  AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1215  case CmpInst::FCMP_ULE:
1216  return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1217  AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1218  case CmpInst::FCMP_UNE:
1219  return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1220  AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1221  case CmpInst::FCMP_TRUE:
1222  return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1223  AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1224  case CmpInst::FCMP_FALSE:
1225  return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1226  AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1227  }
1228 }
1229 
1230 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1231  unsigned Size) const {
1232  if (Size == 64) {
1233  if (!STI.hasScalarCompareEq64())
1234  return -1;
1235 
1236  switch (P) {
1237  case CmpInst::ICMP_NE:
1238  return AMDGPU::S_CMP_LG_U64;
1239  case CmpInst::ICMP_EQ:
1240  return AMDGPU::S_CMP_EQ_U64;
1241  default:
1242  return -1;
1243  }
1244  }
1245 
1246  if (Size != 32)
1247  return -1;
1248 
1249  switch (P) {
1250  case CmpInst::ICMP_NE:
1251  return AMDGPU::S_CMP_LG_U32;
1252  case CmpInst::ICMP_EQ:
1253  return AMDGPU::S_CMP_EQ_U32;
1254  case CmpInst::ICMP_SGT:
1255  return AMDGPU::S_CMP_GT_I32;
1256  case CmpInst::ICMP_SGE:
1257  return AMDGPU::S_CMP_GE_I32;
1258  case CmpInst::ICMP_SLT:
1259  return AMDGPU::S_CMP_LT_I32;
1260  case CmpInst::ICMP_SLE:
1261  return AMDGPU::S_CMP_LE_I32;
1262  case CmpInst::ICMP_UGT:
1263  return AMDGPU::S_CMP_GT_U32;
1264  case CmpInst::ICMP_UGE:
1265  return AMDGPU::S_CMP_GE_U32;
1266  case CmpInst::ICMP_ULT:
1267  return AMDGPU::S_CMP_LT_U32;
1268  case CmpInst::ICMP_ULE:
1269  return AMDGPU::S_CMP_LE_U32;
1270  default:
1271  llvm_unreachable("Unknown condition code!");
1272  }
1273 }
1274 
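// Select G_ICMP either to an S_CMP whose SCC result is copied into a 32-bit
// SGPR, or to a V_CMP producing a wave-size mask.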
1275 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1276  MachineBasicBlock *BB = I.getParent();
1277  const DebugLoc &DL = I.getDebugLoc();
1278 
1279  Register SrcReg = I.getOperand(2).getReg();
1280  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1281 
1282  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1283 
1284  Register CCReg = I.getOperand(0).getReg();
1285  if (!isVCC(CCReg, *MRI)) {
1286  int Opcode = getS_CMPOpcode(Pred, Size);
1287  if (Opcode == -1)
1288  return false;
1289  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1290  .add(I.getOperand(2))
1291  .add(I.getOperand(3));
1292  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1293  .addReg(AMDGPU::SCC);
1294  bool Ret =
1295  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1296  RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1297  I.eraseFromParent();
1298  return Ret;
1299  }
1300 
1301  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1302  if (Opcode == -1)
1303  return false;
1304 
1305  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1306  I.getOperand(0).getReg())
1307  .add(I.getOperand(2))
1308  .add(I.getOperand(3));
1309   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1310                                *TRI.getBoolRC(), *MRI);
1311  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1312  I.eraseFromParent();
1313  return Ret;
1314 }
1315 
1316 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1317  Register Dst = I.getOperand(0).getReg();
1318  if (isVCC(Dst, *MRI))
1319  return false;
1320 
1321  LLT DstTy = MRI->getType(Dst);
1322  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1323  return false;
1324 
1325  MachineBasicBlock *BB = I.getParent();
1326  const DebugLoc &DL = I.getDebugLoc();
1327  Register SrcReg = I.getOperand(2).getReg();
1328  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1329 
1330  // i1 inputs are not supported in GlobalISel.
1331  if (Size == 1)
1332  return false;
1333 
1334  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1335  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1336  BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1337  I.eraseFromParent();
1338  return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1339  }
1340 
1341  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1342  if (Opcode == -1)
1343  return false;
1344 
1345  MachineInstr *SelectedMI;
1346  if (CmpInst::isFPPredicate(Pred)) {
1347  MachineOperand &LHS = I.getOperand(2);
1348  MachineOperand &RHS = I.getOperand(3);
1349  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1350  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1351  Register Src0Reg =
1352  copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1353  Register Src1Reg =
1354  copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1355  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1356  .addImm(Src0Mods)
1357  .addReg(Src0Reg)
1358  .addImm(Src1Mods)
1359  .addReg(Src1Reg)
1360  .addImm(0); // clamp
1361  } else {
1362  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1363  .add(I.getOperand(2))
1364  .add(I.getOperand(3));
1365  }
1366 
1367  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1368  if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1369  return false;
1370 
1371  I.eraseFromParent();
1372  return true;
1373 }
1374 
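// Select llvm.amdgcn.ballot. A constant argument folds to 0 or a copy of
// EXEC; otherwise the already-computed wave mask is copied to the result.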
1375 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1376  MachineBasicBlock *BB = I.getParent();
1377  const DebugLoc &DL = I.getDebugLoc();
1378  Register DstReg = I.getOperand(0).getReg();
1379  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1380  const bool Is64 = Size == 64;
1381 
1382  if (Size != STI.getWavefrontSize())
1383  return false;
1384 
1385   Optional<ValueAndVReg> Arg =
1386       getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1387 
1388  if (Arg) {
1389  const int64_t Value = Arg.value().Value.getSExtValue();
1390  if (Value == 0) {
1391  unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1392  BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1393  } else if (Value == -1) { // all ones
1394  Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1395  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1396  } else
1397  return false;
1398  } else {
1399  Register SrcReg = I.getOperand(2).getReg();
1400  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1401  }
1402 
1403  I.eraseFromParent();
1404  return true;
1405 }
1406 
1407 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1408  Register DstReg = I.getOperand(0).getReg();
1409  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1410  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1411  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1412  return false;
1413 
1414  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1415 
1416  Module *M = MF->getFunction().getParent();
1417  const MDNode *Metadata = I.getOperand(2).getMetadata();
1418  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1419  auto RelocSymbol = cast<GlobalVariable>(
1420  M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1421 
1422  MachineBasicBlock *BB = I.getParent();
1423  BuildMI(*BB, &I, I.getDebugLoc(),
1424  TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1425  .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1426 
1427  I.eraseFromParent();
1428  return true;
1429 }
1430 
1431 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1432   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1433 
1434  Register DstReg = I.getOperand(0).getReg();
1435  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1436  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1437  AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1438 
1439  MachineBasicBlock *MBB = I.getParent();
1440  const DebugLoc &DL = I.getDebugLoc();
1441 
1442  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1443 
1444  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1445     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1446     MIB.addImm(MFI->getLDSSize());
1447  } else {
1448  Module *M = MF->getFunction().getParent();
1449  const GlobalValue *GV
1450  = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1451     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1452   }
1453 
1454  I.eraseFromParent();
1455  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1456 }
1457 
1458 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1459  MachineBasicBlock *MBB = I.getParent();
1460   MachineFunction &MF = *MBB->getParent();
1461   const DebugLoc &DL = I.getDebugLoc();
1462 
1463  MachineOperand &Dst = I.getOperand(0);
1464  Register DstReg = Dst.getReg();
1465  unsigned Depth = I.getOperand(2).getImm();
1466 
1467  const TargetRegisterClass *RC
1468  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1469  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1470  !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1471  return false;
1472 
1473  // Check for kernel and shader functions
1474  if (Depth != 0 ||
1475       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1476     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1477  .addImm(0);
1478  I.eraseFromParent();
1479  return true;
1480  }
1481 
1482  MachineFrameInfo &MFI = MF.getFrameInfo();
1483  // There is a call to @llvm.returnaddress in this function
1484  MFI.setReturnAddressIsTaken(true);
1485 
1486  // Get the return address reg and mark it as an implicit live-in
1487  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1488  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1489  AMDGPU::SReg_64RegClass, DL);
1490  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1491  .addReg(LiveIn);
1492  I.eraseFromParent();
1493  return true;
1494 }
1495 
1496 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1497  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1498  // SelectionDAG uses for wave32 vs wave64.
1499  MachineBasicBlock *BB = MI.getParent();
1500  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1501  .add(MI.getOperand(1));
1502 
1503  Register Reg = MI.getOperand(1).getReg();
1504  MI.eraseFromParent();
1505 
1506  if (!MRI->getRegClassOrNull(Reg))
1507     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1508   return true;
1509 }
1510 
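// Select llvm.amdgcn.ds.ordered.add/swap: pack the index, wave_release,
// wave_done and shader-type fields into the DS_ORDERED_COUNT offset and copy
// the M0 value operand into m0.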
1511 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1512  MachineInstr &MI, Intrinsic::ID IntrID) const {
1513  MachineBasicBlock *MBB = MI.getParent();
1514   MachineFunction *MF = MBB->getParent();
1515   const DebugLoc &DL = MI.getDebugLoc();
1516 
1517  unsigned IndexOperand = MI.getOperand(7).getImm();
1518  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1519  bool WaveDone = MI.getOperand(9).getImm() != 0;
1520 
1521  if (WaveDone && !WaveRelease)
1522  report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1523 
1524  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1525  IndexOperand &= ~0x3f;
1526  unsigned CountDw = 0;
1527 
1528  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1529  CountDw = (IndexOperand >> 24) & 0xf;
1530  IndexOperand &= ~(0xf << 24);
1531 
1532  if (CountDw < 1 || CountDw > 4) {
1533       report_fatal_error(
1534         "ds_ordered_count: dword count must be between 1 and 4");
1535  }
1536  }
1537 
1538  if (IndexOperand)
1539  report_fatal_error("ds_ordered_count: bad index operand");
1540 
1541  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1542  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1543 
1544  unsigned Offset0 = OrderedCountIndex << 2;
1545  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1546 
1546 
1547   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1548     Offset1 |= (CountDw - 1) << 6;
1549 
1550   if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1551     Offset1 |= ShaderType << 2;
1552 
1553  unsigned Offset = Offset0 | (Offset1 << 8);
1554 
1555  Register M0Val = MI.getOperand(2).getReg();
1556  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1557  .addReg(M0Val);
1558 
1559  Register DstReg = MI.getOperand(0).getReg();
1560  Register ValReg = MI.getOperand(3).getReg();
1561  auto DS =
1562  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1563  .addReg(ValReg)
1564  .addImm(Offset)
1565  .cloneMemRefs(MI);
1566 
1567  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1568  return false;
1569 
1570  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1571  MI.eraseFromParent();
1572  return Ret;
1573 }
1574 
1575 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1576  switch (IntrID) {
1577  case Intrinsic::amdgcn_ds_gws_init:
1578  return AMDGPU::DS_GWS_INIT;
1579  case Intrinsic::amdgcn_ds_gws_barrier:
1580  return AMDGPU::DS_GWS_BARRIER;
1581  case Intrinsic::amdgcn_ds_gws_sema_v:
1582  return AMDGPU::DS_GWS_SEMA_V;
1583  case Intrinsic::amdgcn_ds_gws_sema_br:
1584  return AMDGPU::DS_GWS_SEMA_BR;
1585  case Intrinsic::amdgcn_ds_gws_sema_p:
1586  return AMDGPU::DS_GWS_SEMA_P;
1587  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1588  return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1589  default:
1590  llvm_unreachable("not a gws intrinsic");
1591  }
1592 }
1593 
1594 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1595  Intrinsic::ID IID) const {
1596  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1597  !STI.hasGWSSemaReleaseAll())
1598  return false;
1599 
1600  // intrinsic ID, vsrc, offset
1601  const bool HasVSrc = MI.getNumOperands() == 3;
1602  assert(HasVSrc || MI.getNumOperands() == 2);
1603 
1604  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1605  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1606  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1607  return false;
1608 
1609  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1610  unsigned ImmOffset;
1611 
1612  MachineBasicBlock *MBB = MI.getParent();
1613  const DebugLoc &DL = MI.getDebugLoc();
1614 
1615  MachineInstr *Readfirstlane = nullptr;
1616 
1617  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1618  // incoming offset, in case there's an add of a constant. We'll have to put it
1619  // back later.
1620  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1621  Readfirstlane = OffsetDef;
1622  BaseOffset = OffsetDef->getOperand(1).getReg();
1623  OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1624  }
1625 
1626  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1627  // If we have a constant offset, try to use the 0 in m0 as the base.
1628  // TODO: Look into changing the default m0 initialization value. If the
1629  // default -1 only sets the low 16 bits, we could leave it as-is and add 1 to
1630  // the immediate offset.
1631 
1632  ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1633  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1634  .addImm(0);
1635  } else {
1636  std::tie(BaseOffset, ImmOffset) =
1637  AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KnownBits);
1638 
1639  if (Readfirstlane) {
1640  // We have the constant offset now, so put the readfirstlane back on the
1641  // variable component.
1642  if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1643  return false;
1644 
1645  Readfirstlane->getOperand(1).setReg(BaseOffset);
1646  BaseOffset = Readfirstlane->getOperand(0).getReg();
1647  } else {
1648  if (!RBI.constrainGenericRegister(BaseOffset,
1649  AMDGPU::SReg_32RegClass, *MRI))
1650  return false;
1651  }
1652 
1653  Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1654  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1655  .addReg(BaseOffset)
1656  .addImm(16);
1657 
1658  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1659  .addReg(M0Base);
1660  }
1661 
1662  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1663  // offset field) % 64. Some versions of the programming guide omit the m0
1664  // part, or claim it's from offset 0.
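  // For example (illustrative): if the variable resource index is 2 and the
  // constant part is 3, M0[21:16] holds 2 and the instruction's offset field
  // holds 3, so the hardware selects resource (base + 5) % 64.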
1665  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1666 
1667  if (HasVSrc) {
1668  Register VSrc = MI.getOperand(1).getReg();
1669  MIB.addReg(VSrc);
1670 
1671  if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1672  return false;
1673  }
1674 
1675  MIB.addImm(ImmOffset)
1676  .cloneMemRefs(MI);
1677 
1678  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1679 
1680  MI.eraseFromParent();
1681  return true;
1682 }
1683 
1684 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1685  bool IsAppend) const {
1686  Register PtrBase = MI.getOperand(2).getReg();
1687  LLT PtrTy = MRI->getType(PtrBase);
1688  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1689 
1690  unsigned Offset;
1691  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1692 
1693  // TODO: Should this try to look through readfirstlane like GWS?
1694  if (!isDSOffsetLegal(PtrBase, Offset)) {
1695  PtrBase = MI.getOperand(2).getReg();
1696  Offset = 0;
1697  }
1698 
1699  MachineBasicBlock *MBB = MI.getParent();
1700  const DebugLoc &DL = MI.getDebugLoc();
1701  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1702 
1703  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1704  .addReg(PtrBase);
1705  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1706  return false;
1707 
1708  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1709  .addImm(Offset)
1710  .addImm(IsGDS ? -1 : 0)
1711  .cloneMemRefs(MI);
1712  MI.eraseFromParent();
1713  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1714 }
1715 
1716 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1717  if (TM.getOptLevel() > CodeGenOpt::None) {
1718  unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1719  if (WGSize <= STI.getWavefrontSize()) {
1720  MachineBasicBlock *MBB = MI.getParent();
1721  const DebugLoc &DL = MI.getDebugLoc();
1722  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1723  MI.eraseFromParent();
1724  return true;
1725  }
1726  }
1727  return selectImpl(MI, *CoverageInfo);
1728 }
1729 
1730 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1731  bool &IsTexFail) {
1732  if (TexFailCtrl)
1733  IsTexFail = true;
1734 
1735  TFE = (TexFailCtrl & 0x1) ? true : false;
1736  TexFailCtrl &= ~(uint64_t)0x1;
1737  LWE = (TexFailCtrl & 0x2) ? true : false;
1738  TexFailCtrl &= ~(uint64_t)0x2;
1739 
1740  return TexFailCtrl == 0;
1741 }
1742 
1743 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1744  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1745  MachineBasicBlock *MBB = MI.getParent();
1746  const DebugLoc &DL = MI.getDebugLoc();
1747 
1748  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1749  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1750 
1751  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1752  unsigned IntrOpcode = Intr->BaseOpcode;
1753  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1754  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1755 
1756  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1757 
1758  Register VDataIn, VDataOut;
1759  LLT VDataTy;
1760  int NumVDataDwords = -1;
1761  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1762  MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1763 
1764  bool Unorm;
1765  if (!BaseOpcode->Sampler)
1766  Unorm = true;
1767  else
1768  Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1769 
1770  bool TFE;
1771  bool LWE;
1772  bool IsTexFail = false;
1773  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1774  TFE, LWE, IsTexFail))
1775  return false;
1776 
1777  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1778  const bool IsA16 = (Flags & 1) != 0;
1779  const bool IsG16 = (Flags & 2) != 0;
1780 
1781  // A16 implies 16 bit gradients if subtarget doesn't support G16
1782  if (IsA16 && !STI.hasG16() && !IsG16)
1783  return false;
1784 
1785  unsigned DMask = 0;
1786  unsigned DMaskLanes = 0;
1787 
1788  if (BaseOpcode->Atomic) {
1789  VDataOut = MI.getOperand(0).getReg();
1790  VDataIn = MI.getOperand(2).getReg();
1791  LLT Ty = MRI->getType(VDataIn);
1792 
1793  // Be careful to allow atomic swap on 16-bit element vectors.
1794  const bool Is64Bit = BaseOpcode->AtomicX2 ?
1795  Ty.getSizeInBits() == 128 :
1796  Ty.getSizeInBits() == 64;
1797 
1798  if (BaseOpcode->AtomicX2) {
1799  assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1800 
1801  DMask = Is64Bit ? 0xf : 0x3;
1802  NumVDataDwords = Is64Bit ? 4 : 2;
1803  } else {
1804  DMask = Is64Bit ? 0x3 : 0x1;
1805  NumVDataDwords = Is64Bit ? 2 : 1;
1806  }
1807  } else {
1808  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1809  DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1810 
1811  if (BaseOpcode->Store) {
1812  VDataIn = MI.getOperand(1).getReg();
1813  VDataTy = MRI->getType(VDataIn);
1814  NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1815  } else {
1816  VDataOut = MI.getOperand(0).getReg();
1817  VDataTy = MRI->getType(VDataOut);
1818  NumVDataDwords = DMaskLanes;
1819 
1820  if (IsD16 && !STI.hasUnpackedD16VMem())
1821  NumVDataDwords = (DMaskLanes + 1) / 2;
1822  }
1823  }
1824 
1825  // Set G16 opcode
1826  if (IsG16 && !IsA16) {
1827  const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1828  AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1829  assert(G16MappingInfo);
1830  IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1831  }
1832 
1833  // TODO: Check this in verifier.
1834  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1835 
1836  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1837  if (BaseOpcode->Atomic)
1838  CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1839  if (CPol & ~AMDGPU::CPol::ALL)
1840  return false;
1841 
1842  int NumVAddrRegs = 0;
1843  int NumVAddrDwords = 0;
1844  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1845  // Skip the $noregs and 0s inserted during legalization.
1846  MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1847  if (!AddrOp.isReg())
1848  continue; // XXX - Break?
1849 
1850  Register Addr = AddrOp.getReg();
1851  if (!Addr)
1852  break;
1853 
1854  ++NumVAddrRegs;
1855  NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1856  }
1857 
1858  // The legalizer preprocessed the intrinsic arguments. If we aren't using
1859  // NSA, these should have been packed into a single value in the first
1860  // address register
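  // E.g. (illustrative) three separate 32-bit address registers give
  // NumVAddrRegs == NumVAddrDwords == 3 and select the NSA encoding (when the
  // subtarget supports it), while a single packed register holding all the
  // address dwords gives NumVAddrRegs == 1 and the non-NSA form.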
1861  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1862  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1863  LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1864  return false;
1865  }
1866 
1867  if (IsTexFail)
1868  ++NumVDataDwords;
1869 
1870  int Opcode = -1;
1871  if (IsGFX11Plus) {
1872  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1873  UseNSA ? AMDGPU::MIMGEncGfx11NSA
1874  : AMDGPU::MIMGEncGfx11Default,
1875  NumVDataDwords, NumVAddrDwords);
1876  } else if (IsGFX10Plus) {
1877  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1878  UseNSA ? AMDGPU::MIMGEncGfx10NSA
1879  : AMDGPU::MIMGEncGfx10Default,
1880  NumVDataDwords, NumVAddrDwords);
1881  } else {
1882  if (Subtarget->hasGFX90AInsts()) {
1883  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1884  NumVDataDwords, NumVAddrDwords);
1885  if (Opcode == -1) {
1886  LLVM_DEBUG(
1887  dbgs()
1888  << "requested image instruction is not supported on this GPU\n");
1889  return false;
1890  }
1891  }
1892  if (Opcode == -1 &&
1893  STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1894  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1895  NumVDataDwords, NumVAddrDwords);
1896  if (Opcode == -1)
1897  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1898  NumVDataDwords, NumVAddrDwords);
1899  }
1900  assert(Opcode != -1);
1901 
1902  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1903  .cloneMemRefs(MI);
1904 
1905  if (VDataOut) {
1906  if (BaseOpcode->AtomicX2) {
1907  const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1908 
1909  Register TmpReg = MRI->createVirtualRegister(
1910  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1911  unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1912 
1913  MIB.addDef(TmpReg);
1914  if (!MRI->use_empty(VDataOut)) {
1915  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1916  .addReg(TmpReg, RegState::Kill, SubReg);
1917  }
1918 
1919  } else {
1920  MIB.addDef(VDataOut); // vdata output
1921  }
1922  }
1923 
1924  if (VDataIn)
1925  MIB.addReg(VDataIn); // vdata input
1926 
1927  for (int I = 0; I != NumVAddrRegs; ++I) {
1928  MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1929  if (SrcOp.isReg()) {
1930  assert(SrcOp.getReg() != 0);
1931  MIB.addReg(SrcOp.getReg());
1932  }
1933  }
1934 
1935  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1936  if (BaseOpcode->Sampler)
1937  MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1938 
1939  MIB.addImm(DMask); // dmask
1940 
1941  if (IsGFX10Plus)
1942  MIB.addImm(DimInfo->Encoding);
1943  MIB.addImm(Unorm);
1944 
1945  MIB.addImm(CPol);
1946  MIB.addImm(IsA16 && // a16 or r128
1947  STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1948  if (IsGFX10Plus)
1949  MIB.addImm(IsA16 ? -1 : 0);
1950 
1951  if (!Subtarget->hasGFX90AInsts()) {
1952  MIB.addImm(TFE); // tfe
1953  } else if (TFE) {
1954  LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
1955  return false;
1956  }
1957 
1958  MIB.addImm(LWE); // lwe
1959  if (!IsGFX10Plus)
1960  MIB.addImm(DimInfo->DA ? -1 : 0);
1961  if (BaseOpcode->HasD16)
1962  MIB.addImm(IsD16 ? -1 : 0);
1963 
1964  if (IsTexFail) {
1965  // An image load instruction with TFE/LWE only conditionally writes to its
1966  // result registers. Initialize them to zero so that we always get well
1967  // defined result values.
1968  assert(VDataOut && !VDataIn);
1969  Register Tied = MRI->cloneVirtualRegister(VDataOut);
1970  Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1971  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1972  .addImm(0);
1973  auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1974  if (STI.usePRTStrictNull()) {
1975  // With enable-prt-strict-null enabled, initialize all result registers to
1976  // zero.
1977  auto RegSeq =
1978  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1979  for (auto Sub : Parts)
1980  RegSeq.addReg(Zero).addImm(Sub);
1981  } else {
1982  // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1983  // result register.
1984  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1985  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1986  auto RegSeq =
1987  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1988  for (auto Sub : Parts.drop_back(1))
1989  RegSeq.addReg(Undef).addImm(Sub);
1990  RegSeq.addReg(Zero).addImm(Parts.back());
1991  }
1992  MIB.addReg(Tied, RegState::Implicit);
1993  MIB->tieOperands(0, MIB->getNumOperands() - 1);
1994  }
1995 
1996  MI.eraseFromParent();
1997  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1998  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
1999  return true;
2000 }
2001 
2002 // We need to handle this here because tablegen doesn't support matching
2003 // instructions with multiple outputs.
2004 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2005  MachineInstr &MI) const {
2006  Register Dst0 = MI.getOperand(0).getReg();
2007  Register Dst1 = MI.getOperand(1).getReg();
2008 
2009  const DebugLoc &DL = MI.getDebugLoc();
2010  MachineBasicBlock *MBB = MI.getParent();
2011 
2012  Register Addr = MI.getOperand(3).getReg();
2013  Register Data0 = MI.getOperand(4).getReg();
2014  Register Data1 = MI.getOperand(5).getReg();
2015  unsigned Offset = MI.getOperand(6).getImm();
2016 
2017  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2018  .addDef(Dst1)
2019  .addUse(Addr)
2020  .addUse(Data0)
2021  .addUse(Data1)
2022  .addImm(Offset)
2023  .cloneMemRefs(MI);
2024 
2025  MI.eraseFromParent();
2026  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2027 }
2028 
2029 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2030  MachineInstr &I) const {
2031  unsigned IntrinsicID = I.getIntrinsicID();
2032  switch (IntrinsicID) {
2033  case Intrinsic::amdgcn_end_cf:
2034  return selectEndCfIntrinsic(I);
2035  case Intrinsic::amdgcn_ds_ordered_add:
2036  case Intrinsic::amdgcn_ds_ordered_swap:
2037  return selectDSOrderedIntrinsic(I, IntrinsicID);
2038  case Intrinsic::amdgcn_ds_gws_init:
2039  case Intrinsic::amdgcn_ds_gws_barrier:
2040  case Intrinsic::amdgcn_ds_gws_sema_v:
2041  case Intrinsic::amdgcn_ds_gws_sema_br:
2042  case Intrinsic::amdgcn_ds_gws_sema_p:
2043  case Intrinsic::amdgcn_ds_gws_sema_release_all:
2044  return selectDSGWSIntrinsic(I, IntrinsicID);
2045  case Intrinsic::amdgcn_ds_append:
2046  return selectDSAppendConsume(I, true);
2047  case Intrinsic::amdgcn_ds_consume:
2048  return selectDSAppendConsume(I, false);
2049  case Intrinsic::amdgcn_s_barrier:
2050  return selectSBarrier(I);
2051  case Intrinsic::amdgcn_raw_buffer_load_lds:
2052  case Intrinsic::amdgcn_struct_buffer_load_lds:
2053  return selectBufferLoadLds(I);
2054  case Intrinsic::amdgcn_global_load_lds:
2055  return selectGlobalLoadLds(I);
2056  case Intrinsic::amdgcn_exp_compr:
2057  if (!STI.hasCompressedExport()) {
2058  Function &F = I.getMF()->getFunction();
2059  DiagnosticInfoUnsupported NoFpRet(
2060  F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2061  F.getContext().diagnose(NoFpRet);
2062  return false;
2063  }
2064  break;
2065  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2066  return selectDSBvhStackIntrinsic(I);
2067  }
2068  return selectImpl(I, *CoverageInfo);
2069 }
2070 
2071 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2072  if (selectImpl(I, *CoverageInfo))
2073  return true;
2074 
2075  MachineBasicBlock *BB = I.getParent();
2076  const DebugLoc &DL = I.getDebugLoc();
2077 
2078  Register DstReg = I.getOperand(0).getReg();
2079  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2080  assert(Size <= 32 || Size == 64);
2081  const MachineOperand &CCOp = I.getOperand(1);
2082  Register CCReg = CCOp.getReg();
2083  if (!isVCC(CCReg, *MRI)) {
2084  unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2085  AMDGPU::S_CSELECT_B32;
2086  MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2087  .addReg(CCReg);
2088 
2089  // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2090  // bank, because it does not cover the register class that we use to represent
2091  // it. So we need to manually set the register class here.
2092  if (!MRI->getRegClassOrNull(CCReg))
2093  MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2094  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2095  .add(I.getOperand(2))
2096  .add(I.getOperand(3));
2097 
2098  bool Ret = false;
2099  Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2100  Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2101  I.eraseFromParent();
2102  return Ret;
2103  }
2104 
2105  // Wide VGPR select should have been split in RegBankSelect.
2106  if (Size > 32)
2107  return false;
2108 
2109  MachineInstr *Select =
2110  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2111  .addImm(0)
2112  .add(I.getOperand(3))
2113  .addImm(0)
2114  .add(I.getOperand(2))
2115  .add(I.getOperand(1));
2116 
2117  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2118  I.eraseFromParent();
2119  return Ret;
2120 }
2121 
2122 static int sizeToSubRegIndex(unsigned Size) {
2123  switch (Size) {
2124  case 32:
2125  return AMDGPU::sub0;
2126  case 64:
2127  return AMDGPU::sub0_sub1;
2128  case 96:
2129  return AMDGPU::sub0_sub1_sub2;
2130  case 128:
2131  return AMDGPU::sub0_sub1_sub2_sub3;
2132  case 256:
2133  return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2134  default:
2135  if (Size < 32)
2136  return AMDGPU::sub0;
2137  if (Size > 256)
2138  return -1;
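  // E.g. (illustrative) a 48-bit value rounds up to 64 and uses sub0_sub1.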
2139  return sizeToSubRegIndex(PowerOf2Ceil(Size));
2140  }
2141 }
2142 
2143 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2144  Register DstReg = I.getOperand(0).getReg();
2145  Register SrcReg = I.getOperand(1).getReg();
2146  const LLT DstTy = MRI->getType(DstReg);
2147  const LLT SrcTy = MRI->getType(SrcReg);
2148  const LLT S1 = LLT::scalar(1);
2149 
2150  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2151  const RegisterBank *DstRB;
2152  if (DstTy == S1) {
2153  // This is a special case. We don't treat s1 for legalization artifacts as
2154  // vcc booleans.
2155  DstRB = SrcRB;
2156  } else {
2157  DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2158  if (SrcRB != DstRB)
2159  return false;
2160  }
2161 
2162  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2163 
2164  unsigned DstSize = DstTy.getSizeInBits();
2165  unsigned SrcSize = SrcTy.getSizeInBits();
2166 
2167  const TargetRegisterClass *SrcRC =
2168  TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2169  const TargetRegisterClass *DstRC =
2170  TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2171  if (!SrcRC || !DstRC)
2172  return false;
2173 
2174  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2175  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2176  LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2177  return false;
2178  }
2179 
2180  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2181  MachineBasicBlock *MBB = I.getParent();
2182  const DebugLoc &DL = I.getDebugLoc();
2183 
2184  Register LoReg = MRI->createVirtualRegister(DstRC);
2185  Register HiReg = MRI->createVirtualRegister(DstRC);
2186  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2187  .addReg(SrcReg, 0, AMDGPU::sub0);
2188  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2189  .addReg(SrcReg, 0, AMDGPU::sub1);
2190 
2191  if (IsVALU && STI.hasSDWA()) {
2192  // Write the low 16-bits of the high element into the high 16-bits of the
2193  // low element.
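  // E.g. (illustrative) truncating <2 x s32> {0x00011111, 0x00022222} packs
  // to the 32-bit value 0x22221111.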
2194  MachineInstr *MovSDWA =
2195  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2196  .addImm(0) // $src0_modifiers
2197  .addReg(HiReg) // $src0
2198  .addImm(0) // $clamp
2199  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2200  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2201  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2202  .addReg(LoReg, RegState::Implicit);
2203  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2204  } else {
2205  Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2206  Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2207  Register ImmReg = MRI->createVirtualRegister(DstRC);
2208  if (IsVALU) {
2209  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2210  .addImm(16)
2211  .addReg(HiReg);
2212  } else {
2213  BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2214  .addReg(HiReg)
2215  .addImm(16);
2216  }
2217 
2218  unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2219  unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2220  unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2221 
2222  BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2223  .addImm(0xffff);
2224  BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2225  .addReg(LoReg)
2226  .addReg(ImmReg);
2227  BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2228  .addReg(TmpReg0)
2229  .addReg(TmpReg1);
2230  }
2231 
2232  I.eraseFromParent();
2233  return true;
2234  }
2235 
2236  if (!DstTy.isScalar())
2237  return false;
2238 
2239  if (SrcSize > 32) {
2240  int SubRegIdx = sizeToSubRegIndex(DstSize);
2241  if (SubRegIdx == -1)
2242  return false;
2243 
2244  // Deal with weird cases where the class only partially supports the subreg
2245  // index.
2246  const TargetRegisterClass *SrcWithSubRC
2247  = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2248  if (!SrcWithSubRC)
2249  return false;
2250 
2251  if (SrcWithSubRC != SrcRC) {
2252  if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2253  return false;
2254  }
2255 
2256  I.getOperand(1).setSubReg(SubRegIdx);
2257  }
2258 
2259  I.setDesc(TII.get(TargetOpcode::COPY));
2260  return true;
2261 }
2262 
2263 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
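/// For example (illustrative): Size == 4 gives Mask == 0xf (an inline
/// immediate, so an AND is preferred), Size == 16 gives 0xffff (not inline),
/// and Size == 32 gives 0xffffffff == -1 (inline again).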
2264 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2265  Mask = maskTrailingOnes<unsigned>(Size);
2266  int SignedMask = static_cast<int>(Mask);
2267  return SignedMask >= -16 && SignedMask <= 64;
2268 }
2269 
2270 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2271 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2272  Register Reg, const MachineRegisterInfo &MRI,
2273  const TargetRegisterInfo &TRI) const {
2274  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2275  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2276  return RB;
2277 
2278  // Ignore the type, since we don't use vcc in artifacts.
2279  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2280  return &RBI.getRegBankFromRegClass(*RC, LLT());
2281  return nullptr;
2282 }
2283 
2284 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2285  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2286  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2287  const DebugLoc &DL = I.getDebugLoc();
2288  MachineBasicBlock &MBB = *I.getParent();
2289  const Register DstReg = I.getOperand(0).getReg();
2290  const Register SrcReg = I.getOperand(1).getReg();
2291 
2292  const LLT DstTy = MRI->getType(DstReg);
2293  const LLT SrcTy = MRI->getType(SrcReg);
2294  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2295  I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2296  const unsigned DstSize = DstTy.getSizeInBits();
2297  if (!DstTy.isScalar())
2298  return false;
2299 
2300  // Artifact casts should never use vcc.
2301  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2302 
2303  // FIXME: This should probably be illegal and split earlier.
2304  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2305  if (DstSize <= 32)
2306  return selectCOPY(I);
2307 
2308  const TargetRegisterClass *SrcRC =
2309  TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2310  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2311  const TargetRegisterClass *DstRC =
2312  TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2313 
2314  Register UndefReg = MRI->createVirtualRegister(SrcRC);
2315  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2316  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2317  .addReg(SrcReg)
2318  .addImm(AMDGPU::sub0)
2319  .addReg(UndefReg)
2320  .addImm(AMDGPU::sub1);
2321  I.eraseFromParent();
2322 
2323  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2324  RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2325  }
2326 
2327  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2328  // 64-bit should have been split up in RegBankSelect
2329 
2330  // Try to use an and with a mask if it will save code size.
2331  unsigned Mask;
2332  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2333  MachineInstr *ExtI =
2334  BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2335  .addImm(Mask)
2336  .addReg(SrcReg);
2337  I.eraseFromParent();
2338  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2339  }
2340 
2341  const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2342  MachineInstr *ExtI =
2343  BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2344  .addReg(SrcReg)
2345  .addImm(0) // Offset
2346  .addImm(SrcSize); // Width
2347  I.eraseFromParent();
2348  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2349  }
2350 
2351  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2352  const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2353  AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2354  if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2355  return false;
2356 
2357  if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2358  const unsigned SextOpc = SrcSize == 8 ?
2359  AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2360  BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2361  .addReg(SrcReg);
2362  I.eraseFromParent();
2363  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2364  }
2365 
2366  // Using a single 32-bit SALU to calculate the high half is smaller than
2367  // S_BFE with a literal constant operand.
2368  if (DstSize > 32 && SrcSize == 32) {
2369  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2370  unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2371  if (Signed) {
2372  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2373  .addReg(SrcReg, 0, SubReg)
2374  .addImm(31);
2375  } else {
2376  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2377  .addImm(0);
2378  }
2379  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2380  .addReg(SrcReg, 0, SubReg)
2381  .addImm(AMDGPU::sub0)
2382  .addReg(HiReg)
2383  .addImm(AMDGPU::sub1);
2384  I.eraseFromParent();
2385  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2386  *MRI);
2387  }
2388 
2389  const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2390  const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2391 
2392  // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
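  // E.g. (illustrative) a sign extend from 8 bits uses the immediate
  // 8 << 16 = 0x80000: offset 0, width 8.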
2393  if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2394  // We need a 64-bit register source, but the high bits don't matter.
2395  Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2396  Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2397  unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2398 
2399  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2400  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2401  .addReg(SrcReg, 0, SubReg)
2402  .addImm(AMDGPU::sub0)
2403  .addReg(UndefReg)
2404  .addImm(AMDGPU::sub1);
2405 
2406  BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2407  .addReg(ExtReg)
2408  .addImm(SrcSize << 16);
2409 
2410  I.eraseFromParent();
2411  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2412  }
2413 
2414  unsigned Mask;
2415  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2416  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2417  .addReg(SrcReg)
2418  .addImm(Mask);
2419  } else {
2420  BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2421  .addReg(SrcReg)
2422  .addImm(SrcSize << 16);
2423  }
2424 
2425  I.eraseFromParent();
2426  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2427  }
2428 
2429  return false;
2430 }
2431 
2432 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2433  MachineBasicBlock *BB = I.getParent();
2434  MachineOperand &ImmOp = I.getOperand(1);
2435  Register DstReg = I.getOperand(0).getReg();
2436  unsigned Size = MRI->getType(DstReg).getSizeInBits();
2437 
2438  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2439  if (ImmOp.isFPImm()) {
2440  const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2441  ImmOp.ChangeToImmediate(Imm.getZExtValue());
2442  } else if (ImmOp.isCImm()) {
2443  ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2444  } else {
2445  llvm_unreachable("Not supported by g_constants");
2446  }
2447 
2448  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2449  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2450 
2451  unsigned Opcode;
2452  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2453  Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2454  } else {
2455  Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2456 
2457  // We should never produce s1 values on banks other than VCC. If the user of
2458  // this already constrained the register, we may incorrectly think it's VCC
2459  // if it wasn't originally.
2460  if (Size == 1)
2461  return false;
2462  }
2463 
2464  if (Size != 64) {
2465  I.setDesc(TII.get(Opcode));
2466  I.addImplicitDefUseOperands(*MF);
2467  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2468  }
2469 
2470  const DebugLoc &DL = I.getDebugLoc();
2471 
2472  APInt Imm(Size, I.getOperand(1).getImm());
2473 
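  // A 64-bit constant that is a valid inline constant (roughly -16..64 plus a
  // few FP bit patterns) can be materialized with a single S_MOV_B64 on the
  // SGPR bank; anything else is split into two 32-bit moves repacked with a
  // REG_SEQUENCE.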
2474  MachineInstr *ResInst;
2475  if (IsSgpr && TII.isInlineConstant(Imm)) {
2476  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2477  .addImm(I.getOperand(1).getImm());
2478  } else {
2479  const TargetRegisterClass *RC = IsSgpr ?
2480  &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2481  Register LoReg = MRI->createVirtualRegister(RC);
2482  Register HiReg = MRI->createVirtualRegister(RC);
2483 
2484  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2485  .addImm(Imm.trunc(32).getZExtValue());
2486 
2487  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2488  .addImm(Imm.ashr(32).getZExtValue());
2489 
2490  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2491  .addReg(LoReg)
2492  .addImm(AMDGPU::sub0)
2493  .addReg(HiReg)
2494  .addImm(AMDGPU::sub1);
2495  }
2496 
2497  // We can't call constrainSelectedInstRegOperands here, because it doesn't
2498  // work for target independent opcodes
2499  I.eraseFromParent();
2500  const TargetRegisterClass *DstRC =
2501  TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2502  if (!DstRC)
2503  return true;
2504  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2505 }
2506 
2507 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2508  // Only manually handle the f64 SGPR case.
2509  //
2510  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2511  // the bit ops theoretically have a second result due to the implicit def of
2512  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2513  // that is easy by disabling the check. The result works, but uses a
2514  // nonsensical sreg32orlds_and_sreg_1 regclass.
2515  //
2516  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2517  // the variadic REG_SEQUENCE operands.
2518 
2519  Register Dst = MI.getOperand(0).getReg();
2520  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2521  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2522  MRI->getType(Dst) != LLT::scalar(64))
2523  return false;
2524 
2525  Register Src = MI.getOperand(1).getReg();
2526  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2527  if (Fabs)
2528  Src = Fabs->getOperand(1).getReg();
2529 
2530  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2531  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2532  return false;
2533 
2534  MachineBasicBlock *BB = MI.getParent();
2535  const DebugLoc &DL = MI.getDebugLoc();
2536  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2537  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2538  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2539  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2540 
2541  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2542  .addReg(Src, 0, AMDGPU::sub0);
2543  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2544  .addReg(Src, 0, AMDGPU::sub1);
2545  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2546  .addImm(0x80000000);
2547 
2548  // Set or toggle sign bit.
2549  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2550  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2551  .addReg(HiReg)
2552  .addReg(ConstReg);
2553  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2554  .addReg(LoReg)
2555  .addImm(AMDGPU::sub0)
2556  .addReg(OpReg)
2557  .addImm(AMDGPU::sub1);
2558  MI.eraseFromParent();
2559  return true;
2560 }
2561 
2562 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2563 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2564  Register Dst = MI.getOperand(0).getReg();
2565  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2566  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2567  MRI->getType(Dst) != LLT::scalar(64))
2568  return false;
2569 
2570  Register Src = MI.getOperand(1).getReg();
2571  MachineBasicBlock *BB = MI.getParent();
2572  const DebugLoc &DL = MI.getDebugLoc();
2573  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2574  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2575  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2576  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2577 
2578  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2579  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2580  return false;
2581 
2582  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2583  .addReg(Src, 0, AMDGPU::sub0);
2584  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2585  .addReg(Src, 0, AMDGPU::sub1);
2586  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2587  .addImm(0x7fffffff);
2588 
2589  // Clear sign bit.
2590  // TODO: Should this use S_BITSET0_*?
2591  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2592  .addReg(HiReg)
2593  .addReg(ConstReg);
2594  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2595  .addReg(LoReg)
2596  .addImm(AMDGPU::sub0)
2597  .addReg(OpReg)
2598  .addImm(AMDGPU::sub1);
2599 
2600  MI.eraseFromParent();
2601  return true;
2602 }
2603 
2604 static bool isConstant(const MachineInstr &MI) {
2605  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2606 }
2607 
2608 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2609  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2610 
2611  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2612 
2613  assert(PtrMI);
2614 
2615  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2616  return;
2617 
2618  GEPInfo GEPInfo;
2619 
2620  for (unsigned i = 1; i != 3; ++i) {
2621  const MachineOperand &GEPOp = PtrMI->getOperand(i);
2622  const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2623  assert(OpDef);
2624  if (i == 2 && isConstant(*OpDef)) {
2625  // TODO: Could handle constant base + variable offset, but a combine
2626  // probably should have commuted it.
2627  assert(GEPInfo.Imm == 0);
2628  GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2629  continue;
2630  }
2631  const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2632  if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2633  GEPInfo.SgprParts.push_back(GEPOp.getReg());
2634  else
2635  GEPInfo.VgprParts.push_back(GEPOp.getReg());
2636  }
2637 
2638  AddrInfo.push_back(GEPInfo);
2639  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2640 }
2641 
2642 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2643  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2644 }
2645 
2646 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2647  if (!MI.hasOneMemOperand())
2648  return false;
2649 
2650  const MachineMemOperand *MMO = *MI.memoperands_begin();
2651  const Value *Ptr = MMO->getValue();
2652 
2653  // UndefValue means this is a load of a kernel input. These are uniform.
2654  // Sometimes LDS instructions have constant pointers.
2655  // If Ptr is null, then that means this mem operand contains a
2656  // PseudoSourceValue like GOT.
2657  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2658  isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2659  return true;
2660 
2661  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2662  return true;
2663 
2664  const Instruction *I = dyn_cast<Instruction>(Ptr);
2665  return I && I->getMetadata("amdgpu.uniform");
2666 }
2667 
2668 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2669  for (const GEPInfo &GEPInfo : AddrInfo) {
2670  if (!GEPInfo.VgprParts.empty())
2671  return true;
2672  }
2673  return false;
2674 }
2675 
2676 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2677  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2678  unsigned AS = PtrTy.getAddressSpace();
2679  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2680  STI.ldsRequiresM0Init()) {
2681  MachineBasicBlock *BB = I.getParent();
2682 
2683  // If DS instructions require M0 initialization, insert it before selecting.
2684  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2685  .addImm(-1);
2686  }
2687 }
2688 
2689 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2690  MachineInstr &I) const {
2691  initM0(I);
2692  return selectImpl(I, *CoverageInfo);
2693 }
2694 
2696  if (Reg.isPhysical())
2697  return false;
2698 
2699  MachineInstr &MI = *MRI.getVRegDef(Reg);
2700  const unsigned Opcode = MI.getOpcode();
2701 
2702  if (Opcode == AMDGPU::COPY)
2703  return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2704 
2705  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2706  Opcode == AMDGPU::G_XOR)
2707  return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2708  isVCmpResult(MI.getOperand(2).getReg(), MRI);
2709 
2710  if (Opcode == TargetOpcode::G_INTRINSIC)
2711  return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
2712 
2713  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2714 }
2715 
2716 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2717  MachineBasicBlock *BB = I.getParent();
2718  MachineOperand &CondOp = I.getOperand(0);
2719  Register CondReg = CondOp.getReg();
2720  const DebugLoc &DL = I.getDebugLoc();
2721 
2722  unsigned BrOpcode;
2723  Register CondPhysReg;
2724  const TargetRegisterClass *ConstrainRC;
2725 
2726  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2727  // whether the branch is uniform when selecting the instruction. In
2728  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2729  // RegBankSelect knows what it's doing if the branch condition is scc, even
2730  // though it currently does not.
2731  if (!isVCC(CondReg, *MRI)) {
2732  if (MRI->getType(CondReg) != LLT::scalar(32))
2733  return false;
2734 
2735  CondPhysReg = AMDGPU::SCC;
2736  BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2737  ConstrainRC = &AMDGPU::SReg_32RegClass;
2738  } else {
2739  // FIXME: Should scc->vcc copies be ANDed with exec?
2740 
2741  // Unless the value of CondReg is the result of a V_CMP* instruction, we
2742  // need to insert an AND with exec.
2743  if (!isVCmpResult(CondReg, *MRI)) {
2744  const bool Is64 = STI.isWave64();
2745  const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2746  const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2747 
2748  Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2749  BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2750  .addReg(CondReg)
2751  .addReg(Exec);
2752  CondReg = TmpReg;
2753  }
2754 
2755  CondPhysReg = TRI.getVCC();
2756  BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2757  ConstrainRC = TRI.getBoolRC();
2758  }
2759 
2760  if (!MRI->getRegClassOrNull(CondReg))
2761  MRI->setRegClass(CondReg, ConstrainRC);
2762 
2763  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2764  .addReg(CondReg);
2765  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2766  .addMBB(I.getOperand(1).getMBB());
2767 
2768  I.eraseFromParent();
2769  return true;
2770 }
2771 
2772 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2773  MachineInstr &I) const {
2774  Register DstReg = I.getOperand(0).getReg();
2775  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2776  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2777  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2778  if (IsVGPR)
2779  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2780 
2781  return RBI.constrainGenericRegister(
2782  DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2783 }
2784 
2785 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2786  Register DstReg = I.getOperand(0).getReg();
2787  Register SrcReg = I.getOperand(1).getReg();
2788  Register MaskReg = I.getOperand(2).getReg();
2789  LLT Ty = MRI->getType(DstReg);
2790  LLT MaskTy = MRI->getType(MaskReg);
2791  MachineBasicBlock *BB = I.getParent();
2792  const DebugLoc &DL = I.getDebugLoc();
2793 
2794  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2795  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2796  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2797  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2798  if (DstRB != SrcRB) // Should only happen for hand written MIR.
2799  return false;
2800 
2801  // Try to avoid emitting a bit operation when we only need to touch half of
2802  // the 64-bit pointer.
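  // E.g. (illustrative) masking with 0xffffffffffff f000 rounded together as
  // 0xfffffffffffff000 to align a pointer down to 4 KiB leaves the high 32
  // bits all ones, so CanCopyHi32 holds: the high half is just copied and only
  // the low half gets an actual AND.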
2803  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64);
2804  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2805  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2806 
2807  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2808  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2809 
2810  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2811  !CanCopyLow32 && !CanCopyHi32) {
2812  auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2813  .addReg(SrcReg)
2814  .addReg(MaskReg);
2815  I.eraseFromParent();
2816  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2817  }
2818 
2819  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2820  const TargetRegisterClass &RegRC
2821  = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2822 
2823  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2824  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2825  const TargetRegisterClass *MaskRC =
2826  TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2827 
2828  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2829  !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2830  !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2831  return false;
2832 
2833  if (Ty.getSizeInBits() == 32) {
2834  assert(MaskTy.getSizeInBits() == 32 &&
2835  "ptrmask should have been narrowed during legalize");
2836 
2837  BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2838  .addReg(SrcReg)
2839  .addReg(MaskReg);
2840  I.eraseFromParent();
2841  return true;
2842  }
2843 
2844  Register HiReg = MRI->createVirtualRegister(&RegRC);
2845  Register LoReg = MRI->createVirtualRegister(&RegRC);
2846 
2847  // Extract the subregisters from the source pointer.
2848  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2849  .addReg(SrcReg, 0, AMDGPU::sub0);
2850  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2851  .addReg(SrcReg, 0, AMDGPU::sub1);
2852 
2853  Register MaskedLo, MaskedHi;
2854 
2855  if (CanCopyLow32) {
2856  // If all the bits in the low half are 1, we only need a copy for it.
2857  MaskedLo = LoReg;
2858  } else {
2859  // Extract the mask subregister and apply the and.
2860  Register MaskLo = MRI->createVirtualRegister(&RegRC);
2861  MaskedLo = MRI->createVirtualRegister(&RegRC);
2862 
2863  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2864  .addReg(MaskReg, 0, AMDGPU::sub0);
2865  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2866  .addReg(LoReg)
2867  .addReg(MaskLo);
2868  }
2869 
2870  if (CanCopyHi32) {
2871  // If all the bits in the high half are 1, we only need a copy for it.
2872  MaskedHi = HiReg;
2873  } else {
2874  Register MaskHi = MRI->createVirtualRegister(&RegRC);
2875  MaskedHi = MRI->createVirtualRegister(&RegRC);
2876 
2877  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2878  .addReg(MaskReg, 0, AMDGPU::sub1);
2879  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2880  .addReg(HiReg)
2881  .addReg(MaskHi);
2882  }
2883 
2884  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2885  .addReg(MaskedLo)
2886  .addImm(AMDGPU::sub0)
2887  .addReg(MaskedHi)
2888  .addImm(AMDGPU::sub1);
2889  I.eraseFromParent();
2890  return true;
2891 }
2892 
2893 /// Return the register to use for the index value, and the subregister to use
2894 /// for the indirectly accessed register.
2895 static std::pair<Register, unsigned>
2896 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
2897  const TargetRegisterClass *SuperRC, Register IdxReg,
2898  unsigned EltSize, GISelKnownBits &KnownBits) {
2899  Register IdxBaseReg;
2900  int Offset;
2901 
2902  std::tie(IdxBaseReg, Offset) =
2903  AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
2904  if (IdxBaseReg == AMDGPU::NoRegister) {
2905  // This will happen if the index is a known constant. This should ordinarily
2906  // be legalized out, but handle it as a register just in case.
2907  assert(Offset == 0);
2908  IdxBaseReg = IdxReg;
2909  }
2910 
2911  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2912 
2913  // Skip out of bounds offsets, or else we would end up using an undefined
2914  // register.
2915  if (static_cast<unsigned>(Offset) >= SubRegs.size())
2916  return std::make_pair(IdxReg, SubRegs[0]);
2917  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2918 }
2919 
2920 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2921  MachineInstr &MI) const {
2922  Register DstReg = MI.getOperand(0).getReg();
2923  Register SrcReg = MI.getOperand(1).getReg();
2924  Register IdxReg = MI.getOperand(2).getReg();
2925 
2926  LLT DstTy = MRI->getType(DstReg);
2927  LLT SrcTy = MRI->getType(SrcReg);
2928 
2929  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2930  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2931  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2932 
2933  // The index must be scalar. If it wasn't, RegBankSelect should have moved
2934  // this into a waterfall loop.
2935  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2936  return false;
2937 
2938  const TargetRegisterClass *SrcRC =
2939  TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
2940  const TargetRegisterClass *DstRC =
2941  TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
2942  if (!SrcRC || !DstRC)
2943  return false;
2944  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2945  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2946  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2947  return false;
2948 
2949  MachineBasicBlock *BB = MI.getParent();
2950  const DebugLoc &DL = MI.getDebugLoc();
2951  const bool Is64 = DstTy.getSizeInBits() == 64;
2952 
2953  unsigned SubReg;
2954  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
2955  *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KnownBits);
2956 
2957  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2958  if (DstTy.getSizeInBits() != 32 && !Is64)
2959  return false;
2960 
2961  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2962  .addReg(IdxReg);
2963 
2964  unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2965  BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2966  .addReg(SrcReg, 0, SubReg)
2967  .addReg(SrcReg, RegState::Implicit);
2968  MI.eraseFromParent();
2969  return true;
2970  }
2971 
2972  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2973  return false;
2974 
2975  if (!STI.useVGPRIndexMode()) {
2976  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2977  .addReg(IdxReg);
2978  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2979  .addReg(SrcReg, 0, SubReg)
2980  .addReg(SrcReg, RegState::Implicit);
2981  MI.eraseFromParent();
2982  return true;
2983  }
2984 
2985  const MCInstrDesc &GPRIDXDesc =
2986  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2987  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2988  .addReg(SrcReg)
2989  .addReg(IdxReg)
2990  .addImm(SubReg);
2991 
2992  MI.eraseFromParent();
2993  return true;
2994 }
2995 
2996 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2997 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2998  MachineInstr &MI) const {
2999  Register DstReg = MI.getOperand(0).getReg();
3000  Register VecReg = MI.getOperand(1).getReg();
3001  Register ValReg = MI.getOperand(2).getReg();
3002  Register IdxReg = MI.getOperand(3).getReg();
3003 
3004  LLT VecTy = MRI->getType(DstReg);
3005  LLT ValTy = MRI->getType(ValReg);
3006  unsigned VecSize = VecTy.getSizeInBits();
3007  unsigned ValSize = ValTy.getSizeInBits();
3008 
3009  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3010  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3011  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3012 
3013  assert(VecTy.getElementType() == ValTy);
3014 
3015  // The index must be scalar. If it wasn't, RegBankSelect should have moved
3016  // this into a waterfall loop.
3017  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3018  return false;
3019 
3020  const TargetRegisterClass *VecRC =
3021  TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3022  const TargetRegisterClass *ValRC =
3023  TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3024 
3025  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3026  !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3027  !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3028  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3029  return false;
3030 
3031  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3032  return false;
3033 
3034  unsigned SubReg;
3035  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
3036  ValSize / 8, *KnownBits);
3037 
3038  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3039  STI.useVGPRIndexMode();
3040 
3041  MachineBasicBlock *BB = MI.getParent();
3042  const DebugLoc &DL = MI.getDebugLoc();
3043 
3044  if (!IndexMode) {
3045  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3046  .addReg(IdxReg);
3047 
3048  const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3049  VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3050  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3051  .addReg(VecReg)
3052  .addReg(ValReg)
3053  .addImm(SubReg);
3054  MI.eraseFromParent();
3055  return true;
3056  }
3057 
3058  const MCInstrDesc &GPRIDXDesc =
3059  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3060  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3061  .addReg(VecReg)
3062  .addReg(ValReg)
3063  .addReg(IdxReg)
3064  .addImm(SubReg);
3065 
3066  MI.eraseFromParent();
3067  return true;
3068 }
3069 
3070 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3071  unsigned Opc;
3072  unsigned Size = MI.getOperand(3).getImm();
3073 
3074  // The struct intrinsic variants add one additional operand over raw.
3075  const bool HasVIndex = MI.getNumOperands() == 9;
3076  Register VIndex;
3077  int OpOffset = 0;
3078  if (HasVIndex) {
3079  VIndex = MI.getOperand(4).getReg();
3080  OpOffset = 1;
3081  }
3082 
3083  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3084  Optional<ValueAndVReg> MaybeVOffset =
3085  getIConstantVRegValWithLookThrough(VOffset, *MRI);
3086  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
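  // If the voffset operand is a known constant zero it can be dropped,
  // letting us pick the OFFSET/IDXEN forms instead of OFFEN/BOTHEN below.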
3087 
3088  switch (Size) {
3089  default:
3090  return false;
3091  case 1:
3092  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3093  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3094  : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3095  : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3096  break;
3097  case 2:
3098  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3099  : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3100  : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3101  : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3102  break;
3103  case 4:
3104  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3105  : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3106  : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3107  : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3108  break;
3109  }
3110 
3111  MachineBasicBlock *MBB = MI.getParent();
3112  const DebugLoc &DL = MI.getDebugLoc();
3113  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3114  .add(MI.getOperand(2));
3115 
3116  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3117 
3118  if (HasVIndex && HasVOffset) {
3119  Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3120  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3121  .addReg(VIndex)
3122  .addImm(AMDGPU::sub0)
3123  .addReg(VOffset)
3124  .addImm(AMDGPU::sub1);
3125 
3126  MIB.addReg(IdxReg);
3127  } else if (HasVIndex) {
3128  MIB.addReg(VIndex);
3129  } else if (HasVOffset) {
3130  MIB.addReg(VOffset);
3131  }
3132 
3133  MIB.add(MI.getOperand(1)); // rsrc
3134  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3135  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3136  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
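  // The auxiliary/cachepolicy immediate packs two fields: the low bits carry
  // the cache-policy (cpol) mask and bit 3 carries the buffer swizzle (swz)
  // flag, so it is split into two separate instruction operands below.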
3137  MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3138  MIB.addImm((Aux >> 3) & 1); // swz
3139 
3140  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3141  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3142  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3143  MachinePointerInfo StorePtrI = LoadPtrI;
3144  StorePtrI.V = nullptr;
3145  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3146 
3147  auto F = LoadMMO->getFlags() &
3148  ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3149  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3150  Size, LoadMMO->getBaseAlign());
3151 
3152  MachineMemOperand *StoreMMO =
3153  MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3154  sizeof(int32_t), LoadMMO->getBaseAlign());
3155 
3156  MIB.setMemRefs({LoadMMO, StoreMMO});
3157 
3158  MI.eraseFromParent();
3159  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3160 }
3161 
3162 /// Match a zero extend from a 32-bit value to 64-bits.
3163 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3164  Register ZExtSrc;
3165  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3166  return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3167 
3168  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3169  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3170  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3171  return Register();
3172 
3173  assert(Def->getNumOperands() == 3 &&
3174  MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3175  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3176  return Def->getOperand(1).getReg();
3177  }
3178 
3179  return Register();
3180 }
3181 
3182 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
3183  unsigned Opc;
3184  unsigned Size = MI.getOperand(3).getImm();
3185 
3186  switch (Size) {
3187  default:
3188  return false;
3189  case 1:
3190  Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3191  break;
3192  case 2:
3193  Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3194  break;
3195  case 4:
3196  Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3197  break;
3198  }
3199 
3200  MachineBasicBlock *MBB = MI.getParent();
3201  const DebugLoc &DL = MI.getDebugLoc();
3202  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3203  .add(MI.getOperand(2));
3204 
3205  Register Addr = MI.getOperand(1).getReg();
3206  Register VOffset;
3207  // Try to split SAddr and VOffset. Global and LDS pointers share the same
3208  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3209  if (!isSGPR(Addr)) {
3210  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3211  if (isSGPR(AddrDef->Reg)) {
3212  Addr = AddrDef->Reg;
3213  } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3214  Register SAddr =
3215  getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3216  if (isSGPR(SAddr)) {
3217  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3218  if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3219  Addr = SAddr;
3220  VOffset = Off;
3221  }
3222  }
3223  }
3224  }
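  // For illustration only (hypothetical MIR, register names invented), the
  // split above recognizes an address of the form
  //   %ext:_(s64)  = G_ZEXT %voff:_(s32)
  //   %addr:_(p1)  = G_PTR_ADD %sgpr_base:_(p1), %ext:_(s64)
  // and selects %sgpr_base as SADDR with %voff as the 32-bit VOFFSET.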
3225 
3226  if (isSGPR(Addr)) {
3227  Opc = AMDGPU::getGlobalSaddrOp(Opc);
3228  if (!VOffset) {
3229  VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3230  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3231  .addImm(0);
3232  }
3233  }
3234 
3235  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3236  .addReg(Addr);
3237 
3238  if (isSGPR(Addr))
3239  MIB.addReg(VOffset);
3240 
3241  MIB.add(MI.getOperand(4)) // offset
3242  .add(MI.getOperand(5)); // cpol
3243 
3244  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3245  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3246  LoadPtrI.Offset = MI.getOperand(4).getImm();
3247  MachinePointerInfo StorePtrI = LoadPtrI;
3248  StorePtrI.V = nullptr;
3249  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3250  auto F = LoadMMO->getFlags() &
3251  ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3252  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3253  Size, LoadMMO->getBaseAlign());
3254  MachineMemOperand *StoreMMO =
3255  MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3256  sizeof(int32_t), Align(4));
3257 
3258  MIB.setMemRefs({LoadMMO, StoreMMO});
3259 
3260  MI.eraseFromParent();
3261  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3262 }
3263 
3264 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3265  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3266  MI.removeOperand(1);
3267  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3268  return true;
3269 }
3270 
3271 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3272  unsigned Opc;
3273  switch (MI.getIntrinsicID()) {
3274  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3275  Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3276  break;
3277  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3278  Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3279  break;
3280  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3281  Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3282  break;
3283  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3284  Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3285  break;
3286  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3287  Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3288  break;
3289  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3290  Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3291  break;
3292  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3293  Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3294  break;
3295  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3296  Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3297  break;
3298  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3299  Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3300  break;
3301  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3302  Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3303  break;
3304  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3305  Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3306  break;
3307  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3308  Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3309  break;
3310  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3311  Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3312  break;
3313  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3314  Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3315  break;
3316  default:
3317  llvm_unreachable("unhandled smfmac intrinsic");
3318  }
3319 
3320  auto VDst_In = MI.getOperand(4);
3321 
3322  MI.setDesc(TII.get(Opc));
3323  MI.removeOperand(4); // VDst_In
3324  MI.removeOperand(1); // Intrinsic ID
3325  MI.addOperand(VDst_In); // Readd VDst_In to the end
3326  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3327  return true;
3328 }
3329 
3330 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3331  Register DstReg = MI.getOperand(0).getReg();
3332  Register SrcReg = MI.getOperand(1).getReg();
3333  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3334  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3335  MachineBasicBlock *MBB = MI.getParent();
3336  const DebugLoc &DL = MI.getDebugLoc();
3337 
3338  if (IsVALU) {
3339  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3340  .addImm(Subtarget->getWavefrontSizeLog2())
3341  .addReg(SrcReg);
3342  } else {
3343  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3344  .addReg(SrcReg)
3345  .addImm(Subtarget->getWavefrontSizeLog2());
3346  }
3347 
3348  const TargetRegisterClass &RC =
3349  IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3350  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3351  return false;
3352 
3353  MI.eraseFromParent();
3354  return true;
3355 }
3356 
3357 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3358  if (I.isPHI())
3359  return selectPHI(I);
3360 
3361  if (!I.isPreISelOpcode()) {
3362  if (I.isCopy())
3363  return selectCOPY(I);
3364  return true;
3365  }
3366 
3367  switch (I.getOpcode()) {
3368  case TargetOpcode::G_AND:
3369  case TargetOpcode::G_OR:
3370  case TargetOpcode::G_XOR:
3371  if (selectImpl(I, *CoverageInfo))
3372  return true;
3373  return selectG_AND_OR_XOR(I);
3374  case TargetOpcode::G_ADD:
3375  case TargetOpcode::G_SUB:
3376  if (selectImpl(I, *CoverageInfo))
3377  return true;
3378  return selectG_ADD_SUB(I);
3379  case TargetOpcode::G_UADDO:
3380  case TargetOpcode::G_USUBO:
3381  case TargetOpcode::G_UADDE:
3382  case TargetOpcode::G_USUBE:
3383  return selectG_UADDO_USUBO_UADDE_USUBE(I);
3384  case AMDGPU::G_AMDGPU_MAD_U64_U32:
3385  case AMDGPU::G_AMDGPU_MAD_I64_I32:
3386  return selectG_AMDGPU_MAD_64_32(I);
3387  case TargetOpcode::G_INTTOPTR:
3388  case TargetOpcode::G_BITCAST:
3389  case TargetOpcode::G_PTRTOINT:
3390  return selectCOPY(I);
3391  case TargetOpcode::G_CONSTANT:
3392  case TargetOpcode::G_FCONSTANT:
3393  return selectG_CONSTANT(I);
3394  case TargetOpcode::G_FNEG:
3395  if (selectImpl(I, *CoverageInfo))
3396  return true;
3397  return selectG_FNEG(I);
3398  case TargetOpcode::G_FABS:
3399  if (selectImpl(I, *CoverageInfo))
3400  return true;
3401  return selectG_FABS(I);
3402  case TargetOpcode::G_EXTRACT:
3403  return selectG_EXTRACT(I);
3404  case TargetOpcode::G_FMA:
3405  case TargetOpcode::G_FMAD:
3406  if (selectG_FMA_FMAD(I))
3407  return true;
3408  return selectImpl(I, *CoverageInfo);
3409  case TargetOpcode::G_MERGE_VALUES:
3410  case TargetOpcode::G_CONCAT_VECTORS:
3411  return selectG_MERGE_VALUES(I);
3412  case TargetOpcode::G_UNMERGE_VALUES:
3413  return selectG_UNMERGE_VALUES(I);
3414  case TargetOpcode::G_BUILD_VECTOR:
3415  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3416  return selectG_BUILD_VECTOR(I);
3417  case TargetOpcode::G_PTR_ADD:
3418  if (selectImpl(I, *CoverageInfo))
3419  return true;
3420  return selectG_PTR_ADD(I);
3421  case TargetOpcode::G_IMPLICIT_DEF:
3422  return selectG_IMPLICIT_DEF(I);
3423  case TargetOpcode::G_FREEZE:
3424  return selectCOPY(I);
3425  case TargetOpcode::G_INSERT:
3426  return selectG_INSERT(I);
3427  case TargetOpcode::G_INTRINSIC:
3428  return selectG_INTRINSIC(I);
3429  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3430  return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3431  case TargetOpcode::G_ICMP:
3432  if (selectG_ICMP(I))
3433  return true;
3434  return selectImpl(I, *CoverageInfo);
3435  case TargetOpcode::G_LOAD:
3436  case TargetOpcode::G_STORE:
3437  case TargetOpcode::G_ATOMIC_CMPXCHG:
3438  case TargetOpcode::G_ATOMICRMW_XCHG:
3439  case TargetOpcode::G_ATOMICRMW_ADD:
3440  case TargetOpcode::G_ATOMICRMW_SUB:
3441  case TargetOpcode::G_ATOMICRMW_AND:
3442  case TargetOpcode::G_ATOMICRMW_OR:
3443  case TargetOpcode::G_ATOMICRMW_XOR:
3444  case TargetOpcode::G_ATOMICRMW_MIN:
3445  case TargetOpcode::G_ATOMICRMW_MAX:
3446  case TargetOpcode::G_ATOMICRMW_UMIN:
3447  case TargetOpcode::G_ATOMICRMW_UMAX:
3448  case TargetOpcode::G_ATOMICRMW_FADD:
3449  case AMDGPU::G_AMDGPU_ATOMIC_INC:
3450  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3451  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3452  case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3453  return selectG_LOAD_STORE_ATOMICRMW(I);
3454  case TargetOpcode::G_SELECT:
3455  return selectG_SELECT(I);
3456  case TargetOpcode::G_TRUNC:
3457  return selectG_TRUNC(I);
3458  case TargetOpcode::G_SEXT:
3459  case TargetOpcode::G_ZEXT:
3460  case TargetOpcode::G_ANYEXT:
3461  case TargetOpcode::G_SEXT_INREG:
3462  if (selectImpl(I, *CoverageInfo))
3463  return true;
3464  return selectG_SZA_EXT(I);
3465  case TargetOpcode::G_BRCOND:
3466  return selectG_BRCOND(I);
3467  case TargetOpcode::G_GLOBAL_VALUE:
3468  return selectG_GLOBAL_VALUE(I);
3469  case TargetOpcode::G_PTRMASK:
3470  return selectG_PTRMASK(I);
3471  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3472  return selectG_EXTRACT_VECTOR_ELT(I);
3473  case TargetOpcode::G_INSERT_VECTOR_ELT:
3474  return selectG_INSERT_VECTOR_ELT(I);
3475  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3476  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3477  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3478  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3479  const AMDGPU::ImageDimIntrinsicInfo *Intr
3480  = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3481  assert(Intr && "not an image intrinsic with image pseudo");
3482  return selectImageIntrinsic(I, Intr);
3483  }
3484  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3485  return selectBVHIntrinsic(I);
3486  case AMDGPU::G_SBFX:
3487  case AMDGPU::G_UBFX:
3488  return selectG_SBFX_UBFX(I);
3489  case AMDGPU::G_SI_CALL:
3490  I.setDesc(TII.get(AMDGPU::SI_CALL));
3491  return true;
3492  case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3493  return selectWaveAddress(I);
3494  default:
3495  return selectImpl(I, *CoverageInfo);
3496  }
3497  return false;
3498 }
3499 
3501 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3502  return {{
3503  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3504  }};
3505 
3506 }
3507 
3508 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
3509  MachineOperand &Root, bool AllowAbs, bool OpSel) const {
3510  Register Src = Root.getReg();
3511  unsigned Mods = 0;
3512  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3513 
3514  if (MI->getOpcode() == AMDGPU::G_FNEG) {
3515  Src = MI->getOperand(1).getReg();
3516  Mods |= SISrcMods::NEG;
3517  MI = getDefIgnoringCopies(Src, *MRI);
3518  }
3519 
3520  if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3521  Src = MI->getOperand(1).getReg();
3522  Mods |= SISrcMods::ABS;
3523  }
3524 
3525  if (OpSel)
3526  Mods |= SISrcMods::OP_SEL_0;
3527 
3528  return std::make_pair(Src, Mods);
3529 }
3530 
3531 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3532  Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3533  bool ForceVGPR) const {
3534  if ((Mods != 0 || ForceVGPR) &&
3535  RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3536 
3537  // If we looked through copies to find source modifiers on an SGPR operand,
3538  // we now have an SGPR register source. To avoid potentially violating the
3539  // constant bus restriction, we need to insert a copy to a VGPR.
3540  Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3541  BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3542  TII.get(AMDGPU::COPY), VGPRSrc)
3543  .addReg(Src);
3544  Src = VGPRSrc;
3545  }
3546 
3547  return Src;
3548 }
3549 
3550 ///
3551 /// This will select either an SGPR or VGPR operand and will save us from
3552 /// having to write an extra tablegen pattern.
3554 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3555  return {{
3556  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3557  }};
3558 }
3559 
3561 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3562  Register Src;
3563  unsigned Mods;
3564  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3565 
3566  return {{
3567  [=](MachineInstrBuilder &MIB) {
3568  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3569  },
3570  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3571  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3572  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3573  }};
3574 }
3575 
3577 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3578  Register Src;
3579  unsigned Mods;
3580  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3581 
3582  return {{
3583  [=](MachineInstrBuilder &MIB) {
3584  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3585  },
3586  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3587  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3588  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3589  }};
3590 }
3591 
3593 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3594  return {{
3595  [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3596  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3597  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3598  }};
3599 }
3600 
3602 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3603  Register Src;
3604  unsigned Mods;
3605  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3606 
3607  return {{
3608  [=](MachineInstrBuilder &MIB) {
3609  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3610  },
3611  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3612  }};
3613 }
3614 
3616 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3617  Register Src;
3618  unsigned Mods;
3619  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3620 
3621  return {{
3622  [=](MachineInstrBuilder &MIB) {
3623  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3624  },
3625  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3626  }};
3627 }
3628 
3630 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3631  Register Reg = Root.getReg();
3632  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3633  if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3634  return {};
3635  return {{
3636  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3637  }};
3638 }
3639 
3640 std::pair<Register, unsigned>
3641 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3642  Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3643  unsigned Mods = 0;
3644  MachineInstr *MI = MRI.getVRegDef(Src);
3645 
3646  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3647  // It's possible to see an f32 fneg here, but unlikely.
3648  // TODO: Treat f32 fneg as only high bit.
3649  MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3650  Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3651  Src = MI->getOperand(1).getReg();
3652  MI = MRI.getVRegDef(Src);
3653  }
3654 
3655  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3656  (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3657 
3658  // Packed instructions do not have abs modifiers.
3659  Mods |= SISrcMods::OP_SEL_1;
3660 
3661  return std::make_pair(Src, Mods);
3662 }
3663 
3665 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3666  MachineRegisterInfo &MRI
3667  = Root.getParent()->getParent()->getParent()->getRegInfo();
3668 
3669  Register Src;
3670  unsigned Mods;
3671  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3672 
3673  return {{
3674  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3675  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3676  }};
3677 }
3678 
3680 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3681  MachineRegisterInfo &MRI
3682  = Root.getParent()->getParent()->getParent()->getRegInfo();
3683 
3684  Register Src;
3685  unsigned Mods;
3686  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3687 
3688  return {{
3689  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3690  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3691  }};
3692 }
3693 
3695 AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
3696  // A literal i1 value set in the intrinsic represents SrcMods for the next
3697  // operand. The value is in the Imm operand as an i1 sign-extended to int64_t:
3698  // 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
3699  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3700  "expected i1 value");
3701  unsigned Mods = SISrcMods::OP_SEL_1;
3702  if (Root.getImm() == -1)
3703  Mods ^= SISrcMods::NEG;
3704  return {{
3705  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3706  }};
3707 }
3708 
3710 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3711  MachineOperand &Root) const {
3712  assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3713  "expected i1 value");
3714  unsigned Mods = SISrcMods::OP_SEL_1;
3715  if (Root.getImm() != 0)
3716  Mods |= SISrcMods::OP_SEL_0;
3717 
3718  return {{
3719  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3720  }};
3721 }
3722 
3724 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3725  Register Src;
3726  unsigned Mods;
3727  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3728  if (!isKnownNeverNaN(Src, *MRI))
3729  return None;
3730 
3731  return {{
3732  [=](MachineInstrBuilder &MIB) {
3733  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3734  },
3735  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3736  }};
3737 }
3738 
3740 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3741  Register Src;
3742  unsigned Mods;
3743  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3744 
3745  // FIXME: Handle op_sel
3746  return {{
3747  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3748  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3749  }};
3750 }
3751 
3753 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
3754  Register Src;
3755  unsigned Mods;
3756  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3757  /* AllowAbs */ false,
3758  /* OpSel */ false);
3759 
3760  return {{
3761  [=](MachineInstrBuilder &MIB) {
3762  MIB.addReg(
3763  copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
3764  },
3765  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3766  }};
3767 }
3768 
3770 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
3771  Register Src;
3772  unsigned Mods;
3773  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3774  /* AllowAbs */ false,
3775  /* OpSel */ true);
3776 
3777  return {{
3778  [=](MachineInstrBuilder &MIB) {
3779  MIB.addReg(
3780  copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
3781  },
3782  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3783  }};
3784 }
3785 
3786 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
3787  Register &Base,
3788  Register *SOffset,
3789  int64_t *Offset) const {
3790  MachineInstr *MI = Root.getParent();
3791  MachineBasicBlock *MBB = MI->getParent();
3792 
3793  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3794  // then we can select all ptr + 32-bit offsets.
3795  SmallVector<GEPInfo, 4> AddrInfo;
3796  getAddrModeInfo(*MI, *MRI, AddrInfo);
3797 
3798  if (AddrInfo.empty())
3799  return false;
3800 
3801  const GEPInfo &GEPI = AddrInfo[0];
3802  Optional<int64_t> EncodedImm =
3803  AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
3804 
3805  if (SOffset && Offset) {
3806  if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
3807  AddrInfo.size() > 1) {
3808  const GEPInfo &GEPI2 = AddrInfo[1];
3809  if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
3810  if (Register OffsetReg =
3811  matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
3812  Base = GEPI2.SgprParts[0];
3813  *SOffset = OffsetReg;
3814  *Offset = *EncodedImm;
3815  return true;
3816  }
3817  }
3818  }
3819  return false;
3820  }
3821 
3822  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
3823  Base = GEPI.SgprParts[0];
3824  *Offset = *EncodedImm;
3825  return true;
3826  }
3827 
3828  // SGPR offset is unsigned.
3829  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
3830  GEPI.Imm != 0) {
3831  // If we make it this far we have a load with a 32-bit immediate offset.
3832  // It is OK to select this using a sgpr offset, because we have already
3833  // failed trying to select this load into one of the _IMM variants since
3834  // the _IMM Patterns are considered before the _SGPR patterns.
3835  Base = GEPI.SgprParts[0];
3836  *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3837  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
3838  .addImm(GEPI.Imm);
3839  return true;
3840  }
3841 
3842  if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
3843  if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
3844  Base = GEPI.SgprParts[0];
3845  *SOffset = OffsetReg;
3846  return true;
3847  }
3848  }
3849 
3850  return false;
3851 }
3852 
3854 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3855  Register Base;
3856  int64_t Offset;
3857  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
3858  return None;
3859 
3860  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
3861  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
3862 }
3863 
3865 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3866  SmallVector<GEPInfo, 4> AddrInfo;
3867  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3868 
3869  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3870  return None;
3871 
3872  const GEPInfo &GEPInfo = AddrInfo[0];
3873  Register PtrReg = GEPInfo.SgprParts[0];
3874  Optional<int64_t> EncodedImm =
3875  AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3876  if (!EncodedImm)
3877  return None;
3878 
3879  return {{
3880  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3881  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3882  }};
3883 }
3884 
3886 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3887  Register Base, SOffset;
3888  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
3889  return None;
3890 
3891  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
3892  [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
3893 }
3894 
3896 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
3897  Register Base, SOffset;
3898  int64_t Offset;
3899  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
3900  return None;
3901 
3902  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
3903  [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
3904  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
3905 }
3906 
3907 std::pair<Register, int>
3908 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3909  uint64_t FlatVariant) const {
3910  MachineInstr *MI = Root.getParent();
3911 
3912  auto Default = std::make_pair(Root.getReg(), 0);
3913 
3914  if (!STI.hasFlatInstOffsets())
3915  return Default;
3916 
3917  Register PtrBase;
3918  int64_t ConstOffset;
3919  std::tie(PtrBase, ConstOffset) =
3920  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3921  if (ConstOffset == 0)
3922  return Default;
3923 
3924  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3925  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3926  return Default;
3927 
3928  return std::make_pair(PtrBase, ConstOffset);
3929 }
3930 
3932 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3933  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3934 
3935  return {{
3936  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3937  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3938  }};
3939 }
3940 
3942 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3943  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3944 
3945  return {{
3946  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3947  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3948  }};
3949 }
3950 
3952 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3953  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3954 
3955  return {{
3956  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3957  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3958  }};
3959 }
3960 
3961 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3963 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3964  Register Addr = Root.getReg();
3965  Register PtrBase;
3966  int64_t ConstOffset;
3967  int64_t ImmOffset = 0;
3968 
3969  // Match the immediate offset first, which canonically is moved as low as
3970  // possible.
3971  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3972 
3973  if (ConstOffset != 0) {
3974  if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3975  SIInstrFlags::FlatGlobal)) {
3976  Addr = PtrBase;
3977  ImmOffset = ConstOffset;
3978  } else {
3979  auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3980  if (isSGPR(PtrBaseDef->Reg)) {
3981  if (ConstOffset > 0) {
3982  // Offset is too large.
3983  //
3984  // saddr + large_offset -> saddr +
3985  // (voffset = large_offset & ~MaxOffset) +
3986  // (large_offset & MaxOffset);
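  //
  // As a made-up example, if the widest legal immediate were 4095 and
  // large_offset were 0x12345, the split would be
  //   voffset = 0x12345 & ~4095 = 0x12000 (materialized into a VGPR)
  //   imm     = 0x12345 &  4095 = 0x345
  // TII.splitFlatOffset below computes the actual split for the target.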
3987  int64_t SplitImmOffset, RemainderOffset;
3988  std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3989  ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3990 
3991  if (isUInt<32>(RemainderOffset)) {
3992  MachineInstr *MI = Root.getParent();
3993  MachineBasicBlock *MBB = MI->getParent();
3994  Register HighBits =
3995  MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3996 
3997  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3998  HighBits)
3999  .addImm(RemainderOffset);
4000 
4001  return {{
4002  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4003  [=](MachineInstrBuilder &MIB) {
4004  MIB.addReg(HighBits);
4005  }, // voffset
4006  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4007  }};
4008  }
4009  }
4010 
4011  // We are adding a 64-bit SGPR and a constant. If the constant bus limit
4012  // is 1, we would need 1 or 2 extra moves for each half of the constant,
4013  // so it is better to do a scalar add and then issue a single VALU
4014  // instruction to materialize zero. Otherwise it takes fewer instructions
4015  // to perform VALU adds with immediates or inline literals.
4016  unsigned NumLiterals =
4017  !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4018  !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4019  if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4020  return None;
4021  }
4022  }
4023  }
4024 
4025  // Match the variable offset.
4026  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4027  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4028  // Look through the SGPR->VGPR copy.
4029  Register SAddr =
4030  getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4031 
4032  if (isSGPR(SAddr)) {
4033  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4034 
4035  // It's possible voffset is an SGPR here, but the copy to VGPR will be
4036  // inserted later.
4037  if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4038  return {{[=](MachineInstrBuilder &MIB) { // saddr
4039  MIB.addReg(SAddr);
4040  },
4041  [=](MachineInstrBuilder &MIB) { // voffset
4042  MIB.addReg(VOffset);
4043  },
4044  [=](MachineInstrBuilder &MIB) { // offset
4045  MIB.addImm(ImmOffset);
4046  }}};
4047  }
4048  }
4049  }
4050 
4051  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4052  // drop this.
4053  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4054  AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4055  return None;
4056 
4057  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4058  // moves required to copy a 64-bit SGPR to VGPR.
4059  MachineInstr *MI = Root.getParent();
4060  MachineBasicBlock *MBB = MI->getParent();
4061  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4062 
4063  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4064  .addImm(0);
4065 
4066  return {{
4067  [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4068  [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4069  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4070  }};
4071 }
4072 
4074 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4075  Register Addr = Root.getReg();
4076  Register PtrBase;
4077  int64_t ConstOffset;
4078  int64_t ImmOffset = 0;
4079 
4080  // Match the immediate offset first, which canonically is moved as low as
4081  // possible.
4082  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4083 
4084  if (ConstOffset != 0 &&
4085  TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4086  SIInstrFlags::FlatScratch)) {
4087  Addr = PtrBase;
4088  ImmOffset = ConstOffset;
4089  }
4090 
4091  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4092  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4093  int FI = AddrDef->MI->getOperand(1).getIndex();
4094  return {{
4095  [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4096  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4097  }};
4098  }
4099 
4100  Register SAddr = AddrDef->Reg;
4101 
4102  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4103  Register LHS = AddrDef->MI->getOperand(1).getReg();
4104  Register RHS = AddrDef->MI->getOperand(2).getReg();
4105  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4106  auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4107 
4108  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4109  isSGPR(RHSDef->Reg)) {
4110  int FI = LHSDef->MI->getOperand(1).getIndex();
4111  MachineInstr &I = *Root.getParent();
4112  MachineBasicBlock *BB = I.getParent();
4113  const DebugLoc &DL = I.getDebugLoc();
4114  SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4115 
4116  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4117  .addFrameIndex(FI)
4118  .addReg(RHSDef->Reg);
4119  }
4120  }
4121 
4122  if (!isSGPR(SAddr))
4123  return None;
4124 
4125  return {{
4126  [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4127  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4128  }};
4129 }
4130 
4131 // Check whether the flat scratch SVS swizzle bug affects this access.
4132 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4133  Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4134  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4135  return false;
4136 
4137  // The bug affects the swizzling of SVS accesses if there is any carry out
4138  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4139  // voffset to (soffset + inst_offset).
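  // Illustrative worst case: if voffset could end in 0b11 and soffset +
  // inst_offset could also end in 0b11, the low two bits can sum to 6 >= 4,
  // so a carry into bit 2 is possible and the check below conservatively
  // reports the bug as applicable.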
4140  auto VKnown = KnownBits->getKnownBits(VAddr);
4141  auto SKnown = KnownBits::computeForAddSub(
4142  true, false, KnownBits->getKnownBits(SAddr),
4143  KnownBits::makeConstant(APInt(32, ImmOffset)));
4144  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4145  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
4146  return (VMax & 3) + (SMax & 3) >= 4;
4147 }
4148 
4150 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4151  Register Addr = Root.getReg();
4152  Register PtrBase;
4153  int64_t ConstOffset;
4154  int64_t ImmOffset = 0;
4155 
4156  // Match the immediate offset first, which canonically is moved as low as
4157  // possible.
4158  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4159 
4160  if (ConstOffset != 0 &&
4161  TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4162  Addr = PtrBase;
4163  ImmOffset = ConstOffset;
4164  }
4165 
4166  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4167  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4168  return None;
4169 
4170  Register RHS = AddrDef->MI->getOperand(2).getReg();
4171  if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4172  return None;
4173 
4174  Register LHS = AddrDef->MI->getOperand(1).getReg();
4175  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4176 
4177  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4178  return None;
4179 
4180  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4181  int FI = LHSDef->MI->getOperand(1).getIndex();
4182  return {{
4183  [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4184  [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4185  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4186  }};
4187  }
4188 
4189  if (!isSGPR(LHS))
4190  return None;
4191 
4192  return {{
4193  [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4194  [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4195  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4196  }};
4197 }
4198 
4200 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4201  MachineInstr *MI = Root.getParent();
4202  MachineBasicBlock *MBB = MI->getParent();
4203  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4205 
4206  int64_t Offset = 0;
4207  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4208  Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4209  Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4210 
4211  // TODO: Should this be inside the render function? The iterator seems to
4212  // move.
4213  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4214  HighBits)
4215  .addImm(Offset & ~4095);
4216 
4217  return {{[=](MachineInstrBuilder &MIB) { // rsrc
4218  MIB.addReg(Info->getScratchRSrcReg());
4219  },
4220  [=](MachineInstrBuilder &MIB) { // vaddr
4221  MIB.addReg(HighBits);
4222  },
4223  [=](MachineInstrBuilder &MIB) { // soffset
4224  // Use constant zero for soffset and rely on eliminateFrameIndex
4225  // to choose the appropriate frame register if need be.
4226  MIB.addImm(0);
4227  },
4228  [=](MachineInstrBuilder &MIB) { // offset
4229  MIB.addImm(Offset & 4095);
4230  }}};
4231  }
4232 
4233  assert(Offset == 0 || Offset == -1);
4234 
4235  // Try to fold a frame index directly into the MUBUF vaddr field, and any
4236  // offsets.
4237  Optional<int> FI;
4238  Register VAddr = Root.getReg();
4239  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4240  Register PtrBase;
4241  int64_t ConstOffset;
4242  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4243  if (ConstOffset != 0) {
4244  if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
4245  (!STI.privateMemoryResourceIsRangeChecked() ||
4246  KnownBits->signBitIsZero(PtrBase))) {
4247  const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4248  if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4249  FI = PtrBaseDef->getOperand(1).getIndex();
4250  else
4251  VAddr = PtrBase;
4252  Offset = ConstOffset;
4253  }
4254  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4255  FI = RootDef->getOperand(1).getIndex();
4256  }
4257  }
4258 
4259  return {{[=](MachineInstrBuilder &MIB) { // rsrc
4260  MIB.addReg(Info->getScratchRSrcReg());
4261  },
4262  [=](MachineInstrBuilder &MIB) { // vaddr
4263  if (FI)
4264  MIB.addFrameIndex(FI.value());
4265  else
4266  MIB.addReg(VAddr);
4267  },
4268  [=](MachineInstrBuilder &MIB) { // soffset
4269  // Use constant zero for soffset and rely on eliminateFrameIndex
4270  // to choose the appropriate frame register if need be.
4271  MIB.addImm(0);
4272  },
4273  [=](MachineInstrBuilder &MIB) { // offset
4274  MIB.addImm(Offset);
4275  }}};
4276 }
4277 
4278 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4279  int64_t Offset) const {
4280  if (!isUInt<16>(Offset))
4281  return false;
4282 
4284  return true;
4285 
4286  // On Southern Islands, instructions with a negative base value and an
4287  // offset don't seem to work.
4288  return KnownBits->signBitIsZero(Base);
4289 }
4290 
4291 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4292  int64_t Offset1,
4293  unsigned Size) const {
4294  if (Offset0 % Size != 0 || Offset1 % Size != 0)
4295  return false;
4296  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4297  return false;
4298 
4299  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4300  return true;
4301 
4302  // On Southern Islands, instructions with a negative base value and an
4303  // offset don't seem to work.
4304  return KnownBits->signBitIsZero(Base);
4305 }
4306 
4307 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4308  unsigned ShAmtBits) const {
4309  assert(MI.getOpcode() == TargetOpcode::G_AND);
4310 
4311  Optional<APInt> RHS = getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4312  if (!RHS)
4313  return false;
4314 
4315  if (RHS->countTrailingOnes() >= ShAmtBits)
4316  return true;
4317 
4318  const APInt &LHSKnownZeros =
4319  KnownBits->getKnownZeroes(MI.getOperand(1).getReg());
4320  return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
4321 }
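// Illustrative reading (not taken from the source): for a 32-bit shift the
// callers presumably pass ShAmtBits == 5, so an AND with 31 (0b11111) has at
// least five trailing ones and the mask is provably redundant; a narrower mask
// is only dropped when known-zero bits of the other operand make up the
// difference.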
4322 
4323 // Return the wave level SGPR base address if this is a wave address.
4324 static Register getWaveAddress(const MachineInstr *Def) {
4325  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
4326  ? Def->getOperand(1).getReg()
4327  : Register();
4328 }
4329 
4331 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4332  MachineOperand &Root) const {
4333  Register Reg = Root.getReg();
4334  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4335 
4336  const MachineInstr *Def = MRI->getVRegDef(Reg);
4337  if (Register WaveBase = getWaveAddress(Def)) {
4338  return {{
4339  [=](MachineInstrBuilder &MIB) { // rsrc
4340  MIB.addReg(Info->getScratchRSrcReg());
4341  },
4342  [=](MachineInstrBuilder &MIB) { // soffset
4343  MIB.addReg(WaveBase);
4344  },
4345  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4346  }};
4347  }
4348 
4349  int64_t Offset = 0;
4350 
4351  // FIXME: Copy check is a hack
4352  Register BasePtr;
4353  if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
4354  if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
4355  return {};
4356  const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
4357  Register WaveBase = getWaveAddress(BasePtrDef);
4358  if (!WaveBase)
4359  return {};
4360 
4361  return {{
4362  [=](MachineInstrBuilder &MIB) { // rsrc
4363  MIB.addReg(Info->getScratchRSrcReg());
4364  },
4365  [=](MachineInstrBuilder &MIB) { // soffset
4366  MIB.addReg(WaveBase);
4367  },
4368  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4369  }};
4370  }
4371 
4372  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4373  !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
4374  return {};
4375 
4376  return {{
4377  [=](MachineInstrBuilder &MIB) { // rsrc
4378  MIB.addReg(Info->getScratchRSrcReg());
4379  },
4380  [=](MachineInstrBuilder &MIB) { // soffset
4381  MIB.addImm(0);
4382  },
4383  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4384  }};
4385 }
4386 
4387 std::pair<Register, unsigned>
4388 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4389  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4390  if (!RootDef)
4391  return std::make_pair(Root.getReg(), 0);
4392 
4393  int64_t ConstAddr = 0;
4394 
4395  Register PtrBase;
4396  int64_t Offset;
4397  std::tie(PtrBase, Offset) =
4398  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4399 
4400  if (Offset) {
4401  if (isDSOffsetLegal(PtrBase, Offset)) {
4402  // (add n0, c0)
4403  return std::make_pair(PtrBase, Offset);
4404  }
4405  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4406  // TODO
4407 
4408 
4409  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4410  // TODO
4411 
4412  }
4413 
4414  return std::make_pair(Root.getReg(), 0);
4415 }
4416 
4418 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4419  Register Reg;
4420  unsigned Offset;
4421  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4422  return {{
4423  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4424  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4425  }};
4426 }
4427 
4429 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4430  return selectDSReadWrite2(Root, 4);
4431 }
4432 
4434 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4435  return selectDSReadWrite2(Root, 8);
4436 }
4437 
4439 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4440  unsigned Size) const {
4441  Register Reg;
4442  unsigned Offset;
4443  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
4444  return {{
4445  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4446  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4447  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4448  }};
4449 }
4450 
4451 std::pair<Register, unsigned>
4452 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4453  unsigned Size) const {
4454  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4455  if (!RootDef)
4456  return std::make_pair(Root.getReg(), 0);
4457 
4458  int64_t ConstAddr = 0;
4459 
4460  Register PtrBase;
4461  int64_t Offset;
4462  std::tie(PtrBase, Offset) =
4463  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4464 
4465  if (Offset) {
4466  int64_t OffsetValue0 = Offset;
4467  int64_t OffsetValue1 = Offset + Size;
4468  if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4469  // (add n0, c0)
4470  return std::make_pair(PtrBase, OffsetValue0 / Size);
4471  }
4472  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4473  // TODO
4474 
4475  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4476  // TODO
4477 
4478  }
4479 
4480  return std::make_pair(Root.getReg(), 0);
4481 }
4482 
4483 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4484 /// the base value with the constant offset. There may be intervening copies
4485 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4486 /// not match the pattern.
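/// Illustrative MIR (names invented):
///   %c:_(s64)   = G_CONSTANT i64 16
///   %root:_(p1) = G_PTR_ADD %base:_(p1), %c:_(s64)
/// yields {%base, 16}, while a root with no such G_PTR_ADD yields {%root, 0}.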
4487 std::pair<Register, int64_t>
4488 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4489  Register Root, const MachineRegisterInfo &MRI) const {
4490  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4491  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4492  return {Root, 0};
4493 
4494  MachineOperand &RHS = RootI->getOperand(2);
4495  Optional<ValueAndVReg> MaybeOffset =
4496  getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4497  if (!MaybeOffset)
4498  return {Root, 0};
4499  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4500 }
4501 
4502 static void addZeroImm(MachineInstrBuilder &MIB) {
4503  MIB.addImm(0);
4504 }
4505 
4506 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4507 /// BasePtr is not valid, a null base pointer will be used.
4508 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4509  uint32_t FormatLo, uint32_t FormatHi,
4510  Register BasePtr) {
4511  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4512  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4513  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4514  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4515 
4516  B.buildInstr(AMDGPU::S_MOV_B32)
4517  .addDef(RSrc2)
4518  .addImm(FormatLo);
4519  B.buildInstr(AMDGPU::S_MOV_B32)
4520  .addDef(RSrc3)
4521  .addImm(FormatHi);
4522 
4523  // Build the half of the subregister with the constants before building the
4524  // full 128-bit register. If we are building multiple resource descriptors,
4525  // this will allow CSEing of the 2-component register.
4526  B.buildInstr(AMDGPU::REG_SEQUENCE)
4527  .addDef(RSrcHi)
4528  .addReg(RSrc2)
4529  .addImm(AMDGPU::sub0)
4530  .addReg(RSrc3)
4531  .addImm(AMDGPU::sub1);
4532 
4533  Register RSrcLo = BasePtr;
4534  if (!BasePtr) {
4535  RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4536  B.buildInstr(AMDGPU::S_MOV_B64)
4537  .addDef(RSrcLo)
4538  .addImm(0);
4539  }
4540 
4541  B.buildInstr(AMDGPU::REG_SEQUENCE)
4542  .addDef(RSrc)
4543  .addReg(RSrcLo)
4544  .addImm(AMDGPU::sub0_sub1)
4545  .addReg(RSrcHi)
4546  .addImm(AMDGPU::sub2_sub3);
4547 
4548  return RSrc;
4549 }
4550 
4551 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4552  const SIInstrInfo &TII, Register BasePtr) {
4553  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4554 
4555  // FIXME: Why are half the "default" bits ignored based on the addressing
4556  // mode?
4557  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4558 }
4559 
4560 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4561  const SIInstrInfo &TII, Register BasePtr) {
4562  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4563 
4564  // FIXME: Why are half the "default" bits ignored based on the addressing
4565  // mode?
4566  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4567 }
4568 
4569 AMDGPUInstructionSelector::MUBUFAddressData
4570 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4571  MUBUFAddressData Data;
4572  Data.N0 = Src;
4573 
4574  Register PtrBase;
4575  int64_t Offset;
4576 
4577  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4578  if (isUInt<32>(Offset)) {
4579  Data.N0 = PtrBase;
4580  Data.Offset = Offset;
4581  }
4582 
4583  if (MachineInstr *InputAdd
4584  = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4585  Data.N2 = InputAdd->getOperand(1).getReg();
4586  Data.N3 = InputAdd->getOperand(2).getReg();
4587 
4588  // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
4589  // FIXME: Don't know if this was defined by operand 0.
4590  //
4591  // TODO: Remove this when we have copy folding optimizations after
4592  // RegBankSelect.
4593  Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4594  Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4595  }
4596 
4597  return Data;
4598 }
4599 
4600 /// Return if the addr64 mubuf mode should be used for the given address.
4601 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4602  // (ptr_add N2, N3) -> addr64, or
4603  // (ptr_add (ptr_add N2, N3), C1) -> addr64
4604  if (Addr.N2)
4605  return true;
4606 
4607  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4608  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4609 }
4610 
4611 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4612 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4613 /// component.
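/// For example, assuming a 12-bit unsigned MUBUF immediate field (0..4095), an
/// offset of 4096 fails SIInstrInfo::isLegalMUBUFImmOffset, so the whole value
/// is moved into a fresh SGPR via S_MOV_B32 and ImmOffset is reset to 0.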
4614 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4615  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4616  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4617  return;
4618 
4619  // Illegal offset, store it in soffset.
4620  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4621  B.buildInstr(AMDGPU::S_MOV_B32)
4622  .addDef(SOffset)
4623  .addImm(ImmOffset);
4624  ImmOffset = 0;
4625 }
4626 
4627 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4628  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4629  Register &SOffset, int64_t &Offset) const {
4630  // FIXME: Predicates should stop this from reaching here.
4631  // The addr64 bit was removed for Volcanic Islands.
4632  if (!STI.hasAddr64() || STI.useFlatForGlobal())
4633  return false;
4634 
4635  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4636  if (!shouldUseAddr64(AddrData))
4637  return false;
4638 
4639  Register N0 = AddrData.N0;
4640  Register N2 = AddrData.N2;
4641  Register N3 = AddrData.N3;
4642  Offset = AddrData.Offset;
4643 
4644  // Base pointer for the SRD.
4645  Register SRDPtr;
4646 
4647  if (N2) {
4648  if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4649  assert(N3);
4650  if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4651  // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4652  // addr64, and construct the default resource from a 0 address.
4653  VAddr = N0;
4654  } else {
4655  SRDPtr = N3;
4656  VAddr = N2;
4657  }
4658  } else {
4659  // N2 is not divergent.
4660  SRDPtr = N2;
4661  VAddr = N3;
4662  }
4663  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4664  // Use the default null pointer in the resource
4665  VAddr = N0;
4666  } else {
4667  // N0 -> offset, or
4668  // (N0 + C1) -> offset
4669  SRDPtr = N0;
4670  }
4671 
4672  MachineIRBuilder B(*Root.getParent());
4673  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4674  splitIllegalMUBUFOffset(B, SOffset, Offset);
4675  return true;
4676 }
4677 
4678 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4679  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4680  int64_t &Offset) const {
4681 
4682  // FIXME: Pattern should not reach here.
4683  if (STI.useFlatForGlobal())
4684  return false;
4685 
4686  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4687  if (shouldUseAddr64(AddrData))
4688  return false;
4689 
4690  // N0 -> offset, or
4691  // (N0 + C1) -> offset
4692  Register SRDPtr = AddrData.N0;
4693  Offset = AddrData.Offset;
4694 
4695  // TODO: Look through extensions for 32-bit soffset.
4696  MachineIRBuilder B(*Root.getParent());
4697 
4698  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4699  splitIllegalMUBUFOffset(B, SOffset, Offset);
4700  return true;
4701 }
4702 
4704 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4705  Register VAddr;
4706  Register RSrcReg;
4707  Register SOffset;
4708  int64_t Offset = 0;
4709 
4710  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4711  return {};
4712 
4713  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4714  // pattern.
4715  return {{
4716  [=](MachineInstrBuilder &MIB) { // rsrc
4717  MIB.addReg(RSrcReg);
4718  },
4719  [=](MachineInstrBuilder &MIB) { // vaddr
4720  MIB.addReg(VAddr);
4721  },
4722  [=](MachineInstrBuilder &MIB) { // soffset
4723  if (SOffset)
4724  MIB.addReg(SOffset);
4725  else
4726  MIB.addImm(0);
4727  },
4728  [=](MachineInstrBuilder &MIB) { // offset
4729  MIB.addImm(Offset);
4730  },
4731  addZeroImm, // cpol
4732  addZeroImm, // tfe
4733  addZeroImm // swz
4734  }};
4735 }
4736 
4738 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4739  Register RSrcReg;
4740  Register SOffset;
4741  int64_t Offset = 0;
4742 
4743  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4744  return {};
4745 
4746  return {{
4747  [=](MachineInstrBuilder &MIB) { // rsrc
4748  MIB.addReg(RSrcReg);
4749  },
4750  [=](MachineInstrBuilder &MIB) { // soffset
4751  if (SOffset)
4752  MIB.addReg(SOffset);
4753  else
4754  MIB.addImm(0);
4755  },
4756  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4757  addZeroImm, // cpol
4758  addZeroImm, // tfe
4759  addZeroImm, // swz
4760  }};
4761 }
4762 
4764 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4765  Register VAddr;
4766  Register RSrcReg;
4767  Register SOffset;
4768  int64_t Offset = 0;
4769 
4770  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4771  return {};
4772 
4773  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4774  // pattern.
4775  return {{
4776  [=](MachineInstrBuilder &MIB) { // rsrc
4777  MIB.addReg(RSrcReg);
4778  },
4779  [=](MachineInstrBuilder &MIB) { // vaddr
4780  MIB.addReg(VAddr);
4781  },
4782  [=](MachineInstrBuilder &MIB) { // soffset
4783  if (SOffset)
4784  MIB.addReg(SOffset);
4785  else
4786  MIB.addImm(0);
4787  },
4788  [=](MachineInstrBuilder &MIB) { // offset
4789  MIB.addImm(Offset);
4790  },
4791  [=](MachineInstrBuilder &MIB) {
4792  MIB.addImm(AMDGPU::CPol::GLC); // cpol
4793  }
4794  }};
4795 }
4796 
4798 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4799  Register RSrcReg;
4800  Register SOffset;
4801  int64_t Offset = 0;
4802 
4803  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4804  return {};
4805 
4806  return {{
4807  [=](MachineInstrBuilder &MIB) { // rsrc
4808  MIB.addReg(RSrcReg);
4809  },
4810  [=](MachineInstrBuilder &MIB) { // soffset
4811  if (SOffset)
4812  MIB.addReg(SOffset);
4813  else
4814  MIB.addImm(0);
4815  },
4816  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4817  [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4818  }};
4819 }
4820 
4821 /// Get an immediate that must be 32-bits, and treated as zero extended.
4822 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4823  const MachineRegisterInfo &MRI) {
4824  // getIConstantVRegVal sexts any values, so see if that matters.
4825  Optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
4826  if (!OffsetVal || !isInt<32>(*OffsetVal))
4827  return None;
4828  return Lo_32(*OffsetVal);
4829 }
4830 
4832 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4833  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4834  if (!OffsetVal)
4835  return {};
4836 
4837  Optional<int64_t> EncodedImm =
4838  AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4839  if (!EncodedImm)
4840  return {};
4841 
4842  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4843 }
4844 
4846 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4847  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4848 
4849  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4850  if (!OffsetVal)
4851  return {};
4852 
4853  Optional<int64_t> EncodedImm
4854  = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4855  if (!EncodedImm)
4856  return {};
4857 
4858  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4859 }
4860 
4862 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
4863  // Match the (soffset + offset) pair as a 32-bit register base and
4864  // an immediate offset.
4865  Register SOffset;
4866  unsigned Offset;
4867  std::tie(SOffset, Offset) =
4868  AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KnownBits);
4869  if (!SOffset)
4870  return None;
4871 
4872  Optional<int64_t> EncodedOffset =
4873  AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
4874  if (!EncodedOffset)
4875  return None;
4876 
4877  assert(MRI->getType(SOffset) == LLT::scalar(32));
4878  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4879  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
4880 }
4881 
4882 // Variant of stripBitCast that returns the instruction instead of a
4883 // MachineOperand.
4884 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
4885  if (MI->getOpcode() == AMDGPU::G_BITCAST)
4886  return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
4887  return MI;
4888 }
4889 
4890 // Figure out if this is really an extract of the high 16-bits of a dword,
4891 // returns nullptr if it isn't.
4892 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
4893  MachineRegisterInfo &MRI) {
4894  Inst = stripBitCast(Inst, MRI);
4895 
4896  if (Inst->getOpcode() != AMDGPU::G_TRUNC)
4897  return nullptr;
4898 
4899  MachineInstr *TruncOp =
4900  getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
4901  TruncOp = stripBitCast(TruncOp, MRI);
4902 
4903  // G_LSHR x, (G_CONSTANT i32 16)
4904  if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
4905  auto SrlAmount = getIConstantVRegValWithLookThrough(
4906  TruncOp->getOperand(2).getReg(), MRI);
4907  if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
4908  MachineInstr *SrlOp =
4909  getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
4910  return stripBitCast(SrlOp, MRI);
4911  }
4912  }
4913 
4914  // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
4915  // 1, 0 swaps the low/high 16 bits.
4916  // 1, 1 sets the high 16 bits to be the same as the low 16.
4917  // in any case, it selects the high elts.
4918  if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
4919  assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
4920  LLT::fixed_vector(2, 16));
4921 
4922  ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
4923  assert(Mask.size() == 2);
4924 
4925  if (Mask[0] == 1 && Mask[1] <= 1) {
4926  MachineInstr *LHS =
4927  getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
4928  return stripBitCast(LHS, MRI);
4929  }
4930  }
4931 
4932  return nullptr;
4933 }
4934 
4935 std::pair<Register, unsigned>
4936 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
4937  bool &Matched) const {
4938  Matched = false;
4939 
4940  Register Src;
4941  unsigned Mods;
4942  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4943 
4944  MachineInstr *