LLVM  14.0.0git
AMDGPUInstructionSelector.cpp
Go to the documentation of this file.
1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/IR/DiagnosticInfo.h"
27 
28 #define DEBUG_TYPE "amdgpu-isel"
29 
30 using namespace llvm;
31 using namespace MIPatternMatch;
32 
34  "amdgpu-global-isel-risky-select",
35  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
36  cl::init(false),
38 
39 #define GET_GLOBALISEL_IMPL
40 #define AMDGPUSubtarget GCNSubtarget
41 #include "AMDGPUGenGlobalISel.inc"
42 #undef GET_GLOBALISEL_IMPL
43 #undef AMDGPUSubtarget
44 
46  const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
47  const AMDGPUTargetMachine &TM)
48  : InstructionSelector(), TII(*STI.getInstrInfo()),
49  TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
50  STI(STI),
51  EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
53 #include "AMDGPUGenGlobalISel.inc"
56 #include "AMDGPUGenGlobalISel.inc"
58 {
59 }
60 
61 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
62 
64  CodeGenCoverage &CoverageInfo,
65  ProfileSummaryInfo *PSI,
67  MRI = &MF.getRegInfo();
68  Subtarget = &MF.getSubtarget<GCNSubtarget>();
70 }
71 
72 bool AMDGPUInstructionSelector::isVCC(Register Reg,
73  const MachineRegisterInfo &MRI) const {
74  // The verifier is oblivious to s1 being a valid value for wavesize registers.
75  if (Reg.isPhysical())
76  return false;
77 
78  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
79  const TargetRegisterClass *RC =
80  RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
81  if (RC) {
82  const LLT Ty = MRI.getType(Reg);
83  return RC->hasSuperClassEq(TRI.getBoolRC()) &&
84  Ty.isValid() && Ty.getSizeInBits() == 1;
85  }
86 
87  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
88  return RB->getID() == AMDGPU::VCCRegBankID;
89 }
90 
91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
92  unsigned NewOpc) const {
93  MI.setDesc(TII.get(NewOpc));
94  MI.RemoveOperand(1); // Remove intrinsic ID.
95  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
96 
97  MachineOperand &Dst = MI.getOperand(0);
98  MachineOperand &Src = MI.getOperand(1);
99 
100  // TODO: This should be legalized to s32 if needed
101  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
102  return false;
103 
104  const TargetRegisterClass *DstRC
105  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
106  const TargetRegisterClass *SrcRC
107  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
108  if (!DstRC || DstRC != SrcRC)
109  return false;
110 
111  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
112  RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
113 }
114 
115 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
116  const DebugLoc &DL = I.getDebugLoc();
117  MachineBasicBlock *BB = I.getParent();
118  I.setDesc(TII.get(TargetOpcode::COPY));
119 
120  const MachineOperand &Src = I.getOperand(1);
121  MachineOperand &Dst = I.getOperand(0);
122  Register DstReg = Dst.getReg();
123  Register SrcReg = Src.getReg();
124 
125  if (isVCC(DstReg, *MRI)) {
126  if (SrcReg == AMDGPU::SCC) {
127  const TargetRegisterClass *RC
128  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
129  if (!RC)
130  return true;
131  return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
132  }
133 
134  if (!isVCC(SrcReg, *MRI)) {
135  // TODO: Should probably leave the copy and let copyPhysReg expand it.
136  if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
137  return false;
138 
139  const TargetRegisterClass *SrcRC
140  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
141 
142  Optional<ValueAndVReg> ConstVal =
143  getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
144  if (ConstVal) {
145  unsigned MovOpc =
146  STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
147  BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
148  .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
149  } else {
150  Register MaskedReg = MRI->createVirtualRegister(SrcRC);
151 
152  // We can't trust the high bits at this point, so clear them.
153 
154  // TODO: Skip masking high bits if def is known boolean.
155 
156  unsigned AndOpc =
157  TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
158  BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
159  .addImm(1)
160  .addReg(SrcReg);
161  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
162  .addImm(0)
163  .addReg(MaskedReg);
164  }
165 
166  if (!MRI->getRegClassOrNull(SrcReg))
167  MRI->setRegClass(SrcReg, SrcRC);
168  I.eraseFromParent();
169  return true;
170  }
171 
172  const TargetRegisterClass *RC =
173  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
174  if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
175  return false;
176 
177  return true;
178  }
179 
180  for (const MachineOperand &MO : I.operands()) {
181  if (MO.getReg().isPhysical())
182  continue;
183 
184  const TargetRegisterClass *RC =
185  TRI.getConstrainedRegClassForOperand(MO, *MRI);
186  if (!RC)
187  continue;
188  RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
189  }
190  return true;
191 }
192 
193 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
194  const Register DefReg = I.getOperand(0).getReg();
195  const LLT DefTy = MRI->getType(DefReg);
196  if (DefTy == LLT::scalar(1)) {
197  if (!AllowRiskySelect) {
198  LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
199  return false;
200  }
201 
202  LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
203  }
204 
205  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
206 
207  const RegClassOrRegBank &RegClassOrBank =
208  MRI->getRegClassOrRegBank(DefReg);
209 
210  const TargetRegisterClass *DefRC
211  = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
212  if (!DefRC) {
213  if (!DefTy.isValid()) {
214  LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
215  return false;
216  }
217 
218  const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
219  DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
220  if (!DefRC) {
221  LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
222  return false;
223  }
224  }
225 
226  // TODO: Verify that all registers have the same bank
227  I.setDesc(TII.get(TargetOpcode::PHI));
228  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
229 }
230 
232 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
233  const TargetRegisterClass &SubRC,
234  unsigned SubIdx) const {
235 
236  MachineInstr *MI = MO.getParent();
238  Register DstReg = MRI->createVirtualRegister(&SubRC);
239 
240  if (MO.isReg()) {
241  unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
242  Register Reg = MO.getReg();
243  BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
244  .addReg(Reg, 0, ComposedSubIdx);
245 
246  return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
247  MO.isKill(), MO.isDead(), MO.isUndef(),
248  MO.isEarlyClobber(), 0, MO.isDebug(),
249  MO.isInternalRead());
250  }
251 
252  assert(MO.isImm());
253 
254  APInt Imm(64, MO.getImm());
255 
256  switch (SubIdx) {
257  default:
258  llvm_unreachable("do not know to split immediate with this sub index.");
259  case AMDGPU::sub0:
260  return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
261  case AMDGPU::sub1:
262  return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
263  }
264 }
265 
266 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
267  switch (Opc) {
268  case AMDGPU::G_AND:
269  return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
270  case AMDGPU::G_OR:
271  return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
272  case AMDGPU::G_XOR:
273  return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
274  default:
275  llvm_unreachable("not a bit op");
276  }
277 }
278 
279 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
280  Register DstReg = I.getOperand(0).getReg();
281  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
282 
283  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
284  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
285  DstRB->getID() != AMDGPU::VCCRegBankID)
286  return false;
287 
288  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
289  STI.isWave64());
290  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
291 
292  // Dead implicit-def of scc
293  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
294  true, // isImp
295  false, // isKill
296  true)); // isDead
297  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
298 }
299 
300 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
301  MachineBasicBlock *BB = I.getParent();
302  MachineFunction *MF = BB->getParent();
303  Register DstReg = I.getOperand(0).getReg();
304  const DebugLoc &DL = I.getDebugLoc();
305  LLT Ty = MRI->getType(DstReg);
306  if (Ty.isVector())
307  return false;
308 
309  unsigned Size = Ty.getSizeInBits();
310  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
311  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
312  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
313 
314  if (Size == 32) {
315  if (IsSALU) {
316  const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
317  MachineInstr *Add =
318  BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
319  .add(I.getOperand(1))
320  .add(I.getOperand(2));
321  I.eraseFromParent();
322  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
323  }
324 
325  if (STI.hasAddNoCarry()) {
326  const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
327  I.setDesc(TII.get(Opc));
328  I.addOperand(*MF, MachineOperand::CreateImm(0));
329  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
330  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
331  }
332 
333  const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
334 
337  = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
338  .addDef(UnusedCarry, RegState::Dead)
339  .add(I.getOperand(1))
340  .add(I.getOperand(2))
341  .addImm(0);
342  I.eraseFromParent();
343  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
344  }
345 
346  assert(!Sub && "illegal sub should not reach here");
347 
348  const TargetRegisterClass &RC
349  = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
350  const TargetRegisterClass &HalfRC
351  = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
352 
353  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
354  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
355  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
356  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
357 
358  Register DstLo = MRI->createVirtualRegister(&HalfRC);
359  Register DstHi = MRI->createVirtualRegister(&HalfRC);
360 
361  if (IsSALU) {
362  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
363  .add(Lo1)
364  .add(Lo2);
365  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
366  .add(Hi1)
367  .add(Hi2);
368  } else {
369  const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
370  Register CarryReg = MRI->createVirtualRegister(CarryRC);
371  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
372  .addDef(CarryReg)
373  .add(Lo1)
374  .add(Lo2)
375  .addImm(0);
376  MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
378  .add(Hi1)
379  .add(Hi2)
380  .addReg(CarryReg, RegState::Kill)
381  .addImm(0);
382 
383  if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
384  return false;
385  }
386 
387  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
388  .addReg(DstLo)
389  .addImm(AMDGPU::sub0)
390  .addReg(DstHi)
391  .addImm(AMDGPU::sub1);
392 
393 
394  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
395  return false;
396 
397  I.eraseFromParent();
398  return true;
399 }
400 
401 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
402  MachineInstr &I) const {
403  MachineBasicBlock *BB = I.getParent();
404  MachineFunction *MF = BB->getParent();
405  const DebugLoc &DL = I.getDebugLoc();
406  Register Dst0Reg = I.getOperand(0).getReg();
407  Register Dst1Reg = I.getOperand(1).getReg();
408  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
409  I.getOpcode() == AMDGPU::G_UADDE;
410  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
411  I.getOpcode() == AMDGPU::G_USUBE;
412 
413  if (isVCC(Dst1Reg, *MRI)) {
414  unsigned NoCarryOpc =
415  IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
416  unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
417  I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
418  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
419  I.addOperand(*MF, MachineOperand::CreateImm(0));
420  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
421  }
422 
423  Register Src0Reg = I.getOperand(2).getReg();
424  Register Src1Reg = I.getOperand(3).getReg();
425 
426  if (HasCarryIn) {
427  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
428  .addReg(I.getOperand(4).getReg());
429  }
430 
431  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
432  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
433 
434  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
435  .add(I.getOperand(2))
436  .add(I.getOperand(3));
437  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
439 
440  if (!MRI->getRegClassOrNull(Dst1Reg))
441  MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
442 
443  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
444  !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
445  !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
446  return false;
447 
448  if (HasCarryIn &&
449  !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
450  AMDGPU::SReg_32RegClass, *MRI))
451  return false;
452 
453  I.eraseFromParent();
454  return true;
455 }
456 
457 // TODO: We should probably legalize these to only using 32-bit results.
458 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
459  MachineBasicBlock *BB = I.getParent();
460  Register DstReg = I.getOperand(0).getReg();
461  Register SrcReg = I.getOperand(1).getReg();
462  LLT DstTy = MRI->getType(DstReg);
463  LLT SrcTy = MRI->getType(SrcReg);
464  const unsigned SrcSize = SrcTy.getSizeInBits();
465  unsigned DstSize = DstTy.getSizeInBits();
466 
467  // TODO: Should handle any multiple of 32 offset.
468  unsigned Offset = I.getOperand(2).getImm();
469  if (Offset % 32 != 0 || DstSize > 128)
470  return false;
471 
472  // 16-bit operations really use 32-bit registers.
473  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
474  if (DstSize == 16)
475  DstSize = 32;
476 
477  const TargetRegisterClass *DstRC =
478  TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
479  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
480  return false;
481 
482  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
483  const TargetRegisterClass *SrcRC =
484  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
485  if (!SrcRC)
486  return false;
488  DstSize / 32);
489  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
490  if (!SrcRC)
491  return false;
492 
493  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
494  *SrcRC, I.getOperand(1));
495  const DebugLoc &DL = I.getDebugLoc();
496  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
497  .addReg(SrcReg, 0, SubReg);
498 
499  I.eraseFromParent();
500  return true;
501 }
502 
503 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
504  MachineBasicBlock *BB = MI.getParent();
505  Register DstReg = MI.getOperand(0).getReg();
506  LLT DstTy = MRI->getType(DstReg);
507  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
508 
509  const unsigned SrcSize = SrcTy.getSizeInBits();
510  if (SrcSize < 32)
511  return selectImpl(MI, *CoverageInfo);
512 
513  const DebugLoc &DL = MI.getDebugLoc();
514  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
515  const unsigned DstSize = DstTy.getSizeInBits();
516  const TargetRegisterClass *DstRC =
517  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
518  if (!DstRC)
519  return false;
520 
521  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
522  MachineInstrBuilder MIB =
523  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
524  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
525  MachineOperand &Src = MI.getOperand(I + 1);
526  MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
527  MIB.addImm(SubRegs[I]);
528 
529  const TargetRegisterClass *SrcRC
530  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
531  if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
532  return false;
533  }
534 
535  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
536  return false;
537 
538  MI.eraseFromParent();
539  return true;
540 }
541 
542 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
543  MachineBasicBlock *BB = MI.getParent();
544  const int NumDst = MI.getNumOperands() - 1;
545 
546  MachineOperand &Src = MI.getOperand(NumDst);
547 
548  Register SrcReg = Src.getReg();
549  Register DstReg0 = MI.getOperand(0).getReg();
550  LLT DstTy = MRI->getType(DstReg0);
551  LLT SrcTy = MRI->getType(SrcReg);
552 
553  const unsigned DstSize = DstTy.getSizeInBits();
554  const unsigned SrcSize = SrcTy.getSizeInBits();
555  const DebugLoc &DL = MI.getDebugLoc();
556  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
557 
558  const TargetRegisterClass *SrcRC =
559  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
560  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
561  return false;
562 
563  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
564  // source, and this relies on the fact that the same subregister indices are
565  // used for both.
566  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
567  for (int I = 0, E = NumDst; I != E; ++I) {
568  MachineOperand &Dst = MI.getOperand(I);
569  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
570  .addReg(SrcReg, 0, SubRegs[I]);
571 
572  // Make sure the subregister index is valid for the source register.
573  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
574  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
575  return false;
576 
577  const TargetRegisterClass *DstRC =
578  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
579  if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
580  return false;
581  }
582 
583  MI.eraseFromParent();
584  return true;
585 }
586 
587 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
588  MachineInstr &MI) const {
589  if (selectImpl(MI, *CoverageInfo))
590  return true;
591 
592  const LLT S32 = LLT::scalar(32);
593  const LLT V2S16 = LLT::fixed_vector(2, 16);
594 
595  Register Dst = MI.getOperand(0).getReg();
596  if (MRI->getType(Dst) != V2S16)
597  return false;
598 
599  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
600  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
601  return false;
602 
603  Register Src0 = MI.getOperand(1).getReg();
604  Register Src1 = MI.getOperand(2).getReg();
605  if (MRI->getType(Src0) != S32)
606  return false;
607 
608  const DebugLoc &DL = MI.getDebugLoc();
609  MachineBasicBlock *BB = MI.getParent();
610 
611  auto ConstSrc1 =
612  getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
613  if (ConstSrc1) {
614  auto ConstSrc0 =
615  getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
616  if (ConstSrc0) {
617  const int64_t K0 = ConstSrc0->Value.getSExtValue();
618  const int64_t K1 = ConstSrc1->Value.getSExtValue();
619  uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
620  uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
621 
622  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
623  .addImm(Lo16 | (Hi16 << 16));
624  MI.eraseFromParent();
625  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
626  }
627  }
628 
629  // TODO: This should probably be a combine somewhere
630  // (build_vector_trunc $src0, undef -> copy $src0
631  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
632  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
633  MI.setDesc(TII.get(AMDGPU::COPY));
634  MI.RemoveOperand(2);
635  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
636  RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
637  }
638 
639  Register ShiftSrc0;
640  Register ShiftSrc1;
641 
642  // With multiple uses of the shift, this will duplicate the shift and
643  // increase register pressure.
644  //
645  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
646  // => (S_PACK_HH_B32_B16 $src0, $src1)
647  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
648  // => (S_PACK_LH_B32_B16 $src0, $src1)
649  // (build_vector_trunc $src0, $src1)
650  // => (S_PACK_LL_B32_B16 $src0, $src1)
651 
652  bool Shift0 = mi_match(
653  Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
654 
655  bool Shift1 = mi_match(
656  Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
657 
658  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
659  if (Shift0 && Shift1) {
660  Opc = AMDGPU::S_PACK_HH_B32_B16;
661  MI.getOperand(1).setReg(ShiftSrc0);
662  MI.getOperand(2).setReg(ShiftSrc1);
663  } else if (Shift1) {
664  Opc = AMDGPU::S_PACK_LH_B32_B16;
665  MI.getOperand(2).setReg(ShiftSrc1);
666  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
667  // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
668  auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
669  .addReg(ShiftSrc0)
670  .addImm(16);
671 
672  MI.eraseFromParent();
673  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
674  }
675 
676  MI.setDesc(TII.get(Opc));
677  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
678 }
679 
680 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
681  return selectG_ADD_SUB(I);
682 }
683 
684 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
685  const MachineOperand &MO = I.getOperand(0);
686 
687  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
688  // regbank check here is to know why getConstrainedRegClassForOperand failed.
689  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
690  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
691  (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
692  I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
693  return true;
694  }
695 
696  return false;
697 }
698 
699 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
700  MachineBasicBlock *BB = I.getParent();
701 
702  Register DstReg = I.getOperand(0).getReg();
703  Register Src0Reg = I.getOperand(1).getReg();
704  Register Src1Reg = I.getOperand(2).getReg();
705  LLT Src1Ty = MRI->getType(Src1Reg);
706 
707  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
708  unsigned InsSize = Src1Ty.getSizeInBits();
709 
710  int64_t Offset = I.getOperand(3).getImm();
711 
712  // FIXME: These cases should have been illegal and unnecessary to check here.
713  if (Offset % 32 != 0 || InsSize % 32 != 0)
714  return false;
715 
716  // Currently not handled by getSubRegFromChannel.
717  if (InsSize > 128)
718  return false;
719 
720  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
721  if (SubReg == AMDGPU::NoSubRegister)
722  return false;
723 
724  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
725  const TargetRegisterClass *DstRC =
726  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
727  if (!DstRC)
728  return false;
729 
730  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
731  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
732  const TargetRegisterClass *Src0RC =
733  TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
734  const TargetRegisterClass *Src1RC =
735  TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
736 
737  // Deal with weird cases where the class only partially supports the subreg
738  // index.
739  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
740  if (!Src0RC || !Src1RC)
741  return false;
742 
743  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
744  !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
745  !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
746  return false;
747 
748  const DebugLoc &DL = I.getDebugLoc();
749  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
750  .addReg(Src0Reg)
751  .addReg(Src1Reg)
752  .addImm(SubReg);
753 
754  I.eraseFromParent();
755  return true;
756 }
757 
758 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
759  Register DstReg = MI.getOperand(0).getReg();
760  Register SrcReg = MI.getOperand(1).getReg();
761  Register OffsetReg = MI.getOperand(2).getReg();
762  Register WidthReg = MI.getOperand(3).getReg();
763 
764  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
765  "scalar BFX instructions are expanded in regbankselect");
766  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
767  "64-bit vector BFX instructions are expanded in regbankselect");
768 
769  const DebugLoc &DL = MI.getDebugLoc();
770  MachineBasicBlock *MBB = MI.getParent();
771 
772  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
773  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
774  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
775  .addReg(SrcReg)
776  .addReg(OffsetReg)
777  .addReg(WidthReg);
778  MI.eraseFromParent();
779  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
780 }
781 
782 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
783  if (STI.getLDSBankCount() != 16)
784  return selectImpl(MI, *CoverageInfo);
785 
786  Register Dst = MI.getOperand(0).getReg();
787  Register Src0 = MI.getOperand(2).getReg();
788  Register M0Val = MI.getOperand(6).getReg();
789  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
790  !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
791  !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
792  return false;
793 
794  // This requires 2 instructions. It is possible to write a pattern to support
795  // this, but the generated isel emitter doesn't correctly deal with multiple
796  // output instructions using the same physical register input. The copy to m0
797  // is incorrectly placed before the second instruction.
798  //
799  // TODO: Match source modifiers.
800 
801  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
802  const DebugLoc &DL = MI.getDebugLoc();
803  MachineBasicBlock *MBB = MI.getParent();
804 
805  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
806  .addReg(M0Val);
807  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
808  .addImm(2)
809  .addImm(MI.getOperand(4).getImm()) // $attr
810  .addImm(MI.getOperand(3).getImm()); // $attrchan
811 
812  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
813  .addImm(0) // $src0_modifiers
814  .addReg(Src0) // $src0
815  .addImm(MI.getOperand(4).getImm()) // $attr
816  .addImm(MI.getOperand(3).getImm()) // $attrchan
817  .addImm(0) // $src2_modifiers
818  .addReg(InterpMov) // $src2 - 2 f16 values selected by high
819  .addImm(MI.getOperand(5).getImm()) // $high
820  .addImm(0) // $clamp
821  .addImm(0); // $omod
822 
823  MI.eraseFromParent();
824  return true;
825 }
826 
827 // Writelane is special in that it can use SGPR and M0 (which would normally
828 // count as using the constant bus twice - but in this case it is allowed since
829 // the lane selector doesn't count as a use of the constant bus). However, it is
830 // still required to abide by the 1 SGPR rule. Fix this up if we might have
831 // multiple SGPRs.
832 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
833  // With a constant bus limit of at least 2, there's no issue.
834  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
835  return selectImpl(MI, *CoverageInfo);
836 
837  MachineBasicBlock *MBB = MI.getParent();
838  const DebugLoc &DL = MI.getDebugLoc();
839  Register VDst = MI.getOperand(0).getReg();
840  Register Val = MI.getOperand(2).getReg();
841  Register LaneSelect = MI.getOperand(3).getReg();
842  Register VDstIn = MI.getOperand(4).getReg();
843 
844  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
845 
846  Optional<ValueAndVReg> ConstSelect =
847  getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
848  if (ConstSelect) {
849  // The selector has to be an inline immediate, so we can use whatever for
850  // the other operands.
851  MIB.addReg(Val);
852  MIB.addImm(ConstSelect->Value.getSExtValue() &
853  maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
854  } else {
855  Optional<ValueAndVReg> ConstVal =
856  getConstantVRegValWithLookThrough(Val, *MRI, true, true);
857 
858  // If the value written is an inline immediate, we can get away without a
859  // copy to m0.
860  if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
861  STI.hasInv2PiInlineImm())) {
862  MIB.addImm(ConstVal->Value.getSExtValue());
863  MIB.addReg(LaneSelect);
864  } else {
865  MIB.addReg(Val);
866 
867  // If the lane selector was originally in a VGPR and copied with
868  // readfirstlane, there's a hazard to read the same SGPR from the
869  // VALU. Constrain to a different SGPR to help avoid needing a nop later.
870  RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
871 
872  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
873  .addReg(LaneSelect);
874  MIB.addReg(AMDGPU::M0);
875  }
876  }
877 
878  MIB.addReg(VDstIn);
879 
880  MI.eraseFromParent();
881  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
882 }
883 
884 // We need to handle this here because tablegen doesn't support matching
885 // instructions with multiple outputs.
886 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
887  Register Dst0 = MI.getOperand(0).getReg();
888  Register Dst1 = MI.getOperand(1).getReg();
889 
890  LLT Ty = MRI->getType(Dst0);
891  unsigned Opc;
892  if (Ty == LLT::scalar(32))
893  Opc = AMDGPU::V_DIV_SCALE_F32_e64;
894  else if (Ty == LLT::scalar(64))
895  Opc = AMDGPU::V_DIV_SCALE_F64_e64;
896  else
897  return false;
898 
899  // TODO: Match source modifiers.
900 
901  const DebugLoc &DL = MI.getDebugLoc();
902  MachineBasicBlock *MBB = MI.getParent();
903 
904  Register Numer = MI.getOperand(3).getReg();
905  Register Denom = MI.getOperand(4).getReg();
906  unsigned ChooseDenom = MI.getOperand(5).getImm();
907 
908  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
909 
910  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
911  .addDef(Dst1)
912  .addImm(0) // $src0_modifiers
913  .addUse(Src0) // $src0
914  .addImm(0) // $src1_modifiers
915  .addUse(Denom) // $src1
916  .addImm(0) // $src2_modifiers
917  .addUse(Numer) // $src2
918  .addImm(0) // $clamp
919  .addImm(0); // $omod
920 
921  MI.eraseFromParent();
922  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
923 }
924 
925 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
926  unsigned IntrinsicID = I.getIntrinsicID();
927  switch (IntrinsicID) {
928  case Intrinsic::amdgcn_if_break: {
929  MachineBasicBlock *BB = I.getParent();
930 
931  // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
932  // SelectionDAG uses for wave32 vs wave64.
933  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
934  .add(I.getOperand(0))
935  .add(I.getOperand(2))
936  .add(I.getOperand(3));
937 
938  Register DstReg = I.getOperand(0).getReg();
939  Register Src0Reg = I.getOperand(2).getReg();
940  Register Src1Reg = I.getOperand(3).getReg();
941 
942  I.eraseFromParent();
943 
944  for (Register Reg : { DstReg, Src0Reg, Src1Reg })
946 
947  return true;
948  }
949  case Intrinsic::amdgcn_interp_p1_f16:
950  return selectInterpP1F16(I);
951  case Intrinsic::amdgcn_wqm:
952  return constrainCopyLikeIntrin(I, AMDGPU::WQM);
953  case Intrinsic::amdgcn_softwqm:
954  return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
955  case Intrinsic::amdgcn_strict_wwm:
956  case Intrinsic::amdgcn_wwm:
957  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
958  case Intrinsic::amdgcn_strict_wqm:
959  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
960  case Intrinsic::amdgcn_writelane:
961  return selectWritelane(I);
962  case Intrinsic::amdgcn_div_scale:
963  return selectDivScale(I);
964  case Intrinsic::amdgcn_icmp:
965  return selectIntrinsicIcmp(I);
966  case Intrinsic::amdgcn_ballot:
967  return selectBallot(I);
968  case Intrinsic::amdgcn_reloc_constant:
969  return selectRelocConstant(I);
970  case Intrinsic::amdgcn_groupstaticsize:
971  return selectGroupStaticSize(I);
972  case Intrinsic::returnaddress:
973  return selectReturnAddress(I);
974  default:
975  return selectImpl(I, *CoverageInfo);
976  }
977 }
978 
979 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
980  if (Size != 32 && Size != 64)
981  return -1;
982  switch (P) {
983  default:
984  llvm_unreachable("Unknown condition code!");
985  case CmpInst::ICMP_NE:
986  return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
987  case CmpInst::ICMP_EQ:
988  return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
989  case CmpInst::ICMP_SGT:
990  return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
991  case CmpInst::ICMP_SGE:
992  return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
993  case CmpInst::ICMP_SLT:
994  return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
995  case CmpInst::ICMP_SLE:
996  return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
997  case CmpInst::ICMP_UGT:
998  return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
999  case CmpInst::ICMP_UGE:
1000  return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
1001  case CmpInst::ICMP_ULT:
1002  return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
1003  case CmpInst::ICMP_ULE:
1004  return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
1005  }
1006 }
1007 
1008 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1009  unsigned Size) const {
1010  if (Size == 64) {
1011  if (!STI.hasScalarCompareEq64())
1012  return -1;
1013 
1014  switch (P) {
1015  case CmpInst::ICMP_NE:
1016  return AMDGPU::S_CMP_LG_U64;
1017  case CmpInst::ICMP_EQ:
1018  return AMDGPU::S_CMP_EQ_U64;
1019  default:
1020  return -1;
1021  }
1022  }
1023 
1024  if (Size != 32)
1025  return -1;
1026 
1027  switch (P) {
1028  case CmpInst::ICMP_NE:
1029  return AMDGPU::S_CMP_LG_U32;
1030  case CmpInst::ICMP_EQ:
1031  return AMDGPU::S_CMP_EQ_U32;
1032  case CmpInst::ICMP_SGT:
1033  return AMDGPU::S_CMP_GT_I32;
1034  case CmpInst::ICMP_SGE:
1035  return AMDGPU::S_CMP_GE_I32;
1036  case CmpInst::ICMP_SLT:
1037  return AMDGPU::S_CMP_LT_I32;
1038  case CmpInst::ICMP_SLE:
1039  return AMDGPU::S_CMP_LE_I32;
1040  case CmpInst::ICMP_UGT:
1041  return AMDGPU::S_CMP_GT_U32;
1042  case CmpInst::ICMP_UGE:
1043  return AMDGPU::S_CMP_GE_U32;
1044  case CmpInst::ICMP_ULT:
1045  return AMDGPU::S_CMP_LT_U32;
1046  case CmpInst::ICMP_ULE:
1047  return AMDGPU::S_CMP_LE_U32;
1048  default:
1049  llvm_unreachable("Unknown condition code!");
1050  }
1051 }
1052 
1053 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1054  MachineBasicBlock *BB = I.getParent();
1055  const DebugLoc &DL = I.getDebugLoc();
1056 
1057  Register SrcReg = I.getOperand(2).getReg();
1058  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1059 
1060  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1061 
1062  Register CCReg = I.getOperand(0).getReg();
1063  if (!isVCC(CCReg, *MRI)) {
1064  int Opcode = getS_CMPOpcode(Pred, Size);
1065  if (Opcode == -1)
1066  return false;
1067  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1068  .add(I.getOperand(2))
1069  .add(I.getOperand(3));
1070  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1071  .addReg(AMDGPU::SCC);
1072  bool Ret =
1073  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1074  RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1075  I.eraseFromParent();
1076  return Ret;
1077  }
1078 
1079  int Opcode = getV_CMPOpcode(Pred, Size);
1080  if (Opcode == -1)
1081  return false;
1082 
1083  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1084  I.getOperand(0).getReg())
1085  .add(I.getOperand(2))
1086  .add(I.getOperand(3));
1088  *TRI.getBoolRC(), *MRI);
1089  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1090  I.eraseFromParent();
1091  return Ret;
1092 }
1093 
1094 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1095  Register Dst = I.getOperand(0).getReg();
1096  if (isVCC(Dst, *MRI))
1097  return false;
1098 
1099  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1100  return false;
1101 
1102  MachineBasicBlock *BB = I.getParent();
1103  const DebugLoc &DL = I.getDebugLoc();
1104  Register SrcReg = I.getOperand(2).getReg();
1105  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1106  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1107 
1108  int Opcode = getV_CMPOpcode(Pred, Size);
1109  if (Opcode == -1)
1110  return false;
1111 
1112  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1113  .add(I.getOperand(2))
1114  .add(I.getOperand(3));
1115  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1116  *MRI);
1117  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1118  I.eraseFromParent();
1119  return Ret;
1120 }
1121 
1122 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1123  MachineBasicBlock *BB = I.getParent();
1124  const DebugLoc &DL = I.getDebugLoc();
1125  Register DstReg = I.getOperand(0).getReg();
1126  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1127  const bool Is64 = Size == 64;
1128 
1129  if (Size != STI.getWavefrontSize())
1130  return false;
1131 
1133  getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1134 
1135  if (Arg.hasValue()) {
1136  const int64_t Value = Arg.getValue().Value.getSExtValue();
1137  if (Value == 0) {
1138  unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1139  BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1140  } else if (Value == -1) { // all ones
1141  Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1142  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1143  } else
1144  return false;
1145  } else {
1146  Register SrcReg = I.getOperand(2).getReg();
1147  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1148  }
1149 
1150  I.eraseFromParent();
1151  return true;
1152 }
1153 
1154 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1155  Register DstReg = I.getOperand(0).getReg();
1156  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1157  const TargetRegisterClass *DstRC =
1158  TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1159  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1160  return false;
1161 
1162  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1163 
1164  Module *M = MF->getFunction().getParent();
1165  const MDNode *Metadata = I.getOperand(2).getMetadata();
1166  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1167  auto RelocSymbol = cast<GlobalVariable>(
1168  M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1169 
1170  MachineBasicBlock *BB = I.getParent();
1171  BuildMI(*BB, &I, I.getDebugLoc(),
1172  TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1173  .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1174 
1175  I.eraseFromParent();
1176  return true;
1177 }
1178 
1179 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1181 
1182  Register DstReg = I.getOperand(0).getReg();
1183  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1184  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1185  AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1186 
1187  MachineBasicBlock *MBB = I.getParent();
1188  const DebugLoc &DL = I.getDebugLoc();
1189 
1190  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1191 
1192  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1194  MIB.addImm(MFI->getLDSSize());
1195  } else {
1196  Module *M = MF->getFunction().getParent();
1197  const GlobalValue *GV
1198  = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1200  }
1201 
1202  I.eraseFromParent();
1203  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1204 }
1205 
1206 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1207  MachineBasicBlock *MBB = I.getParent();
1209  const DebugLoc &DL = I.getDebugLoc();
1210 
1211  MachineOperand &Dst = I.getOperand(0);
1212  Register DstReg = Dst.getReg();
1213  unsigned Depth = I.getOperand(2).getImm();
1214 
1215  const TargetRegisterClass *RC
1216  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1217  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1218  !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1219  return false;
1220 
1221  // Check for kernel and shader functions
1222  if (Depth != 0 ||
1224  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1225  .addImm(0);
1226  I.eraseFromParent();
1227  return true;
1228  }
1229 
1230  MachineFrameInfo &MFI = MF.getFrameInfo();
1231  // There is a call to @llvm.returnaddress in this function
1232  MFI.setReturnAddressIsTaken(true);
1233 
1234  // Get the return address reg and mark it as an implicit live-in
1235  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1236  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1237  AMDGPU::SReg_64RegClass);
1238  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1239  .addReg(LiveIn);
1240  I.eraseFromParent();
1241  return true;
1242 }
1243 
1244 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1245  // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
1246  // SelectionDAG uses for wave32 vs wave64.
1247  MachineBasicBlock *BB = MI.getParent();
1248  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1249  .add(MI.getOperand(1));
1250 
1251  Register Reg = MI.getOperand(1).getReg();
1252  MI.eraseFromParent();
1253 
1254  if (!MRI->getRegClassOrNull(Reg))
1256  return true;
1257 }
1258 
1259 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1260  MachineInstr &MI, Intrinsic::ID IntrID) const {
1261  MachineBasicBlock *MBB = MI.getParent();
1263  const DebugLoc &DL = MI.getDebugLoc();
1264 
1265  unsigned IndexOperand = MI.getOperand(7).getImm();
1266  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1267  bool WaveDone = MI.getOperand(9).getImm() != 0;
1268 
1269  if (WaveDone && !WaveRelease)
1270  report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1271 
1272  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1273  IndexOperand &= ~0x3f;
1274  unsigned CountDw = 0;
1275 
1276  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1277  CountDw = (IndexOperand >> 24) & 0xf;
1278  IndexOperand &= ~(0xf << 24);
1279 
1280  if (CountDw < 1 || CountDw > 4) {
1282  "ds_ordered_count: dword count must be between 1 and 4");
1283  }
1284  }
1285 
1286  if (IndexOperand)
1287  report_fatal_error("ds_ordered_count: bad index operand");
1288 
1289  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1290  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1291 
1292  unsigned Offset0 = OrderedCountIndex << 2;
1293  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1294  (Instruction << 4);
1295 
1297  Offset1 |= (CountDw - 1) << 6;
1298 
1299  unsigned Offset = Offset0 | (Offset1 << 8);
1300 
1301  Register M0Val = MI.getOperand(2).getReg();
1302  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1303  .addReg(M0Val);
1304 
1305  Register DstReg = MI.getOperand(0).getReg();
1306  Register ValReg = MI.getOperand(3).getReg();
1308  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1309  .addReg(ValReg)
1310  .addImm(Offset)
1311  .cloneMemRefs(MI);
1312 
1313  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1314  return false;
1315 
1316  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1317  MI.eraseFromParent();
1318  return Ret;
1319 }
1320 
1321 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1322  switch (IntrID) {
1323  case Intrinsic::amdgcn_ds_gws_init:
1324  return AMDGPU::DS_GWS_INIT;
1325  case Intrinsic::amdgcn_ds_gws_barrier:
1326  return AMDGPU::DS_GWS_BARRIER;
1327  case Intrinsic::amdgcn_ds_gws_sema_v:
1328  return AMDGPU::DS_GWS_SEMA_V;
1329  case Intrinsic::amdgcn_ds_gws_sema_br:
1330  return AMDGPU::DS_GWS_SEMA_BR;
1331  case Intrinsic::amdgcn_ds_gws_sema_p:
1332  return AMDGPU::DS_GWS_SEMA_P;
1333  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1334  return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1335  default:
1336  llvm_unreachable("not a gws intrinsic");
1337  }
1338 }
1339 
1340 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1341  Intrinsic::ID IID) const {
1342  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1343  !STI.hasGWSSemaReleaseAll())
1344  return false;
1345 
1346  // intrinsic ID, vsrc, offset
1347  const bool HasVSrc = MI.getNumOperands() == 3;
1348  assert(HasVSrc || MI.getNumOperands() == 2);
1349 
1350  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1351  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1352  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1353  return false;
1354 
1355  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1356  assert(OffsetDef);
1357 
1358  unsigned ImmOffset;
1359 
1360  MachineBasicBlock *MBB = MI.getParent();
1361  const DebugLoc &DL = MI.getDebugLoc();
1362 
1363  MachineInstr *Readfirstlane = nullptr;
1364 
1365  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1366  // incoming offset, in case there's an add of a constant. We'll have to put it
1367  // back later.
1368  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1369  Readfirstlane = OffsetDef;
1370  BaseOffset = OffsetDef->getOperand(1).getReg();
1371  OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1372  }
1373 
1374  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1375  // If we have a constant offset, try to use the 0 in m0 as the base.
1376  // TODO: Look into changing the default m0 initialization value. If the
1377  // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1378  // the immediate offset.
1379 
1380  ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1381  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1382  .addImm(0);
1383  } else {
1384  std::tie(BaseOffset, ImmOffset) =
1385  AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1386 
1387  if (Readfirstlane) {
1388  // We have the constant offset now, so put the readfirstlane back on the
1389  // variable component.
1390  if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1391  return false;
1392 
1393  Readfirstlane->getOperand(1).setReg(BaseOffset);
1394  BaseOffset = Readfirstlane->getOperand(0).getReg();
1395  } else {
1396  if (!RBI.constrainGenericRegister(BaseOffset,
1397  AMDGPU::SReg_32RegClass, *MRI))
1398  return false;
1399  }
1400 
1401  Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1402  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1403  .addReg(BaseOffset)
1404  .addImm(16);
1405 
1406  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1407  .addReg(M0Base);
1408  }
1409 
1410  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1411  // offset field) % 64. Some versions of the programming guide omit the m0
1412  // part, or claim it's from offset 0.
1413  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1414 
1415  if (HasVSrc) {
1416  Register VSrc = MI.getOperand(1).getReg();
1417 
1418  if (STI.needsAlignedVGPRs()) {
1419  // Add implicit aligned super-reg to force alignment on the data operand.
1420  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1421  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1422  Register NewVR =
1423  MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
1424  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
1425  .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
1426  .addImm(AMDGPU::sub0)
1427  .addReg(Undef)
1428  .addImm(AMDGPU::sub1);
1429  MIB.addReg(NewVR, 0, AMDGPU::sub0);
1430  MIB.addReg(NewVR, RegState::Implicit);
1431  } else {
1432  MIB.addReg(VSrc);
1433  }
1434 
1435  if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1436  return false;
1437  }
1438 
1439  MIB.addImm(ImmOffset)
1440  .cloneMemRefs(MI);
1441 
1442  MI.eraseFromParent();
1443  return true;
1444 }
1445 
1446 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1447  bool IsAppend) const {
1448  Register PtrBase = MI.getOperand(2).getReg();
1449  LLT PtrTy = MRI->getType(PtrBase);
1450  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1451 
1452  unsigned Offset;
1453  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1454 
1455  // TODO: Should this try to look through readfirstlane like GWS?
1456  if (!isDSOffsetLegal(PtrBase, Offset)) {
1457  PtrBase = MI.getOperand(2).getReg();
1458  Offset = 0;
1459  }
1460 
1461  MachineBasicBlock *MBB = MI.getParent();
1462  const DebugLoc &DL = MI.getDebugLoc();
1463  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1464 
1465  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1466  .addReg(PtrBase);
1467  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1468  return false;
1469 
1470  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1471  .addImm(Offset)
1472  .addImm(IsGDS ? -1 : 0)
1473  .cloneMemRefs(MI);
1474  MI.eraseFromParent();
1475  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1476 }
1477 
1478 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1479  if (TM.getOptLevel() > CodeGenOpt::None) {
1480  unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1481  if (WGSize <= STI.getWavefrontSize()) {
1482  MachineBasicBlock *MBB = MI.getParent();
1483  const DebugLoc &DL = MI.getDebugLoc();
1484  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1485  MI.eraseFromParent();
1486  return true;
1487  }
1488  }
1489  return selectImpl(MI, *CoverageInfo);
1490 }
1491 
1492 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1493  bool &IsTexFail) {
1494  if (TexFailCtrl)
1495  IsTexFail = true;
1496 
1497  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1498  TexFailCtrl &= ~(uint64_t)0x1;
1499  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1500  TexFailCtrl &= ~(uint64_t)0x2;
1501 
1502  return TexFailCtrl == 0;
1503 }
1504 
1505 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1507  MachineBasicBlock *MBB = MI.getParent();
1508  const DebugLoc &DL = MI.getDebugLoc();
1509 
1510  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1511  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1512 
1513  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1514  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1515  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1516  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1517  AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1518  unsigned IntrOpcode = Intr->BaseOpcode;
1519  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1520 
1521  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1522 
1523  Register VDataIn, VDataOut;
1524  LLT VDataTy;
1525  int NumVDataDwords = -1;
1526  bool IsD16 = false;
1527 
1528  bool Unorm;
1529  if (!BaseOpcode->Sampler)
1530  Unorm = true;
1531  else
1532  Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1533 
1534  bool TFE;
1535  bool LWE;
1536  bool IsTexFail = false;
1537  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1538  TFE, LWE, IsTexFail))
1539  return false;
1540 
1541  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1542  const bool IsA16 = (Flags & 1) != 0;
1543  const bool IsG16 = (Flags & 2) != 0;
1544 
1545  // A16 implies 16 bit gradients if subtarget doesn't support G16
1546  if (IsA16 && !STI.hasG16() && !IsG16)
1547  return false;
1548 
1549  unsigned DMask = 0;
1550  unsigned DMaskLanes = 0;
1551 
1552  if (BaseOpcode->Atomic) {
1553  VDataOut = MI.getOperand(0).getReg();
1554  VDataIn = MI.getOperand(2).getReg();
1555  LLT Ty = MRI->getType(VDataIn);
1556 
1557  // Be careful to allow atomic swap on 16-bit element vectors.
1558  const bool Is64Bit = BaseOpcode->AtomicX2 ?
1559  Ty.getSizeInBits() == 128 :
1560  Ty.getSizeInBits() == 64;
1561 
1562  if (BaseOpcode->AtomicX2) {
1563  assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1564 
1565  DMask = Is64Bit ? 0xf : 0x3;
1566  NumVDataDwords = Is64Bit ? 4 : 2;
1567  } else {
1568  DMask = Is64Bit ? 0x3 : 0x1;
1569  NumVDataDwords = Is64Bit ? 2 : 1;
1570  }
1571  } else {
1572  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1573  DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1574 
1575  // One memoperand is mandatory, except for getresinfo.
1576  // FIXME: Check this in verifier.
1577  if (!MI.memoperands_empty()) {
1578  const MachineMemOperand *MMO = *MI.memoperands_begin();
1579 
1580  // Infer d16 from the memory size, as the register type will be mangled by
1581  // unpacked subtargets, or by TFE.
1582  IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1583  }
1584 
1585  if (BaseOpcode->Store) {
1586  VDataIn = MI.getOperand(1).getReg();
1587  VDataTy = MRI->getType(VDataIn);
1588  NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1589  } else {
1590  VDataOut = MI.getOperand(0).getReg();
1591  VDataTy = MRI->getType(VDataOut);
1592  NumVDataDwords = DMaskLanes;
1593 
1594  if (IsD16 && !STI.hasUnpackedD16VMem())
1595  NumVDataDwords = (DMaskLanes + 1) / 2;
1596  }
1597  }
1598 
1599  // Optimize _L to _LZ when _L is zero
1600  if (LZMappingInfo) {
1601  // The legalizer replaced the register with an immediate 0 if we need to
1602  // change the opcode.
1603  const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1604  if (Lod.isImm()) {
1605  assert(Lod.getImm() == 0);
1606  IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1607  }
1608  }
1609 
1610  // Optimize _mip away, when 'lod' is zero
1611  if (MIPMappingInfo) {
1612  const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1613  if (Lod.isImm()) {
1614  assert(Lod.getImm() == 0);
1615  IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1616  }
1617  }
1618 
1619  // Set G16 opcode
1620  if (IsG16 && !IsA16) {
1621  const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1622  AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1623  assert(G16MappingInfo);
1624  IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1625  }
1626 
1627  // TODO: Check this in verifier.
1628  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1629 
1630  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1631  if (BaseOpcode->Atomic)
1632  CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1633  if (CPol & ~AMDGPU::CPol::ALL)
1634  return false;
1635 
1636  int NumVAddrRegs = 0;
1637  int NumVAddrDwords = 0;
1638  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1639  // Skip the $noregs and 0s inserted during legalization.
1640  MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1641  if (!AddrOp.isReg())
1642  continue; // XXX - Break?
1643 
1644  Register Addr = AddrOp.getReg();
1645  if (!Addr)
1646  break;
1647 
1648  ++NumVAddrRegs;
1649  NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1650  }
1651 
1652  // The legalizer preprocessed the intrinsic arguments. If we aren't using
1653  // NSA, these should have been packed into a single value in the first
1654  // address register
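  // (NSA places each address dword in its own VGPR, so it is only usable when
  // every address operand is a single dword and there is more than one of
  // them.)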
1655  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1656  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1657  LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1658  return false;
1659  }
1660 
1661  if (IsTexFail)
1662  ++NumVDataDwords;
1663 
1664  int Opcode = -1;
1665  if (IsGFX10Plus) {
1666  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1667  UseNSA ? AMDGPU::MIMGEncGfx10NSA
1668  : AMDGPU::MIMGEncGfx10Default,
1669  NumVDataDwords, NumVAddrDwords);
1670  } else {
1672  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1673  NumVDataDwords, NumVAddrDwords);
1674  if (Opcode == -1)
1675  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1676  NumVDataDwords, NumVAddrDwords);
1677  }
1678  assert(Opcode != -1);
1679 
1680  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1681  .cloneMemRefs(MI);
1682 
1683  if (VDataOut) {
1684  if (BaseOpcode->AtomicX2) {
1685  const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1686 
1687  Register TmpReg = MRI->createVirtualRegister(
1688  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1689  unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1690 
1691  MIB.addDef(TmpReg);
1692  if (!MRI->use_empty(VDataOut)) {
1693  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1694  .addReg(TmpReg, RegState::Kill, SubReg);
1695  }
1696 
1697  } else {
1698  MIB.addDef(VDataOut); // vdata output
1699  }
1700  }
1701 
1702  if (VDataIn)
1703  MIB.addReg(VDataIn); // vdata input
1704 
1705  for (int I = 0; I != NumVAddrRegs; ++I) {
1706  MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1707  if (SrcOp.isReg()) {
1708  assert(SrcOp.getReg() != 0);
1709  MIB.addReg(SrcOp.getReg());
1710  }
1711  }
1712 
1713  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1714  if (BaseOpcode->Sampler)
1715  MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1716 
1717  MIB.addImm(DMask); // dmask
1718 
1719  if (IsGFX10Plus)
1720  MIB.addImm(DimInfo->Encoding);
1721  MIB.addImm(Unorm);
1722 
1723  MIB.addImm(CPol);
1724  MIB.addImm(IsA16 && // a16 or r128
1725  STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1726  if (IsGFX10Plus)
1727  MIB.addImm(IsA16 ? -1 : 0);
1728 
1729  MIB.addImm(TFE); // tfe
1730  MIB.addImm(LWE); // lwe
1731  if (!IsGFX10Plus)
1732  MIB.addImm(DimInfo->DA ? -1 : 0);
1733  if (BaseOpcode->HasD16)
1734  MIB.addImm(IsD16 ? -1 : 0);
1735 
1736  if (IsTexFail) {
1737  // An image load instruction with TFE/LWE only conditionally writes to its
1738  // result registers. Initialize them to zero so that we always get well
1739  // defined result values.
1740  assert(VDataOut && !VDataIn);
1741  Register Tied = MRI->cloneVirtualRegister(VDataOut);
1742  Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1743  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1744  .addImm(0);
1745  auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1746  if (STI.usePRTStrictNull()) {
1747  // With enable-prt-strict-null enabled, initialize all result registers to
1748  // zero.
1749  auto RegSeq =
1750  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1751  for (auto Sub : Parts)
1752  RegSeq.addReg(Zero).addImm(Sub);
1753  } else {
1754  // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1755  // result register.
1756  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1757  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1758  auto RegSeq =
1759  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1760  for (auto Sub : Parts.drop_back(1))
1761  RegSeq.addReg(Undef).addImm(Sub);
1762  RegSeq.addReg(Zero).addImm(Parts.back());
1763  }
1764  MIB.addReg(Tied, RegState::Implicit);
1765  MIB->tieOperands(0, MIB->getNumOperands() - 1);
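  // Tying the zero-initialized value to the def makes the register allocator
  // assign both to the same registers, so any components the instruction does
  // not write keep their initialized contents.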
1766  }
1767 
1768  MI.eraseFromParent();
1769  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1770 }
1771 
1772 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1773  MachineInstr &I) const {
1774  unsigned IntrinsicID = I.getIntrinsicID();
1775  switch (IntrinsicID) {
1776  case Intrinsic::amdgcn_end_cf:
1777  return selectEndCfIntrinsic(I);
1778  case Intrinsic::amdgcn_ds_ordered_add:
1779  case Intrinsic::amdgcn_ds_ordered_swap:
1780  return selectDSOrderedIntrinsic(I, IntrinsicID);
1781  case Intrinsic::amdgcn_ds_gws_init:
1782  case Intrinsic::amdgcn_ds_gws_barrier:
1783  case Intrinsic::amdgcn_ds_gws_sema_v:
1784  case Intrinsic::amdgcn_ds_gws_sema_br:
1785  case Intrinsic::amdgcn_ds_gws_sema_p:
1786  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1787  return selectDSGWSIntrinsic(I, IntrinsicID);
1788  case Intrinsic::amdgcn_ds_append:
1789  return selectDSAppendConsume(I, true);
1790  case Intrinsic::amdgcn_ds_consume:
1791  return selectDSAppendConsume(I, false);
1792  case Intrinsic::amdgcn_s_barrier:
1793  return selectSBarrier(I);
1794  case Intrinsic::amdgcn_global_atomic_fadd:
1795  return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
1796  default: {
1797  return selectImpl(I, *CoverageInfo);
1798  }
1799  }
1800 }
1801 
1802 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1803  if (selectImpl(I, *CoverageInfo))
1804  return true;
1805 
1806  MachineBasicBlock *BB = I.getParent();
1807  const DebugLoc &DL = I.getDebugLoc();
1808 
1809  Register DstReg = I.getOperand(0).getReg();
1810  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1811  assert(Size <= 32 || Size == 64);
1812  const MachineOperand &CCOp = I.getOperand(1);
1813  Register CCReg = CCOp.getReg();
1814  if (!isVCC(CCReg, *MRI)) {
1815  unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1816  AMDGPU::S_CSELECT_B32;
1817  MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1818  .addReg(CCReg);
1819 
1820  // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1821  // bank, because it does not cover the register class that we use to represent
1822  // it. So we need to manually set the register class here.
1823  if (!MRI->getRegClassOrNull(CCReg))
1824  MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1825  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1826  .add(I.getOperand(2))
1827  .add(I.getOperand(3));
1828 
1829  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1830  constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1831  I.eraseFromParent();
1832  return Ret;
1833  }
1834 
1835  // Wide VGPR select should have been split in RegBankSelect.
1836  if (Size > 32)
1837  return false;
1838 
1839  MachineInstr *Select =
1840  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1841  .addImm(0)
1842  .add(I.getOperand(3))
1843  .addImm(0)
1844  .add(I.getOperand(2))
1845  .add(I.getOperand(1));
1846 
1847  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1848  I.eraseFromParent();
1849  return Ret;
1850 }
1851 
1852 static int sizeToSubRegIndex(unsigned Size) {
1853  switch (Size) {
1854  case 32:
1855  return AMDGPU::sub0;
1856  case 64:
1857  return AMDGPU::sub0_sub1;
1858  case 96:
1859  return AMDGPU::sub0_sub1_sub2;
1860  case 128:
1861  return AMDGPU::sub0_sub1_sub2_sub3;
1862  case 256:
1863  return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1864  default:
1865  if (Size < 32)
1866  return AMDGPU::sub0;
1867  if (Size > 256)
1868  return -1;
1869  llvm_unreachable("unhandled register size");
1870  }
1871 }
1872 
1873 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1874  Register DstReg = I.getOperand(0).getReg();
1875  Register SrcReg = I.getOperand(1).getReg();
1876  const LLT DstTy = MRI->getType(DstReg);
1877  const LLT SrcTy = MRI->getType(SrcReg);
1878  const LLT S1 = LLT::scalar(1);
1879 
1880  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1881  const RegisterBank *DstRB;
1882  if (DstTy == S1) {
1883  // This is a special case. We don't treat s1 for legalization artifacts as
1884  // vcc booleans.
1885  DstRB = SrcRB;
1886  } else {
1887  DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1888  if (SrcRB != DstRB)
1889  return false;
1890  }
1891 
1892  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1893 
1894  unsigned DstSize = DstTy.getSizeInBits();
1895  unsigned SrcSize = SrcTy.getSizeInBits();
1896 
1897  const TargetRegisterClass *SrcRC
1898  = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1899  const TargetRegisterClass *DstRC
1900  = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1901  if (!SrcRC || !DstRC)
1902  return false;
1903 
1904  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1905  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1906  LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1907  return false;
1908  }
1909 
1910  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
1911  MachineBasicBlock *MBB = I.getParent();
1912  const DebugLoc &DL = I.getDebugLoc();
1913 
1914  Register LoReg = MRI->createVirtualRegister(DstRC);
1915  Register HiReg = MRI->createVirtualRegister(DstRC);
1916  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1917  .addReg(SrcReg, 0, AMDGPU::sub0);
1918  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1919  .addReg(SrcReg, 0, AMDGPU::sub1);
1920 
1921  if (IsVALU && STI.hasSDWA()) {
1922  // Write the low 16-bits of the high element into the high 16-bits of the
1923  // low element.
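  // dst_sel = WORD_1 with dst_unused = UNUSED_PRESERVE writes only the high
  // 16 bits of DstReg and keeps the low 16 bits from the tied LoReg operand.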
1924  MachineInstr *MovSDWA =
1925  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1926  .addImm(0) // $src0_modifiers
1927  .addReg(HiReg) // $src0
1928  .addImm(0) // $clamp
1929  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1930  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1931  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1932  .addReg(LoReg, RegState::Implicit);
1933  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1934  } else {
1935  Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1936  Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1937  Register ImmReg = MRI->createVirtualRegister(DstRC);
1938  if (IsVALU) {
1939  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1940  .addImm(16)
1941  .addReg(HiReg);
1942  } else {
1943  BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1944  .addReg(HiReg)
1945  .addImm(16);
1946  }
1947 
1948  unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1949  unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1950  unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1951 
1952  BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1953  .addImm(0xffff);
1954  BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1955  .addReg(LoReg)
1956  .addReg(ImmReg);
1957  BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1958  .addReg(TmpReg0)
1959  .addReg(TmpReg1);
1960  }
1961 
1962  I.eraseFromParent();
1963  return true;
1964  }
1965 
1966  if (!DstTy.isScalar())
1967  return false;
1968 
1969  if (SrcSize > 32) {
1970  int SubRegIdx = sizeToSubRegIndex(DstSize);
1971  if (SubRegIdx == -1)
1972  return false;
1973 
1974  // Deal with weird cases where the class only partially supports the subreg
1975  // index.
1976  const TargetRegisterClass *SrcWithSubRC
1977  = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1978  if (!SrcWithSubRC)
1979  return false;
1980 
1981  if (SrcWithSubRC != SrcRC) {
1982  if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1983  return false;
1984  }
1985 
1986  I.getOperand(1).setSubReg(SubRegIdx);
1987  }
1988 
1989  I.setDesc(TII.get(TargetOpcode::COPY));
1990  return true;
1991 }
1992 
1993 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
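/// For example, Size = 4 gives Mask = 0xf (15), which fits in an inline
/// immediate, so a plain AND is used; Size = 16 gives Mask = 0xffff, which
/// does not, so a BFE is used instead.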
1994 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1995  Mask = maskTrailingOnes<unsigned>(Size);
1996  int SignedMask = static_cast<int>(Mask);
1997  return SignedMask >= -16 && SignedMask <= 64;
1998 }
1999 
2000 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2001 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2002  Register Reg, const MachineRegisterInfo &MRI,
2003  const TargetRegisterInfo &TRI) const {
2004  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2005  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2006  return RB;
2007 
2008  // Ignore the type, since we don't use vcc in artifacts.
2009  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2010  return &RBI.getRegBankFromRegClass(*RC, LLT());
2011  return nullptr;
2012 }
2013 
2014 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2015  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2016  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2017  const DebugLoc &DL = I.getDebugLoc();
2018  MachineBasicBlock &MBB = *I.getParent();
2019  const Register DstReg = I.getOperand(0).getReg();
2020  const Register SrcReg = I.getOperand(1).getReg();
2021 
2022  const LLT DstTy = MRI->getType(DstReg);
2023  const LLT SrcTy = MRI->getType(SrcReg);
2024  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2025  I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2026  const unsigned DstSize = DstTy.getSizeInBits();
2027  if (!DstTy.isScalar())
2028  return false;
2029 
2030  // Artifact casts should never use vcc.
2031  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2032 
2033  // FIXME: This should probably be illegal and split earlier.
2034  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2035  if (DstSize <= 32)
2036  return selectCOPY(I);
2037 
2038  const TargetRegisterClass *SrcRC =
2039  TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
2040  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2041  const TargetRegisterClass *DstRC =
2042  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
2043 
2044  Register UndefReg = MRI->createVirtualRegister(SrcRC);
2045  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2046  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2047  .addReg(SrcReg)
2048  .addImm(AMDGPU::sub0)
2049  .addReg(UndefReg)
2050  .addImm(AMDGPU::sub1);
2051  I.eraseFromParent();
2052 
2053  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2054  RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2055  }
2056 
2057  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2058  // 64-bit should have been split up in RegBankSelect
2059 
2060  // Try to use an and with a mask if it will save code size.
2061  unsigned Mask;
2062  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2063  MachineInstr *ExtI =
2064  BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2065  .addImm(Mask)
2066  .addReg(SrcReg);
2067  I.eraseFromParent();
2068  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2069  }
2070 
2071  const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2072  MachineInstr *ExtI =
2073  BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2074  .addReg(SrcReg)
2075  .addImm(0) // Offset
2076  .addImm(SrcSize); // Width
2077  I.eraseFromParent();
2078  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2079  }
2080 
2081  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2082  const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2083  AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2084  if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2085  return false;
2086 
2087  if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2088  const unsigned SextOpc = SrcSize == 8 ?
2089  AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2090  BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2091  .addReg(SrcReg);
2092  I.eraseFromParent();
2093  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2094  }
2095 
2096  const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2097  const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2098 
2099  // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
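  // For a sign-extend from 16 bits this is offset = 0, width = 16, i.e. an
  // immediate of 16 << 16 = 0x100000, which is how SrcSize << 16 is formed
  // below.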
2100  if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2101  // We need a 64-bit register source, but the high bits don't matter.
2102  Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2103  Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2104  unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2105 
2106  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2107  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2108  .addReg(SrcReg, 0, SubReg)
2109  .addImm(AMDGPU::sub0)
2110  .addReg(UndefReg)
2111  .addImm(AMDGPU::sub1);
2112 
2113  BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2114  .addReg(ExtReg)
2115  .addImm(SrcSize << 16);
2116 
2117  I.eraseFromParent();
2118  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2119  }
2120 
2121  unsigned Mask;
2122  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2123  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2124  .addReg(SrcReg)
2125  .addImm(Mask);
2126  } else {
2127  BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2128  .addReg(SrcReg)
2129  .addImm(SrcSize << 16);
2130  }
2131 
2132  I.eraseFromParent();
2133  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2134  }
2135 
2136  return false;
2137 }
2138 
2139 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2140  MachineBasicBlock *BB = I.getParent();
2141  MachineOperand &ImmOp = I.getOperand(1);
2142  Register DstReg = I.getOperand(0).getReg();
2143  unsigned Size = MRI->getType(DstReg).getSizeInBits();
2144 
2145  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2146  if (ImmOp.isFPImm()) {
2147  const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2148  ImmOp.ChangeToImmediate(Imm.getZExtValue());
2149  } else if (ImmOp.isCImm()) {
2150  ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2151  } else {
2152  llvm_unreachable("Not supported by g_constants");
2153  }
2154 
2155  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2156  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2157 
2158  unsigned Opcode;
2159  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2160  Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2161  } else {
2162  Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2163 
2164  // We should never produce s1 values on banks other than VCC. If the user of
2165  // this already constrained the register, we may incorrectly think it's VCC
2166  // if it wasn't originally.
2167  if (Size == 1)
2168  return false;
2169  }
2170 
2171  if (Size != 64) {
2172  I.setDesc(TII.get(Opcode));
2173  I.addImplicitDefUseOperands(*MF);
2174  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2175  }
2176 
2177  const DebugLoc &DL = I.getDebugLoc();
2178 
2179  APInt Imm(Size, I.getOperand(1).getImm());
2180 
2181  MachineInstr *ResInst;
2182  if (IsSgpr && TII.isInlineConstant(Imm)) {
2183  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2184  .addImm(I.getOperand(1).getImm());
2185  } else {
2186  const TargetRegisterClass *RC = IsSgpr ?
2187  &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2188  Register LoReg = MRI->createVirtualRegister(RC);
2189  Register HiReg = MRI->createVirtualRegister(RC);
2190 
2191  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2192  .addImm(Imm.trunc(32).getZExtValue());
2193 
2194  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2195  .addImm(Imm.ashr(32).getZExtValue());
2196 
2197  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2198  .addReg(LoReg)
2199  .addImm(AMDGPU::sub0)
2200  .addReg(HiReg)
2201  .addImm(AMDGPU::sub1);
2202  }
2203 
2204  // We can't call constrainSelectedInstRegOperands here, because it doesn't
2205  // work for target independent opcodes
2206  I.eraseFromParent();
2207  const TargetRegisterClass *DstRC =
2208  TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2209  if (!DstRC)
2210  return true;
2211  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2212 }
2213 
2214 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2215  // Only manually handle the f64 SGPR case.
2216  //
2217  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2218  // the bit ops theoretically have a second result due to the implicit def of
2219  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2220  // that is easy by disabling the check. The result works, but uses a
2221  // nonsensical sreg32orlds_and_sreg_1 regclass.
2222  //
2223  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2224  // the variadic REG_SEQUENCE operands.
2225 
2226  Register Dst = MI.getOperand(0).getReg();
2227  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2228  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2229  MRI->getType(Dst) != LLT::scalar(64))
2230  return false;
2231 
2232  Register Src = MI.getOperand(1).getReg();
2233  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2234  if (Fabs)
2235  Src = Fabs->getOperand(1).getReg();
2236 
2237  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2238  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2239  return false;
2240 
2241  MachineBasicBlock *BB = MI.getParent();
2242  const DebugLoc &DL = MI.getDebugLoc();
2243  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2244  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2245  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2246  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2247 
2248  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2249  .addReg(Src, 0, AMDGPU::sub0);
2250  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2251  .addReg(Src, 0, AMDGPU::sub1);
2252  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2253  .addImm(0x80000000);
2254 
2255  // Set or toggle sign bit.
2256  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2257  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2258  .addReg(HiReg)
2259  .addReg(ConstReg);
2260  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2261  .addReg(LoReg)
2262  .addImm(AMDGPU::sub0)
2263  .addReg(OpReg)
2264  .addImm(AMDGPU::sub1);
2265  MI.eraseFromParent();
2266  return true;
2267 }
2268 
2269 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2270 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2271  Register Dst = MI.getOperand(0).getReg();
2272  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2273  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2274  MRI->getType(Dst) != LLT::scalar(64))
2275  return false;
2276 
2277  Register Src = MI.getOperand(1).getReg();
2278  MachineBasicBlock *BB = MI.getParent();
2279  const DebugLoc &DL = MI.getDebugLoc();
2280  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2281  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2282  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2283  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2284 
2285  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2286  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2287  return false;
2288 
2289  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2290  .addReg(Src, 0, AMDGPU::sub0);
2291  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2292  .addReg(Src, 0, AMDGPU::sub1);
2293  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2294  .addImm(0x7fffffff);
2295 
2296  // Clear sign bit.
2297  // TODO: Should this use S_BITSET0_*?
2298  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2299  .addReg(HiReg)
2300  .addReg(ConstReg);
2301  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2302  .addReg(LoReg)
2303  .addImm(AMDGPU::sub0)
2304  .addReg(OpReg)
2305  .addImm(AMDGPU::sub1);
2306 
2307  MI.eraseFromParent();
2308  return true;
2309 }
2310 
2311 static bool isConstant(const MachineInstr &MI) {
2312  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2313 }
2314 
2315 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2316  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2317 
2318  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2319 
2320  assert(PtrMI);
2321 
2322  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2323  return;
2324 
2325  GEPInfo GEPInfo(*PtrMI);
2326 
2327  for (unsigned i = 1; i != 3; ++i) {
2328  const MachineOperand &GEPOp = PtrMI->getOperand(i);
2329  const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2330  assert(OpDef);
2331  if (i == 2 && isConstant(*OpDef)) {
2332  // TODO: Could handle constant base + variable offset, but a combine
2333  // probably should have commuted it.
2334  assert(GEPInfo.Imm == 0);
2335  GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2336  continue;
2337  }
2338  const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2339  if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2340  GEPInfo.SgprParts.push_back(GEPOp.getReg());
2341  else
2342  GEPInfo.VgprParts.push_back(GEPOp.getReg());
2343  }
2344 
2345  AddrInfo.push_back(GEPInfo);
2346  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2347 }
2348 
2349 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2350  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2351 }
2352 
2353 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2354  if (!MI.hasOneMemOperand())
2355  return false;
2356 
2357  const MachineMemOperand *MMO = *MI.memoperands_begin();
2358  const Value *Ptr = MMO->getValue();
2359 
2360  // UndefValue means this is a load of a kernel input. These are uniform.
2361  // Sometimes LDS instructions have constant pointers.
2362  // If Ptr is null, then that means this mem operand contains a
2363  // PseudoSourceValue like GOT.
2364  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2365  isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2366  return true;
2367 
2368  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2369  return true;
2370 
2371  const Instruction *I = dyn_cast<Instruction>(Ptr);
2372  return I && I->getMetadata("amdgpu.uniform");
2373 }
2374 
2375 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2376  for (const GEPInfo &GEPInfo : AddrInfo) {
2377  if (!GEPInfo.VgprParts.empty())
2378  return true;
2379  }
2380  return false;
2381 }
2382 
2383 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2384  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2385  unsigned AS = PtrTy.getAddressSpace();
2386  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2387  STI.ldsRequiresM0Init()) {
2388  MachineBasicBlock *BB = I.getParent();
2389 
2390  // If DS instructions require M0 initialization, insert it before selecting.
2391  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2392  .addImm(-1);
2393  }
2394 }
2395 
2396 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2397  MachineInstr &I) const {
2398  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
2399  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2400  unsigned AS = PtrTy.getAddressSpace();
2401  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2402  return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
2403  }
2404 
2405  initM0(I);
2406  return selectImpl(I, *CoverageInfo);
2407 }
2408 
2409 // TODO: No rtn optimization.
2410 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2411  MachineInstr &MI) const {
2412  Register PtrReg = MI.getOperand(1).getReg();
2413  const LLT PtrTy = MRI->getType(PtrReg);
2414  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2415  STI.useFlatForGlobal())
2416  return selectImpl(MI, *CoverageInfo);
2417 
2418  Register DstReg = MI.getOperand(0).getReg();
2419  const LLT Ty = MRI->getType(DstReg);
2420  const bool Is64 = Ty.getSizeInBits() == 64;
2421  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2422  Register TmpReg = MRI->createVirtualRegister(
2423  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2424 
2425  const DebugLoc &DL = MI.getDebugLoc();
2426  MachineBasicBlock *BB = MI.getParent();
2427 
2428  Register VAddr, RSrcReg, SOffset;
2429  int64_t Offset = 0;
2430 
2431  unsigned Opcode;
2432  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2433  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2434  AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2435  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2436  RSrcReg, SOffset, Offset)) {
2437  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2438  AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2439  } else
2440  return selectImpl(MI, *CoverageInfo);
2441 
2442  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2443  .addReg(MI.getOperand(2).getReg());
2444 
2445  if (VAddr)
2446  MIB.addReg(VAddr);
2447 
2448  MIB.addReg(RSrcReg);
2449  if (SOffset)
2450  MIB.addReg(SOffset);
2451  else
2452  MIB.addImm(0);
2453 
2454  MIB.addImm(Offset);
2455  MIB.addImm(AMDGPU::CPol::GLC);
2456  MIB.cloneMemRefs(MI);
2457 
2458  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2459  .addReg(TmpReg, RegState::Kill, SubReg);
2460 
2461  MI.eraseFromParent();
2462 
2463  MRI->setRegClass(
2464  DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2465  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2466 }
2467 
2468 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2469  if (Reg.isPhysical())
2470  return false;
2471 
2472  MachineInstr &MI = *MRI.getVRegDef(Reg);
2473  const unsigned Opcode = MI.getOpcode();
2474 
2475  if (Opcode == AMDGPU::COPY)
2476  return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2477 
2478  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2479  Opcode == AMDGPU::G_XOR)
2480  return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2481  isVCmpResult(MI.getOperand(2).getReg(), MRI);
2482 
2483  if (Opcode == TargetOpcode::G_INTRINSIC)
2484  return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
2485 
2486  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2487 }
2488 
2489 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2490  MachineBasicBlock *BB = I.getParent();
2491  MachineOperand &CondOp = I.getOperand(0);
2492  Register CondReg = CondOp.getReg();
2493  const DebugLoc &DL = I.getDebugLoc();
2494 
2495  unsigned BrOpcode;
2496  Register CondPhysReg;
2497  const TargetRegisterClass *ConstrainRC;
2498 
2499  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2500  // whether the branch is uniform when selecting the instruction. In
2501  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2502  // RegBankSelect knows what it's doing if the branch condition is scc, even
2503  // though it currently does not.
2504  if (!isVCC(CondReg, *MRI)) {
2505  if (MRI->getType(CondReg) != LLT::scalar(32))
2506  return false;
2507 
2508  CondPhysReg = AMDGPU::SCC;
2509  BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2510  ConstrainRC = &AMDGPU::SReg_32RegClass;
2511  } else {
2512  // FIXME: Should scc->vcc copies be ANDed with exec?
2513 
2514  // Unless the value of CondReg is a result of a V_CMP* instruction then we
2515  // need to insert an and with exec.
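  // (V_CMP results already have the bits of inactive lanes cleared, so the
  // extra AND with exec would be redundant for them.)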
2516  if (!isVCmpResult(CondReg, *MRI)) {
2517  const bool Is64 = STI.isWave64();
2518  const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2519  const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2520 
2521  Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2522  BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2523  .addReg(CondReg)
2524  .addReg(Exec);
2525  CondReg = TmpReg;
2526  }
2527 
2528  CondPhysReg = TRI.getVCC();
2529  BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2530  ConstrainRC = TRI.getBoolRC();
2531  }
2532 
2533  if (!MRI->getRegClassOrNull(CondReg))
2534  MRI->setRegClass(CondReg, ConstrainRC);
2535 
2536  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2537  .addReg(CondReg);
2538  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2539  .addMBB(I.getOperand(1).getMBB());
2540 
2541  I.eraseFromParent();
2542  return true;
2543 }
2544 
2545 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2546  MachineInstr &I) const {
2547  Register DstReg = I.getOperand(0).getReg();
2548  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2549  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2550  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2551  if (IsVGPR)
2552  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2553 
2554  return RBI.constrainGenericRegister(
2555  DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2556 }
2557 
2558 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2559  Register DstReg = I.getOperand(0).getReg();
2560  Register SrcReg = I.getOperand(1).getReg();
2561  Register MaskReg = I.getOperand(2).getReg();
2562  LLT Ty = MRI->getType(DstReg);
2563  LLT MaskTy = MRI->getType(MaskReg);
2564 
2565  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2566  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2567  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2568  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2569  if (DstRB != SrcRB) // Should only happen for hand written MIR.
2570  return false;
2571 
2572  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2573  const TargetRegisterClass &RegRC
2574  = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2575 
2576  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2577  *MRI);
2578  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2579  *MRI);
2580  const TargetRegisterClass *MaskRC =
2581  TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2582 
2583  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2584  !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2585  !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2586  return false;
2587 
2588  MachineBasicBlock *BB = I.getParent();
2589  const DebugLoc &DL = I.getDebugLoc();
2590  if (Ty.getSizeInBits() == 32) {
2591  assert(MaskTy.getSizeInBits() == 32 &&
2592  "ptrmask should have been narrowed during legalize");
2593 
2594  BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2595  .addReg(SrcReg)
2596  .addReg(MaskReg);
2597  I.eraseFromParent();
2598  return true;
2599  }
2600 
2601  Register HiReg = MRI->createVirtualRegister(&RegRC);
2602  Register LoReg = MRI->createVirtualRegister(&RegRC);
2603 
2604  // Extract the subregisters from the source pointer.
2605  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2606  .addReg(SrcReg, 0, AMDGPU::sub0);
2607  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2608  .addReg(SrcReg, 0, AMDGPU::sub1);
2609 
2610  Register MaskedLo, MaskedHi;
2611 
2612  // Try to avoid emitting a bit operation when we only need to touch half of
2613  // the 64-bit pointer.
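  // A typical alignment mask such as 0xfffffffffffffff0 has all of its high
  // 32 bits set, so only the low half of the pointer needs the AND.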
2614  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2615 
2616  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2617  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2618  if ((MaskOnes & MaskLo32) == MaskLo32) {
2619  // If all the bits in the low half are 1, we only need a copy for it.
2620  MaskedLo = LoReg;
2621  } else {
2622  // Extract the mask subregister and apply the and.
2623  Register MaskLo = MRI->createVirtualRegister(&RegRC);
2624  MaskedLo = MRI->createVirtualRegister(&RegRC);
2625 
2626  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2627  .addReg(MaskReg, 0, AMDGPU::sub0);
2628  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2629  .addReg(LoReg)
2630  .addReg(MaskLo);
2631  }
2632 
2633  if ((MaskOnes & MaskHi32) == MaskHi32) {
2634  // If all the bits in the high half are 1, we only need a copy for it.
2635  MaskedHi = HiReg;
2636  } else {
2637  Register MaskHi = MRI->createVirtualRegister(&RegRC);
2638  MaskedHi = MRI->createVirtualRegister(&RegRC);
2639 
2640  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2641  .addReg(MaskReg, 0, AMDGPU::sub1);
2642  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2643  .addReg(HiReg)
2644  .addReg(MaskHi);
2645  }
2646 
2647  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2648  .addReg(MaskedLo)
2649  .addImm(AMDGPU::sub0)
2650  .addReg(MaskedHi)
2651  .addImm(AMDGPU::sub1);
2652  I.eraseFromParent();
2653  return true;
2654 }
2655 
2656 /// Return the register to use for the index value, and the subregister to use
2657 /// for the indirectly accessed register.
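/// For example, indexing the dwords of a 128-bit register with an index of
/// base + 2 yields the pair (base, AMDGPU::sub2).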
2658 static std::pair<Register, unsigned>
2660  const SIRegisterInfo &TRI,
2661  const TargetRegisterClass *SuperRC,
2662  Register IdxReg,
2663  unsigned EltSize) {
2664  Register IdxBaseReg;
2665  int Offset;
2666 
2667  std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2668  if (IdxBaseReg == AMDGPU::NoRegister) {
2669  // This will happen if the index is a known constant. This should ordinarily
2670  // be legalized out, but handle it as a register just in case.
2671  assert(Offset == 0);
2672  IdxBaseReg = IdxReg;
2673  }
2674 
2675  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2676 
2677  // Skip out of bounds offsets, or else we would end up using an undefined
2678  // register.
2679  if (static_cast<unsigned>(Offset) >= SubRegs.size())
2680  return std::make_pair(IdxReg, SubRegs[0]);
2681  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2682 }
2683 
2684 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2685  MachineInstr &MI) const {
2686  Register DstReg = MI.getOperand(0).getReg();
2687  Register SrcReg = MI.getOperand(1).getReg();
2688  Register IdxReg = MI.getOperand(2).getReg();
2689 
2690  LLT DstTy = MRI->getType(DstReg);
2691  LLT SrcTy = MRI->getType(SrcReg);
2692 
2693  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2694  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2695  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2696 
2697  // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2698  // into a waterfall loop.
2699  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2700  return false;
2701 
2702  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2703  *MRI);
2704  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2705  *MRI);
2706  if (!SrcRC || !DstRC)
2707  return false;
2708  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2709  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2710  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2711  return false;
2712 
2713  MachineBasicBlock *BB = MI.getParent();
2714  const DebugLoc &DL = MI.getDebugLoc();
2715  const bool Is64 = DstTy.getSizeInBits() == 64;
2716 
2717  unsigned SubReg;
2718  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2719  DstTy.getSizeInBits() / 8);
2720 
2721  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2722  if (DstTy.getSizeInBits() != 32 && !Is64)
2723  return false;
2724 
2725  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2726  .addReg(IdxReg);
2727 
2728  unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2729  BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2730  .addReg(SrcReg, 0, SubReg)
2731  .addReg(SrcReg, RegState::Implicit);
2732  MI.eraseFromParent();
2733  return true;
2734  }
2735 
2736  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2737  return false;
2738 
2739  if (!STI.useVGPRIndexMode()) {
2740  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2741  .addReg(IdxReg);
2742  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2743  .addReg(SrcReg, 0, SubReg)
2744  .addReg(SrcReg, RegState::Implicit);
2745  MI.eraseFromParent();
2746  return true;
2747  }
2748 
2749  const MCInstrDesc &GPRIDXDesc =
2750  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2751  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2752  .addReg(SrcReg)
2753  .addReg(IdxReg)
2754  .addImm(SubReg);
2755 
2756  MI.eraseFromParent();
2757  return true;
2758 }
2759 
2760 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2761 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2762  MachineInstr &MI) const {
2763  Register DstReg = MI.getOperand(0).getReg();
2764  Register VecReg = MI.getOperand(1).getReg();
2765  Register ValReg = MI.getOperand(2).getReg();
2766  Register IdxReg = MI.getOperand(3).getReg();
2767 
2768  LLT VecTy = MRI->getType(DstReg);
2769  LLT ValTy = MRI->getType(ValReg);
2770  unsigned VecSize = VecTy.getSizeInBits();
2771  unsigned ValSize = ValTy.getSizeInBits();
2772 
2773  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2774  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2775  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2776 
2777  assert(VecTy.getElementType() == ValTy);
2778 
2779  // The index must be scalar. If it wasn't, RegBankSelect should have moved this
2780  // into a waterfall loop.
2781  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2782  return false;
2783 
2784  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2785  *MRI);
2786  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2787  *MRI);
2788 
2789  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2790  !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2791  !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2792  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2793  return false;
2794 
2795  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2796  return false;
2797 
2798  unsigned SubReg;
2799  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2800  ValSize / 8);
2801 
2802  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2803  STI.useVGPRIndexMode();
2804 
2805  MachineBasicBlock *BB = MI.getParent();
2806  const DebugLoc &DL = MI.getDebugLoc();
2807 
2808  if (!IndexMode) {
2809  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2810  .addReg(IdxReg);
2811 
2812  const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2813  VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2814  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2815  .addReg(VecReg)
2816  .addReg(ValReg)
2817  .addImm(SubReg);
2818  MI.eraseFromParent();
2819  return true;
2820  }
2821 
2822  const MCInstrDesc &GPRIDXDesc =
2823  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2824  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2825  .addReg(VecReg)
2826  .addReg(ValReg)
2827  .addReg(IdxReg)
2828  .addImm(SubReg);
2829 
2830  MI.eraseFromParent();
2831  return true;
2832 }
2833 
2834 static bool isZeroOrUndef(int X) {
2835  return X == 0 || X == -1;
2836 }
2837 
2838 static bool isOneOrUndef(int X) {
2839  return X == 1 || X == -1;
2840 }
2841 
2842 static bool isZeroOrOneOrUndef(int X) {
2843  return X == 0 || X == 1 || X == -1;
2844 }
2845 
2846 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2847 // 32-bit register.
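// For example, the mask <2, 3> reads both halves of Src1, so it is rewritten
// to <0, 1> and Src1 is returned; a mask that only reads Src0 (or undef) is
// returned unchanged.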
2848 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2849  ArrayRef<int> Mask) {
2850  NewMask[0] = Mask[0];
2851  NewMask[1] = Mask[1];
2852  if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2853  return Src0;
2854 
2855  assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2856  assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2857 
2858  // Shift the mask inputs to be 0/1;
2859  NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2860  NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2861  return Src1;
2862 }
2863 
2864 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2865 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2866  MachineInstr &MI) const {
2867  Register DstReg = MI.getOperand(0).getReg();
2868  Register Src0Reg = MI.getOperand(1).getReg();
2869  Register Src1Reg = MI.getOperand(2).getReg();
2870  ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2871 
2872  const LLT V2S16 = LLT::fixed_vector(2, 16);
2873  if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2874  return false;
2875 
2876  if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2877  return false;
2878 
2879  assert(ShufMask.size() == 2);
2880  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2881 
2882  MachineBasicBlock *MBB = MI.getParent();
2883  const DebugLoc &DL = MI.getDebugLoc();
2884 
2885  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2886  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2887  const TargetRegisterClass &RC = IsVALU ?
2888  AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2889 
2890  // Handle the degenerate case which should have folded out.
2891  if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2892  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2893 
2894  MI.eraseFromParent();
2895  return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2896  }
2897 
2898  // A legal VOP3P mask only reads one of the sources.
2899  int Mask[2];
2900  Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2901 
2902  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2903  !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2904  return false;
2905 
2906  // TODO: This also should have been folded out
2907  if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2908  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2909  .addReg(SrcVec);
2910 
2911  MI.eraseFromParent();
2912  return true;
2913  }
2914 
2915  if (Mask[0] == 1 && Mask[1] == -1) {
2916  if (IsVALU) {
2917  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2918  .addImm(16)
2919  .addReg(SrcVec);
2920  } else {
2921  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2922  .addReg(SrcVec)
2923  .addImm(16);
2924  }
2925  } else if (Mask[0] == -1 && Mask[1] == 0) {
2926  if (IsVALU) {
2927  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2928  .addImm(16)
2929  .addReg(SrcVec);
2930  } else {
2931  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2932  .addReg(SrcVec)
2933  .addImm(16);
2934  }
2935  } else if (Mask[0] == 0 && Mask[1] == 0) {
2936  if (IsVALU) {
2937  // Write low half of the register into the high half.
2938  MachineInstr *MovSDWA =
2939  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2940  .addImm(0) // $src0_modifiers
2941  .addReg(SrcVec) // $src0
2942  .addImm(0) // $clamp
2943  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2944  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2945  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2946  .addReg(SrcVec, RegState::Implicit);
2947  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2948  } else {
2949  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2950  .addReg(SrcVec)
2951  .addReg(SrcVec);
2952  }
2953  } else if (Mask[0] == 1 && Mask[1] == 1) {
2954  if (IsVALU) {
2955  // Write high half of the register into the low half.
2956  MachineInstr *MovSDWA =
2957  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2958  .addImm(0) // $src0_modifiers
2959  .addReg(SrcVec) // $src0
2960  .addImm(0) // $clamp
2961  .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2962  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2963  .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2964  .addReg(SrcVec, RegState::Implicit);
2965  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2966  } else {
2967  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2968  .addReg(SrcVec)
2969  .addReg(SrcVec);
2970  }
2971  } else if (Mask[0] == 1 && Mask[1] == 0) {
2972  if (IsVALU) {
2973  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2974  .addReg(SrcVec)
2975  .addReg(SrcVec)
2976  .addImm(16);
2977  } else {
2978  Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2979  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2980  .addReg(SrcVec)
2981  .addImm(16);
2982  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2983  .addReg(TmpReg)
2984  .addReg(SrcVec);
2985  }
2986  } else
2987  llvm_unreachable("all shuffle masks should be handled");
2988 
2989  MI.eraseFromParent();
2990  return true;
2991 }
2992 
2993 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2994  MachineInstr &MI) const {
2995  if (STI.hasGFX90AInsts())
2996  return selectImpl(MI, *CoverageInfo);
2997 
2998  MachineBasicBlock *MBB = MI.getParent();
2999  const DebugLoc &DL = MI.getDebugLoc();
3000 
3001  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3002  Function &F = MBB->getParent()->getFunction();
3003  DiagnosticInfoUnsupported
3004  NoFpRet(F, "return versions of fp atomics not supported",
3005  MI.getDebugLoc(), DS_Error);
3006  F.getContext().diagnose(NoFpRet);
3007  return false;
3008  }
3009 
3010  // FIXME: This is only needed because tablegen requires number of dst operands
3011  // in match and replace pattern to be the same. Otherwise patterns can be
3012  // exported from SDag path.
3013  MachineOperand &VDataIn = MI.getOperand(1);
3014  MachineOperand &VIndex = MI.getOperand(3);
3015  MachineOperand &VOffset = MI.getOperand(4);
3016  MachineOperand &SOffset = MI.getOperand(5);
3017  int16_t Offset = MI.getOperand(6).getImm();
3018 
3019  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
3020  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
3021 
3022  unsigned Opcode;
3023  if (HasVOffset) {
3024  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
3025  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
3026  } else {
3027  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
3028  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
3029  }
3030 
3031  if (MRI->getType(VDataIn.getReg()).isVector()) {
3032  switch (Opcode) {
3033  case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
3034  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
3035  break;
3036  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
3037  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
3038  break;
3039  case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
3040  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
3041  break;
3042  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
3043  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
3044  break;
3045  }
3046  }
3047 
3048  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
3049  I.add(VDataIn);
3050 
3051  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
3052  Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
3053  Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3054  BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3055  .addReg(VIndex.getReg())
3056  .addImm(AMDGPU::sub0)
3057  .addReg(VOffset.getReg())
3058  .addImm(AMDGPU::sub1);
3059 
3060  I.addReg(IdxReg);
3061  } else if (HasVIndex) {
3062  I.add(VIndex);
3063  } else if (HasVOffset) {
3064  I.add(VOffset);
3065  }
3066 
3067  I.add(MI.getOperand(2)); // rsrc
3068  I.add(SOffset);
3069  I.addImm(Offset);
3070  I.addImm(MI.getOperand(7).getImm()); // cpol
3071  I.cloneMemRefs(MI);
3072 
3073  MI.eraseFromParent();
3074 
3075  return true;
3076 }
3077 
3078 bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
3079  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
3080 
3081  if (STI.hasGFX90AInsts()) {
3082  // gfx90a adds return versions of the global atomic fadd instructions so no
3083  // special handling is required.
3084  return selectImpl(MI, *CoverageInfo);
3085  }
3086 
3087  MachineBasicBlock *MBB = MI.getParent();
3088  const DebugLoc &DL = MI.getDebugLoc();
3089 
3090  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3091  Function &F = MBB->getParent()->getFunction();
3092  DiagnosticInfoUnsupported
3093  NoFpRet(F, "return versions of fp atomics not supported",
3094  MI.getDebugLoc(), DS_Error);
3095  F.getContext().diagnose(NoFpRet);
3096  return false;
3097  }
3098 
3099  // FIXME: This is only needed because tablegen requires number of dst operands
3100  // in match and replace pattern to be the same. Otherwise patterns can be
3101  // exported from SDag path.
3102  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
3103 
3104  Register Data = DataOp.getReg();
3105  const unsigned Opc = MRI->getType(Data).isVector() ?
3106  AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3107  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3108  .addReg(Addr.first)
3109  .addReg(Data)
3110  .addImm(Addr.second)
3111  .addImm(0) // cpol
3112  .cloneMemRefs(MI);
3113 
3114  MI.eraseFromParent();
3115  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3116 }
3117 
3118 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3119  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3120  MI.RemoveOperand(1);
3121  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3122  return true;
3123 }
3124 
3125 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3126  if (I.isPHI())
3127  return selectPHI(I);
3128 
3129  if (!I.isPreISelOpcode()) {
3130  if (I.isCopy())
3131  return selectCOPY(I);
3132  return true;
3133  }
3134 
3135  switch (I.getOpcode()) {
3136  case TargetOpcode::G_AND:
3137  case TargetOpcode::G_OR:
3138  case TargetOpcode::G_XOR:
3139  if (selectImpl(I, *CoverageInfo))
3140  return true;
3141  return selectG_AND_OR_XOR(I);
3142  case TargetOpcode::G_ADD:
3143  case TargetOpcode::G_SUB:
3144  if (selectImpl(I, *CoverageInfo))
3145  return true;
3146  return selectG_ADD_SUB(I);
3147  case TargetOpcode::G_UADDO:
3148  case TargetOpcode::G_USUBO:
3149  case TargetOpcode::G_UADDE:
3150  case TargetOpcode::G_USUBE:
3151  return selectG_UADDO_USUBO_UADDE_USUBE(I);
3152  case TargetOpcode::G_INTTOPTR:
3153  case TargetOpcode::G_BITCAST:
3154  case TargetOpcode::G_PTRTOINT:
3155  return selectCOPY(I);
3156  case TargetOpcode::G_CONSTANT:
3157  case TargetOpcode::G_FCONSTANT:
3158  return selectG_CONSTANT(I);
3159  case TargetOpcode::G_FNEG:
3160  if (selectImpl(I, *CoverageInfo))
3161  return true;
3162  return selectG_FNEG(I);
3163  case TargetOpcode::G_FABS:
3164  if (selectImpl(I, *CoverageInfo))
3165  return true;
3166  return selectG_FABS(I);
3167  case TargetOpcode::G_EXTRACT:
3168  return selectG_EXTRACT(I);
3169  case TargetOpcode::G_MERGE_VALUES:
3170  case TargetOpcode::G_BUILD_VECTOR:
3171  case TargetOpcode::G_CONCAT_VECTORS:
3172  return selectG_MERGE_VALUES(I);
3173  case TargetOpcode::G_UNMERGE_VALUES:
3174  return selectG_UNMERGE_VALUES(I);
3175  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3176  return selectG_BUILD_VECTOR_TRUNC(I);
3177  case TargetOpcode::G_PTR_ADD:
3178  return selectG_PTR_ADD(I);
3179  case TargetOpcode::G_IMPLICIT_DEF:
3180  return selectG_IMPLICIT_DEF(I);
3181  case TargetOpcode::G_FREEZE:
3182  return selectCOPY(I);
3183  case TargetOpcode::G_INSERT:
3184  return selectG_INSERT(I);
3185  case TargetOpcode::G_INTRINSIC:
3186  return selectG_INTRINSIC(I);
3187  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3188  return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3189  case TargetOpcode::G_ICMP:
3190  if (selectG_ICMP(I))
3191  return true;
3192  return selectImpl(I, *CoverageInfo);
3193  case TargetOpcode::G_LOAD:
3194  case TargetOpcode::G_STORE:
3195  case TargetOpcode::G_ATOMIC_CMPXCHG:
3196  case TargetOpcode::G_ATOMICRMW_XCHG:
3197  case TargetOpcode::G_ATOMICRMW_ADD:
3198  case TargetOpcode::G_ATOMICRMW_SUB:
3199  case TargetOpcode::G_ATOMICRMW_AND:
3200  case TargetOpcode::G_ATOMICRMW_OR:
3201  case TargetOpcode::G_ATOMICRMW_XOR:
3202  case TargetOpcode::G_ATOMICRMW_MIN:
3203  case TargetOpcode::G_ATOMICRMW_MAX:
3204  case TargetOpcode::G_ATOMICRMW_UMIN:
3205  case TargetOpcode::G_ATOMICRMW_UMAX:
3206  case TargetOpcode::G_ATOMICRMW_FADD:
3207  case AMDGPU::G_AMDGPU_ATOMIC_INC:
3208  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3209  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3210  case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3211  return selectG_LOAD_STORE_ATOMICRMW(I);
3212  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3213  return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3214  case TargetOpcode::G_SELECT:
3215  return selectG_SELECT(I);
3216  case TargetOpcode::G_TRUNC:
3217  return selectG_TRUNC(I);
3218  case TargetOpcode::G_SEXT:
3219  case TargetOpcode::G_ZEXT:
3220  case TargetOpcode::G_ANYEXT:
3221  case TargetOpcode::G_SEXT_INREG:
3222  if (selectImpl(I, *CoverageInfo))
3223  return true;
3224  return selectG_SZA_EXT(I);
3225  case TargetOpcode::G_BRCOND:
3226  return selectG_BRCOND(I);
3227  case TargetOpcode::G_GLOBAL_VALUE:
3228  return selectG_GLOBAL_VALUE(I);
3229  case TargetOpcode::G_PTRMASK:
3230  return selectG_PTRMASK(I);
3231  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3232  return selectG_EXTRACT_VECTOR_ELT(I);
3233  case TargetOpcode::G_INSERT_VECTOR_ELT:
3234  return selectG_INSERT_VECTOR_ELT(I);
3235  case TargetOpcode::G_SHUFFLE_VECTOR:
3236  return selectG_SHUFFLE_VECTOR(I);
3237  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3238  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3239  const AMDGPU::ImageDimIntrinsicInfo *Intr
3240  = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3241  assert(Intr && "not an image intrinsic with image pseudo");
3242  return selectImageIntrinsic(I, Intr);
3243  }
3244  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3245  return selectBVHIntrinsic(I);
3246  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3247  return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3248  case AMDGPU::G_SBFX:
3249  case AMDGPU::G_UBFX:
3250  return selectG_SBFX_UBFX(I);
3251  default:
3252  return selectImpl(I, *CoverageInfo);
3253  }
3254  return false;
3255 }
3256 
3257 InstructionSelector::ComplexRendererFns
3258 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3259  return {{
3260  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3261  }};
3262 
3263 }
3264 
3265 std::pair<Register, unsigned>
3266 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3267  bool AllowAbs) const {
3268  Register Src = Root.getReg();
3269  Register OrigSrc = Src;
3270  unsigned Mods = 0;
3271  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3272 
3273  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3274  Src = MI->getOperand(1).getReg();
3275  Mods |= SISrcMods::NEG;
3276  MI = getDefIgnoringCopies(Src, *MRI);
3277  }
3278 
3279  if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3280  Src = MI->getOperand(1).getReg();
3281  Mods |= SISrcMods::ABS;
3282  }
3283 
3284  if (Mods != 0 &&
3285  RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3286  MachineInstr *UseMI = Root.getParent();
3287 
3288  // If we looked through copies to find source modifiers on an SGPR operand,
3289  // we now have an SGPR register source. To avoid potentially violating the
3290  // constant bus restriction, we need to insert a copy to a VGPR.
3291  Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3292  BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3293  TII.get(AMDGPU::COPY), VGPRSrc)
3294  .addReg(Src);
3295  Src = VGPRSrc;
3296  }
3297 
3298  return std::make_pair(Src, Mods);
3299 }
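// Illustrative note (added; not part of the original source): for an input
// such as %r = G_FNEG (G_FABS %x), the code above strips both modifiers and
// returns {%x, SISrcMods::NEG | SISrcMods::ABS} (ABS only when AllowAbs is
// true). If the stripped source lives in an SGPR, the COPY to a VGPR inserted
// above keeps the constant bus restriction satisfied.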
3300 
3301 ///
3302 /// This will select either an SGPR or VGPR operand and will save us from
3303 /// having to write an extra tablegen pattern.
3304 InstructionSelector::ComplexRendererFns
3305 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3306  return {{
3307  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3308  }};
3309 }
3310 
3311 InstructionSelector::ComplexRendererFns
3312 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3313  Register Src;
3314  unsigned Mods;
3315  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3316 
3317  return {{
3318  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3319  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3320  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3321  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3322  }};
3323 }
3324 
3325 InstructionSelector::ComplexRendererFns
3326 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3327  Register Src;
3328  unsigned Mods;
3329  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3330 
3331  return {{
3332  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3333  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3334  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3335  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3336  }};
3337 }
3338 
3339 InstructionSelector::ComplexRendererFns
3340 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3341  return {{
3342  [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3343  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3344  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3345  }};
3346 }
3347 
3348 InstructionSelector::ComplexRendererFns
3349 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3350  Register Src;
3351  unsigned Mods;
3352  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3353 
3354  return {{
3355  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3356  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3357  }};
3358 }
3359 
3360 InstructionSelector::ComplexRendererFns
3361 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3362  Register Src;
3363  unsigned Mods;
3364  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3365 
3366  return {{
3367  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3368  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3369  }};
3370 }
3371 
3372 InstructionSelector::ComplexRendererFns
3373 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3374  Register Reg = Root.getReg();
3375  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3376  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3377  Def->getOpcode() == AMDGPU::G_FABS))
3378  return {};
3379  return {{
3380  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3381  }};
3382 }
3383 
3384 std::pair<Register, unsigned>
3385 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3386  Register Src, const MachineRegisterInfo &MRI) const {
3387  unsigned Mods = 0;
3388  MachineInstr *MI = MRI.getVRegDef(Src);
3389 
3390  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3391  // It's possible to see an f32 fneg here, but unlikely.
3392  // TODO: Treat f32 fneg as only high bit.
3393  MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3394  Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3395  Src = MI->getOperand(1).getReg();
3396  MI = MRI.getVRegDef(Src);
3397  }
3398 
3399  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3400 
3401  // Packed instructions do not have abs modifiers.
3402  Mods |= SISrcMods::OP_SEL_1;
3403 
3404  return std::make_pair(Src, Mods);
3405 }
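// Illustrative note (added; hypothetical MIR): for %r:_(<2 x s16>) = G_FNEG %x
// the code above returns {%x, NEG | NEG_HI | OP_SEL_1}; for any other source
// it returns the source unchanged with just OP_SEL_1 set.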
3406 
3407 InstructionSelector::ComplexRendererFns
3408 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3409  MachineRegisterInfo &MRI
3410  = Root.getParent()->getParent()->getParent()->getRegInfo();
3411 
3412  Register Src;
3413  unsigned Mods;
3414  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3415 
3416  return {{
3417  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3418  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3419  }};
3420 }
3421 
3422 InstructionSelector::ComplexRendererFns
3423 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3424  Register Src;
3425  unsigned Mods;
3426  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3427  if (!isKnownNeverNaN(Src, *MRI))
3428  return None;
3429 
3430  return {{
3431  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3432  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3433  }};
3434 }
3435 
3436 InstructionSelector::ComplexRendererFns
3437 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3438  // FIXME: Handle op_sel
3439  return {{
3440  [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3441  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3442  }};
3443 }
3444 
3445 InstructionSelector::ComplexRendererFns
3446 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3447  SmallVector<GEPInfo, 4> AddrInfo;
3448  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3449 
3450  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3451  return None;
3452 
3453  const GEPInfo &GEPInfo = AddrInfo[0];
3454  Optional<int64_t> EncodedImm =
3455  AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3456  if (!EncodedImm)
3457  return None;
3458 
3459  unsigned PtrReg = GEPInfo.SgprParts[0];
3460  return {{
3461  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3462  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3463  }};
3464 }
3465 
3466 InstructionSelector::ComplexRendererFns
3467 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3468  SmallVector<GEPInfo, 4> AddrInfo;
3469  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3470 
3471  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3472  return None;
3473 
3474  const GEPInfo &GEPInfo = AddrInfo[0];
3475  Register PtrReg = GEPInfo.SgprParts[0];
3476  Optional<int64_t> EncodedImm =
3477  AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3478  if (!EncodedImm)
3479  return None;
3480 
3481  return {{
3482  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3483  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3484  }};
3485 }
3486 
3487 InstructionSelector::ComplexRendererFns
3488 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3489  MachineInstr *MI = Root.getParent();
3490  MachineBasicBlock *MBB = MI->getParent();
3491 
3492  SmallVector<GEPInfo, 4> AddrInfo;
3493  getAddrModeInfo(*MI, *MRI, AddrInfo);
3494 
3495  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
3496  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3497  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3498  return None;
3499 
3500  const GEPInfo &GEPInfo = AddrInfo[0];
3501  // SGPR offset is unsigned.
3502  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3503  return None;
3504 
3505  // If we make it this far we have a load with a 32-bit immediate offset.
3506  // It is OK to select this using an SGPR offset, because we have already
3507  // failed trying to select this load into one of the _IMM variants since
3508  // the _IMM patterns are considered before the _SGPR patterns.
3509  Register PtrReg = GEPInfo.SgprParts[0];
3510  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3511  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3512  .addImm(GEPInfo.Imm);
3513  return {{
3514  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3515  [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3516  }};
3517 }
3518 
3519 std::pair<Register, int>
3520 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3521  uint64_t FlatVariant) const {
3522  MachineInstr *MI = Root.getParent();
3523 
3524  auto Default = std::make_pair(Root.getReg(), 0);
3525 
3526  if (!STI.hasFlatInstOffsets())
3527  return Default;
3528 
3529  Register PtrBase;
3530  int64_t ConstOffset;
3531  std::tie(PtrBase, ConstOffset) =
3532  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3533  if (ConstOffset == 0)
3534  return Default;
3535 
3536  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3537  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3538  return Default;
3539 
3540  return std::make_pair(PtrBase, ConstOffset);
3541 }
3542 
3543 InstructionSelector::ComplexRendererFns
3544 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3545  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3546 
3547  return {{
3548  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3549  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3550  }};
3551 }
3552 
3553 InstructionSelector::ComplexRendererFns
3554 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3555  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3556 
3557  return {{
3558  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3559  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3560  }};
3561 }
3562 
3563 InstructionSelector::ComplexRendererFns
3564 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3565  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3566 
3567  return {{
3568  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3569  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3570  }};
3571 }
3572 
3573 /// Match a zero extend from a 32-bit value to 64-bits.
3574 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3575  Register ZExtSrc;
3576  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3577  return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3578 
3579  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3580  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3581  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3582  return false;
3583 
3584  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3585  return Def->getOperand(1).getReg();
3586  }
3587 
3588  return Register();
3589 }
3590 
3591 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3592 InstructionSelector::ComplexRendererFns
3593 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3594  Register Addr = Root.getReg();
3595  Register PtrBase;
3596  int64_t ConstOffset;
3597  int64_t ImmOffset = 0;
3598 
3599  // Match the immediate offset first, which canonically is moved as low as
3600  // possible.
3601  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3602 
3603  if (ConstOffset != 0) {
3604  if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3605  SIInstrFlags::FlatGlobal)) {
3606  Addr = PtrBase;
3607  ImmOffset = ConstOffset;
3608  } else {
3609  auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3610  if (!PtrBaseDef)
3611  return None;
3612 
3613  if (isSGPR(PtrBaseDef->Reg)) {
3614  if (ConstOffset > 0) {
3615  // Offset is too large.
3616  //
3617  // saddr + large_offset -> saddr +
3618  // (voffset = large_offset & ~MaxOffset) +
3619  // (large_offset & MaxOffset);
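// Worked example (added for clarity; the exact maximum depends on the
// subtarget): if the largest legal immediate were 0xFFF, a ConstOffset of
// 0x1234 would split into a voffset of 0x1000 and an immediate of 0x234.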
3620  int64_t SplitImmOffset, RemainderOffset;
3621  std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3622  ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3623 
3624  if (isUInt<32>(RemainderOffset)) {
3625  MachineInstr *MI = Root.getParent();
3626  MachineBasicBlock *MBB = MI->getParent();
3627  Register HighBits =
3628  MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3629 
3630  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3631  HighBits)
3632  .addImm(RemainderOffset);
3633 
3634  return {{
3635  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3636  [=](MachineInstrBuilder &MIB) {
3637  MIB.addReg(HighBits);
3638  }, // voffset
3639  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3640  }};
3641  }
3642  }
3643 
3644  // We are adding a 64-bit SGPR and a constant. If the constant bus limit
3645  // is 1 we would need to perform 1 or 2 extra moves for each half of
3646  // the constant, so it is better to do a scalar add and then issue a
3647  // single VALU instruction to materialize zero. Otherwise it takes fewer
3648  // instructions to perform VALU adds with immediates or inline literals.
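// Example (added; hypothetical value): for ConstOffset = 0x1234'0000'0000 the
// low half (0) is an inline constant but the high half (0x1234) is not, so
// NumLiterals is 1 and the choice below depends on the constant bus limit.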
3649  unsigned NumLiterals =
3650  !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
3651  !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
3652  if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
3653  return None;
3654  }
3655  }
3656  }
3657 
3658  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3659  if (!AddrDef)
3660  return None;
3661 
3662  // Match the variable offset.
3663  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3664  // Look through the SGPR->VGPR copy.
3665  Register SAddr =
3666  getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3667 
3668  if (SAddr && isSGPR(SAddr)) {
3669  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3670 
3671  // It's possible voffset is an SGPR here, but the copy to VGPR will be
3672  // inserted later.
3673  if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3674  return {{[=](MachineInstrBuilder &MIB) { // saddr
3675  MIB.addReg(SAddr);
3676  },
3677  [=](MachineInstrBuilder &MIB) { // voffset
3678  MIB.addReg(VOffset);
3679  },
3680  [=](MachineInstrBuilder &MIB) { // offset
3681  MIB.addImm(ImmOffset);
3682  }}};
3683  }
3684  }
3685  }
3686 
3687  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3688  // drop this.
3689  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3690  AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
3691  return None;
3692 
3693  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3694  // moves required to copy a 64-bit SGPR to VGPR.
3695  MachineInstr *MI = Root.getParent();
3696  MachineBasicBlock *MBB = MI->getParent();
3697  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3698 
3699  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3700  .addImm(0);
3701 
3702  return {{
3703  [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
3704  [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3705  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3706  }};
3707 }
3708 
3709 InstructionSelector::ComplexRendererFns
3710 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3711  Register Addr = Root.getReg();
3712  Register PtrBase;
3713  int64_t ConstOffset;
3714  int64_t ImmOffset = 0;
3715 
3716  // Match the immediate offset first, which canonically is moved as low as
3717  // possible.
3718  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3719 
3720  if (ConstOffset != 0 &&
3721  TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
3722  SIInstrFlags::FlatScratch)) {
3723  Addr = PtrBase;
3724  ImmOffset = ConstOffset;
3725  }
3726 
3727  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3728  if (!AddrDef)
3729  return None;
3730 
3731  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3732  int FI = AddrDef->MI->getOperand(1).getIndex();
3733  return {{
3734  [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3735  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3736  }};
3737  }
3738 
3739  Register SAddr = AddrDef->Reg;
3740 
3741  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3742  Register LHS = AddrDef->MI->getOperand(1).getReg();
3743  Register RHS = AddrDef->MI->getOperand(2).getReg();
3744  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3745  auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3746 
3747  if (LHSDef && RHSDef &&
3748  LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3749  isSGPR(RHSDef->Reg)) {
3750  int FI = LHSDef->MI->getOperand(1).getIndex();
3751  MachineInstr &I = *Root.getParent();
3752  MachineBasicBlock *BB = I.getParent();
3753  const DebugLoc &DL = I.getDebugLoc();
3754  SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3755 
3756  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
3757  .addFrameIndex(FI)
3758  .addReg(RHSDef->Reg);
3759  }
3760  }
3761 
3762  if (!isSGPR(SAddr))
3763  return None;
3764 
3765  return {{
3766  [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3767  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3768  }};
3769 }
3770 
3771 InstructionSelector::ComplexRendererFns
3772 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3773  MachineInstr *MI = Root.getParent();
3774  MachineBasicBlock *MBB = MI->getParent();
3775  const MachineFunction *MF = MBB->getParent();
3776  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3777 
3778  int64_t Offset = 0;
3779  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3780  !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
3781  Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3782 
3783  // TODO: Should this be inside the render function? The iterator seems to
3784  // move.
3785  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3786  HighBits)
3787  .addImm(Offset & ~4095);
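// Example (added for clarity): a constant address of 0x12345 is split here
// into HighBits = 0x12000 (via V_MOV_B32) and an immediate field of 0x345.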
3788 
3789  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3790  MIB.addReg(Info->getScratchRSrcReg());
3791  },
3792  [=](MachineInstrBuilder &MIB) { // vaddr
3793  MIB.addReg(HighBits);
3794  },
3795  [=](MachineInstrBuilder &MIB) { // soffset
3796  // Use constant zero for soffset and rely on eliminateFrameIndex
3797  // to choose the appropriate frame register if need be.
3798  MIB.addImm(0);
3799  },
3800  [=](MachineInstrBuilder &MIB) { // offset
3801  MIB.addImm(Offset & 4095);
3802  }}};
3803  }
3804 
3805  assert(Offset == 0 || Offset == -1);
3806 
3807  // Try to fold a frame index directly into the MUBUF vaddr field, and any
3808  // offsets.
3809  Optional<int> FI;
3810  Register VAddr = Root.getReg();
3811  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3812  Register PtrBase;
3813  int64_t ConstOffset;
3814  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
3815  if (ConstOffset != 0) {
3816  if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
3817  (!STI.privateMemoryResourceIsRangeChecked() ||
3818  KnownBits->signBitIsZero(PtrBase))) {
3819  const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
3820  if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3821  FI = PtrBaseDef->getOperand(1).getIndex();
3822  else
3823  VAddr = PtrBase;
3824  Offset = ConstOffset;
3825  }
3826  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3827  FI = RootDef->getOperand(1).getIndex();
3828  }
3829  }
3830 
3831  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3832  MIB.addReg(Info->getScratchRSrcReg());
3833  },
3834  [=](MachineInstrBuilder &MIB) { // vaddr
3835  if (FI.hasValue())
3836  MIB.addFrameIndex(FI.getValue());
3837  else
3838  MIB.addReg(VAddr);
3839  },
3840  [=](MachineInstrBuilder &MIB) { // soffset
3841  // Use constant zero for soffset and rely on eliminateFrameIndex
3842  // to choose the appropriate frame register if need be.
3843  MIB.addImm(0);
3844  },
3845  [=](MachineInstrBuilder &MIB) { // offset
3846  MIB.addImm(Offset);
3847  }}};
3848 }
3849 
3850 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3851  int64_t Offset) const {
3852  if (!isUInt<16>(Offset))
3853  return false;
3854 
3855  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3856  return true;
3857 
3858  // On Southern Islands, instructions with a negative base value and an offset
3859  // don't seem to work.
3860  return KnownBits->signBitIsZero(Base);
3861 }
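// Note (added): DS instructions encode a 16-bit unsigned byte offset, so e.g.
// 65535 is accepted above while 65536 is not; when the subtarget checks above
// do not apply, the base must additionally be known non-negative.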
3862 
3863 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3864  int64_t Offset1,
3865  unsigned Size) const {
3866  if (Offset0 % Size != 0 || Offset1 % Size != 0)
3867  return false;
3868  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3869  return false;
3870 
3871  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3872  return true;
3873 
3874  // On Southern Islands, instructions with a negative base value and an offset
3875  // don't seem to work.
3876  return KnownBits->signBitIsZero(Base);
3877 }
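// Example (added for clarity): for a 4-byte element size, byte offsets {8, 12}
// encode as offset0 = 2 and offset1 = 3, while {8, 1024} is rejected because
// 1024 / 4 = 256 does not fit in the 8-bit offset fields.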
3878 
3879 InstructionSelector::ComplexRendererFns
3880 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3881  MachineOperand &Root) const {
3882  MachineInstr *MI = Root.getParent();
3883  MachineBasicBlock *MBB = MI->getParent();
3884 
3885  int64_t Offset = 0;
3886  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3887  !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3888  return {};
3889 
3890  const MachineFunction *MF = MBB->getParent();
3891  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3892 
3893  return {{
3894  [=](MachineInstrBuilder &MIB) { // rsrc
3895  MIB.addReg(Info->getScratchRSrcReg());
3896  },
3897  [=](MachineInstrBuilder &MIB) { // soffset
3898  MIB.addImm(0);
3899  },
3900  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3901  }};
3902 }
3903 
3904 std::pair<Register, unsigned>
3905 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3906  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3907  if (!RootDef)
3908  return std::make_pair(Root.getReg(), 0);
3909 
3910  int64_t ConstAddr = 0;
3911 
3912  Register PtrBase;
3913  int64_t Offset;
3914  std::tie(PtrBase, Offset) =
3915  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3916 
3917  if (Offset) {
3918  if (isDSOffsetLegal(PtrBase, Offset)) {
3919  // (add n0, c0)
3920  return std::make_pair(PtrBase, Offset);
3921  }
3922  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3923  // TODO
3924 
3925 
3926  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3927  // TODO
3928 
3929  }
3930 
3931  return std::make_pair(Root.getReg(), 0);
3932 }
3933 
3934 InstructionSelector::ComplexRendererFns
3935 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3936  Register Reg;
3937  unsigned Offset;
3938  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3939  return {{
3940  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3941  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3942  }};
3943 }
3944 
3945 InstructionSelector::ComplexRendererFns
3946 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3947  return selectDSReadWrite2(Root, 4);
3948 }
3949 
3950 InstructionSelector::ComplexRendererFns
3951 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3952  return selectDSReadWrite2(Root, 8);
3953 }
3954 
3955 InstructionSelector::ComplexRendererFns
3956 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3957  unsigned Size) const {
3958  Register Reg;
3959  unsigned Offset;
3960  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3961  return {{
3962  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3963  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3964  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3965  }};
3966 }
3967 
3968 std::pair<Register, unsigned>
3969 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3970  unsigned Size) const {
3971  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3972  if (!RootDef)
3973  return std::make_pair(Root.getReg(), 0);
3974 
3975  int64_t ConstAddr = 0;
3976 
3977  Register PtrBase;
3978  int64_t Offset;
3979  std::tie(PtrBase, Offset) =
3980  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3981 
3982  if (Offset) {
3983  int64_t OffsetValue0 = Offset;
3984  int64_t OffsetValue1 = Offset + Size;
3985  if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3986  // (add n0, c0)
3987  return std::make_pair(PtrBase, OffsetValue0 / Size);
3988  }
3989  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3990  // TODO
3991 
3992  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3993  // TODO
3994 
3995  }
3996 
3997  return std::make_pair(Root.getReg(), 0);
3998 }
3999 
4000 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4001 /// the base value with the constant offset. There may be intervening copies
4002 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4003 /// not match the pattern.
4004 std::pair<Register, int64_t>
4005 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4006  Register Root, const MachineRegisterInfo &MRI) const {
4007  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4008  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4009  return {Root, 0};
4010 
4011  MachineOperand &RHS = RootI->getOperand(2);
4012  Optional<ValueAndVReg> MaybeOffset
4013  = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
4014  if (!MaybeOffset)
4015  return {Root, 0};
4016  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4017 }
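// Illustrative MIR (added; hypothetical virtual registers):
//   %c:_(s64) = G_CONSTANT i64 16
//   %addr:_(p1) = G_PTR_ADD %base:_(p1), %c
// getPtrBaseWithConstantOffset(%addr) would return {%base, 16}; anything that
// is not a G_PTR_ADD with a constant RHS falls back to {%addr, 0}.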
4018 
4019 static void addZeroImm(MachineInstrBuilder &MIB) {
4020  MIB.addImm(0);
4021 }
4022 
4023 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4024 /// BasePtr is not valid, a null base pointer will be used.
4025 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4026  uint32_t FormatLo, uint32_t FormatHi,
4027  Register BasePtr) {
4028  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4029  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4030  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4031  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4032 
4033  B.buildInstr(AMDGPU::S_MOV_B32)
4034  .addDef(RSrc2)
4035  .addImm(FormatLo);
4036  B.buildInstr(AMDGPU::S_MOV_B32)
4037  .addDef(RSrc3)
4038  .addImm(FormatHi);
4039 
4040  // Build the half of the subregister with the constants before building the
4041  // full 128-bit register. If we are building multiple resource descriptors,
4042  // this will allow CSEing of the 2-component register.
4043  B.buildInstr(AMDGPU::REG_SEQUENCE)
4044  .addDef(RSrcHi)
4045  .addReg(RSrc2)
4046  .addImm(AMDGPU::sub0)
4047  .addReg(RSrc3)
4048  .addImm(AMDGPU::sub1);
4049 
4050  Register RSrcLo = BasePtr;
4051  if (!BasePtr) {
4052  RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4053  B.buildInstr(AMDGPU::S_MOV_B64)
4054  .addDef(RSrcLo)
4055  .addImm(0);
4056  }
4057 
4058  B.buildInstr(AMDGPU::REG_SEQUENCE)
4059  .addDef(RSrc)
4060  .addReg(RSrcLo)
4061  .addImm(AMDGPU::sub0_sub1)
4062  .addReg(RSrcHi)
4063  .addImm(AMDGPU::sub2_sub3);
4064 
4065  return RSrc;
4066 }
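// Note (added): the resulting 128-bit descriptor is laid out as
// sub0_sub1 = BasePtr (or a zero pointer), sub2 = FormatLo, sub3 = FormatHi,
// matching the REG_SEQUENCEs built above.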
4067 
4068 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4069  const SIInstrInfo &TII, Register BasePtr) {
4070  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4071 
4072  // FIXME: Why are half the "default" bits ignored based on the addressing
4073  // mode?
4074  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4075 }
4076 
4077 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4078  const SIInstrInfo &TII, Register BasePtr) {
4079  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4080 
4081  // FIXME: Why are half the "default" bits ignored based on the addressing
4082  // mode?
4083  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4084 }
4085 
4086 AMDGPUInstructionSelector::MUBUFAddressData
4087 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4088  MUBUFAddressData Data;
4089  Data.N0 = Src;
4090 
4091  Register PtrBase;
4092  int64_t Offset;
4093 
4094  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4095  if (isUInt<32>(Offset)) {
4096  Data.N0 = PtrBase;
4097  Data.Offset = Offset;
4098  }
4099 
4100  if (MachineInstr *InputAdd
4101  = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4102  Data.N2 = InputAdd->getOperand(1).getReg();
4103  Data.N3 = InputAdd->getOperand(2).getReg();
4104 
4105  // FIXME: Need to fix extra SGPR->VGPR copies inserted
4106  // FIXME: Don't know that this was defined by operand 0
4107  //
4108  // TODO: Remove this when we have copy folding optimizations after
4109  // RegBankSelect.
4110  Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4111  Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4112  }
4113 
4114  return Data;
4115 }
4116 
4117 /// Return if the addr64 mubuf mode should be used for the given address.
4118 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4119  // (ptr_add N2, N3) -> addr64, or
4120  // (ptr_add (ptr_add N2, N3), C1) -> addr64
4121  if (Addr.N2)
4122  return true;
4123 
4124  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4125  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4126 }
4127 
4128 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4129 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4130 /// component.
4131 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4132  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4133  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4134  return;
4135 
4136  // Illegal offset, store it in soffset.
4137  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4138  B.buildInstr(AMDGPU::S_MOV_B32)
4139  .addDef(SOffset)
4140  .addImm(ImmOffset);
4141  ImmOffset = 0;
4142 }
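// Example (added for clarity): an ImmOffset of 8192 does not fit the MUBUF
// immediate field, so it is moved whole into an SGPR soffset via S_MOV_B32 and
// the immediate component becomes 0.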
4143 
4144 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4145  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4146  Register &SOffset, int64_t &Offset) const {
4147  // FIXME: Predicates should stop this from reaching here.
4148  // addr64 bit was removed for volcanic islands.
4149  if (!STI.hasAddr64() || STI.useFlatForGlobal())
4150  return false;
4151 
4152  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4153  if (!shouldUseAddr64(AddrData))
4154  return false;
4155 
4156  Register N0 = AddrData.N0;
4157  Register N2 = AddrData.N2;
4158  Register N3 = AddrData.N3;
4159  Offset = AddrData.Offset;
4160 
4161  // Base pointer for the SRD.
4162  Register SRDPtr;
4163 
4164  if (N2) {
4165  if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4166  assert(N3);
4167  if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4168  // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4169  // addr64, and construct the default resource from a 0 address.
4170  VAddr = N0;
4171  } else {
4172  SRDPtr = N3;
4173  VAddr = N2;
4174  }
4175  } else {
4176  // N2 is not divergent.
4177  SRDPtr = N2;
4178  VAddr = N3;
4179  }
4180  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4181  // Use the default null pointer in the resource
4182  VAddr = N0;
4183  } else {
4184  // N0 -> offset, or
4185  // (N0 + C1) -> offset
4186  SRDPtr = N0;
4187  }
4188 
4189  MachineIRBuilder B(*Root.getParent());
4190  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4191  splitIllegalMUBUFOffset(B, SOffset, Offset);
4192  return true;
4193 }
4194 
4195 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4196  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4197  int64_t &Offset) const {
4198 
4199  // FIXME: Pattern should not reach here.
4200  if (STI.useFlatForGlobal())
4201  return false;
4202 
4203  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4204  if (shouldUseAddr64(AddrData))
4205  return false;
4206 
4207  // N0 -> offset, or
4208  // (N0 + C1) -> offset
4209  Register SRDPtr = AddrData.N0;
4210  Offset = AddrData.Offset;
4211 
4212  // TODO: Look through extensions for 32-bit soffset.
4213  MachineIRBuilder B(*Root.getParent());
4214 
4215  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4216  splitIllegalMUBUFOffset(B, SOffset, Offset);
4217  return true;
4218 }
4219 
4220 InstructionSelector::ComplexRendererFns
4221 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4222  Register VAddr;
4223  Register RSrcReg;
4224  Register SOffset;
4225  int64_t Offset = 0;
4226 
4227  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4228  return {};
4229 
4230  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4231  // pattern.
4232  return {{
4233  [=](MachineInstrBuilder &MIB) { // rsrc
4234  MIB.addReg(RSrcReg);
4235  },
4236  [=](MachineInstrBuilder &MIB) { // vaddr
4237  MIB.addReg(VAddr);
4238  },
4239  [=](MachineInstrBuilder &MIB) { // soffset
4240  if (SOffset)
4241  MIB.addReg(SOffset);
4242  else
4243  MIB.addImm(0);
4244  },
4245  [=](MachineInstrBuilder &MIB) { // offset
4246  MIB.addImm(Offset);
4247  },
4248  addZeroImm, // cpol
4249  addZeroImm, // tfe
4250  addZeroImm // swz
4251  }};
4252 }
4253 
4254 InstructionSelector::ComplexRendererFns
4255 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4256  Register RSrcReg;
4257  Register SOffset;
4258  int64_t Offset = 0;
4259 
4260  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4261  return {};
4262 
4263  return {{
4264  [=](MachineInstrBuilder &MIB) { // rsrc
4265  MIB.addReg(RSrcReg);
4266  },
4267  [=](MachineInstrBuilder &MIB) { // soffset
4268  if (SOffset)
4269  MIB.addReg(SOffset);
4270  else
4271  MIB.addImm(0);
4272  },
4273  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4274  addZeroImm, // cpol
4275  addZeroImm, // tfe
4276  addZeroImm, // swz
4277  }};
4278 }
4279 
4280 InstructionSelector::ComplexRendererFns
4281 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4282  Register VAddr;
4283  Register RSrcReg;
4284  Register SOffset;
4285  int64_t Offset = 0;
4286 
4287  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4288  return {};
4289 
4290  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4291  // pattern.
4292  return {{
4293  [=](MachineInstrBuilder &MIB) { // rsrc
4294  MIB.addReg(RSrcReg);
4295  },
4296  [=](MachineInstrBuilder &MIB) { // vaddr
4297  MIB.addReg(VAddr);
4298  },
4299  [=](MachineInstrBuilder &MIB) { // soffset
4300  if (SOffset)
4301  MIB.addReg(SOffset);
4302  else
4303  MIB.addImm(0);
4304  },
4305  [=](MachineInstrBuilder &MIB) { // offset
4306  MIB.addImm(Offset);
4307  },
4308  [=](MachineInstrBuilder &MIB) {
4309  MIB.addImm(AMDGPU::CPol::GLC); // cpol
4310  }
4311  }};
4312 }
4313 
4314 InstructionSelector::ComplexRendererFns
4315 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4316  Register RSrcReg;
4317  Register SOffset;
4318  int64_t Offset = 0;
4319 
4320  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4321  return {};
4322 
4323  return {{
4324  [=](MachineInstrBuilder &MIB) { // rsrc
4325  MIB.addReg(RSrcReg);
4326  },
4327  [=](MachineInstrBuilder &MIB) { // soffset
4328  if (SOffset)
4329  MIB.addReg(SOffset);
4330  else
4331  MIB.addImm(0);
4332  },
4333  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4334  [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4335  }};
4336 }
4337 
4338 /// Get an immediate that must be 32-bits, and treated as zero extended.
4339 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4340  const MachineRegisterInfo &MRI) {
4341  // getConstantVRegVal sexts any values, so see if that matters.
4343  if (!OffsetVal || !isInt<32>(*OffsetVal))
4344  return None;
4345  return Lo_32(*OffsetVal);
4346 }
4347 
4348 InstructionSelector::ComplexRendererFns
4349 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4350  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4351  if (!OffsetVal)
4352  return {};
4353 
4354  Optional<int64_t> EncodedImm =
4355  AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4356  if (!EncodedImm)
4357  return {};
4358 
4359  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4360 }
4361 
4362 InstructionSelector::ComplexRendererFns
4363 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4364  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4365 
4366  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4367  if (!OffsetVal)
4368  return {};
4369 
4370  Optional<int64_t> EncodedImm
4371  = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4372  if (!EncodedImm)
4373  return {};
4374 
4375  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4376 }
4377 
4378 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4379  const MachineInstr &MI,
4380  int OpIdx) const {
4381  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4382  "Expected G_CONSTANT");
4383  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4384 }
4385 
4386 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4387  const MachineInstr &MI,
4388  int OpIdx) const {
4389  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4390  "Expected G_CONSTANT");
4391  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4392 }
4393 
4394 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4395  const MachineInstr &MI,
4396  int OpIdx) const {
4397  assert(OpIdx == -1);
4398 
4399  const MachineOperand &Op = MI.getOperand(1);
4400  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4401  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4402  else {
4403  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4404  MIB.addImm(Op.getCImm()->getSExtValue());
4405  }
4406 }
4407 
4408 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4409  const MachineInstr &MI,
4410  int OpIdx) const {
4411  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4412  "Expected G_CONSTANT");
4413  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4414 }
4415 
4416 /// This only really exists to satisfy DAG type checking machinery, so is a
4417 /// no-op here.
4418 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4419  const MachineInstr &MI,
4420  int OpIdx) const {
4421  MIB.addImm(MI.getOperand(OpIdx).getImm());
4422 }
4423 
4424 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
4425  const MachineInstr &MI,
4426  int OpIdx) const {
4427  assert(OpIdx >= 0 && "expected to match an immediate operand");
4428  MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
4429 }
4430 
4431 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4432  const MachineInstr &MI,
4433  int OpIdx) const {
4434  assert(OpIdx >= 0 && "expected to match an immediate operand");
4435  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4436 }
4437 
4438 void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
4439  const MachineInstr &MI,
4440  int OpIdx) const {
4441  assert(OpIdx >= 0 && "expected to match an immediate operand");
4442  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
4443 }
4444 
4445 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4446  const MachineInstr &MI,
4447  int OpIdx) const {
4448  MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4449 }
4450 
4451 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4452  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4453 }
4454 
4455 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4456  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4457 }
4458 
4459 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4460  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4461 }
4462 
4463 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4464  return TII.isInlineConstant(Imm);
4465 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_PREDICATES_INIT
MIPatternMatch.h
addZeroImm
static void addZeroImm(MachineInstrBuilder &MIB)
Definition: AMDGPUInstructionSelector.cpp:4019
llvm::TargetMachine::getOptLevel
CodeGenOpt::Level getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Definition: TargetMachine.cpp:198
llvm::TargetRegisterInfo::getConstrainedRegClassForOperand
virtual const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const
Definition: TargetRegisterInfo.h:1088
sizeToSubRegIndex
static int sizeToSubRegIndex(unsigned Size)
Definition: AMDGPUInstructionSelector.cpp:1852
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4630
llvm::AMDGPUSubtarget::hasInv2PiInlineImm
bool hasInv2PiInlineImm() const
Definition: AMDGPUSubtarget.h:169
llvm::getDefIgnoringCopies
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:404
llvm::AMDGPURegisterBankInfo
Definition: AMDGPURegisterBankInfo.h:42
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:102
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
llvm::AMDGPU::MIMGBaseOpcodeInfo::HasD16
bool HasD16
Definition: AMDGPUBaseInfo.h:293
Reg
unsigned Reg
Definition: MachineSink.cpp:1566
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:741
UseMI
MachineInstrBuilder & UseMI
Definition: AArch64ExpandPseudoInsts.cpp:102
llvm::AMDGPU::MIMGBaseOpcodeInfo::Store
bool Store
Definition: AMDGPUBaseInfo.h:282
llvm::MachineOperand::CreateReg
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
Definition: MachineOperand.h:791
normalizeVOP3PMask
static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, ArrayRef< int > Mask)
Definition: AMDGPUInstructionSelector.cpp:2848
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1336
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition: MachineRegisterInfo.cpp:158
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::MachineFrameInfo::setReturnAddressIsTaken
void setReturnAddressIsTaken(bool s)
Definition: MachineFrameInfo.h:373
llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition: DiagnosticInfo.h:1003
llvm::MIPatternMatch::m_Reg
operand_type_match m_Reg()
Definition: MIPatternMatch.h:127
SIMachineFunctionInfo.h
llvm::GISelKnownBits
Definition: GISelKnownBits.h:29
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:52
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:224
llvm::Function
Definition: Function.h:61
llvm::AMDGPU::getSMRDEncodedOffset
Optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer)
Definition: AMDGPUBaseInfo.cpp:1871
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::MIPatternMatch::m_GLShr
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Definition: MIPatternMatch.h:363
include
include(LLVM-Build) add_subdirectory(IR) add_subdirectory(FuzzMutate) add_subdirectory(FileCheck) add_subdirectory(InterfaceStub) add_subdirectory(IRReader) add_subdirectory(CodeGen) add_subdirectory(BinaryFormat) add_subdirectory(Bitcode) add_subdirectory(Bitstream) add_subdirectory(DWARFLinker) add_subdirectory(Extensions) add_subdirectory(Frontend) add_subdirectory(Transforms) add_subdirectory(Linker) add_subdirectory(Analysis) add_subdirectory(LTO) add_subdirectory(MC) add_subdirectory(MCA) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) add_subdirectory(Remarks) add_subdirectory(DebugInfo) add_subdirectory(DWP) add_subdirectory(ExecutionEngine) add_subdirectory(Target) add_subdirectory(AsmParser) add_subdirectory(LineEditor) add_subdirectory(ProfileData) add_subdirectory(Passes) add_subdirectory(TextAPI) add_subdirectory(ToolDrivers) add_subdirectory(XRay) if(LLVM_INCLUDE_TESTS) add_subdirectory(Testing) endif() add_subdirectory(WindowsManifest) set(LLVMCONFIGLIBRARYDEPENDENCIESINC "$
Definition: CMakeLists.txt:1
llvm::getOpcodeDef
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:418
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition: MachineInstrBuilder.h:52
llvm::AMDGPU::MIMGBaseOpcodeInfo::Gather4
bool Gather4
Definition: AMDGPUBaseInfo.h:286
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
GISelKnownBits.h
llvm::AMDGPU::ImageDimIntrinsicInfo
Definition: AMDGPUInstrInfo.h:50
llvm::RegisterBankInfo::getRegBank
RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
Definition: RegisterBankInfo.h:432
llvm::GCNSubtarget::needsAlignedVGPRs
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Definition: GCNSubtarget.h:977
llvm::AMDGPU::MIMGDimInfo
Definition: AMDGPUBaseInfo.h:300
llvm::getSrcRegIgnoringCopies
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:411
llvm::MachineRegisterInfo::getUniqueVRegDef
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
Definition: MachineRegisterInfo.cpp:411
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:742
llvm::RegState::Kill
@ Kill
The last use of a register.
Definition: MachineInstrBuilder.h:48
llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition: AMDGPUSubtarget.h:38
llvm::InstructionSelector::setupMF
virtual void setupMF(MachineFunction &mf, GISelKnownBits *KB, CodeGenCoverage &covinfo, ProfileSummaryInfo *psi, BlockFrequencyInfo *bfi)
Setup per-MF selector state.
Definition: InstructionSelector.h:452
llvm::AMDGPU::getSMRDEncodedLiteralOffset32
Optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
Definition: AMDGPUBaseInfo.cpp:1888
llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition: TargetRegisterInfo.h:231
llvm::SIRegisterInfo::getWaveMaskRegClass
const TargetRegisterClass * getWaveMaskRegClass() const
Definition: SIRegisterInfo.h:283
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:747
getAddrSpace
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:248
llvm::getFunctionLiveInPhysReg
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:625
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:128
llvm::AMDGPUMachineFunction::getLDSSize
unsigned getLDSSize() const
Definition: AMDGPUMachineFunction.h:70
isZeroOrUndef
static bool isZeroOrUndef(int X)
Definition: AMDGPUInstructionSelector.cpp:2834
llvm::AMDGPU::SDWA::UNUSED_PRESERVE
@ UNUSED_PRESERVE
Definition: SIDefines.h:652
llvm::MachineOperand::isCImm
bool isCImm() const
isCImm - Test if this is a MO_CImmediate operand.
Definition: MachineOperand.h:325
llvm::ConstantFP::getValueAPF
const APFloat & getValueAPF() const
Definition: Constants.h:297
llvm::LLT::isValid
bool isValid() const
Definition: LowLevelTypeImpl.h:117
llvm::Optional
Definition: APInt.h:33
llvm::AMDGPUSubtarget::hasSDWA
bool hasSDWA() const
Definition: AMDGPUSubtarget.h:149
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:750
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
MachineIRBuilder.h
llvm::SIInstrInfo::getIndirectRegWriteMovRelPseudo
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
Definition: SIInstrInfo.cpp:1316
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:702
llvm::RegState::Implicit
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Definition: MachineInstrBuilder.h:46
llvm::Lo_32
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:353
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1567
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:144
llvm::GCNSubtarget::hasScalarCompareEq64
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:809
llvm::ValueAndVReg::Value
APInt Value
Definition: Utils.h:176
llvm::AMDGPU::MIMGMIPMappingInfo::NONMIP
MIMGBaseOpcode NONMIP
Definition: AMDGPUBaseInfo.h:326
llvm::Data
@ Data
Definition: SIMachineScheduler.h:56
llvm::PointerUnion::get
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:157
llvm::ARMII::IndexMode
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:203
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUInstructionSelector.cpp:28
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::constrainSelectedInstRegOperands
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:134
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::SIInstrFlags::FLAT
@ FLAT
Definition: SIDefines.h:51
llvm::getUndefRegState
unsigned getUndefRegState(bool B)
Definition: MachineInstrBuilder.h:514
llvm::GCNSubtarget::unsafeDSOffsetFoldingEnabled
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:412
llvm::GCNSubtarget::hasGWSSemaReleaseAll
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:630
llvm::Optional::hasValue
constexpr bool hasValue() const
Definition: Optional.h:288
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:389
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::SIRegisterInfo::getRegClassForTypeOnBank
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank, const MachineRegisterInfo &MRI) const
Definition: SIRegisterInfo.h:268
llvm::LLT::fixed_vector
static LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelTypeImpl.h:75
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
llvm::MachineOperand::isKill
bool isKill() const
Definition: MachineOperand.h:390
llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition: MachineInstrBuilder.h:116
llvm::AMDGPU::CPol::CPol
CPol
Definition: SIDefines.h:281
getLogicalBitOpcode
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
Definition: AMDGPUInstructionSelector.cpp:266
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:632
llvm::MachineOperand::isImplicit
bool isImplicit() const
Definition: MachineOperand.h:380
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:27
llvm::SISrcMods::NEG_HI
@ NEG_HI
Definition: SIDefines.h:202
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:146
llvm::RegisterBank
This class implements the register bank concept.
Definition: RegisterBank.h:28
llvm::MIPatternMatch::m_GZExt
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
Definition: MIPatternMatch.h:414
llvm::SIInstrInfo::isInlineConstant
bool isInlineConstant(const APInt &Imm) const
Definition: SIInstrInfo.cpp:3349
llvm::MachineOperand::CreateImm
static MachineOperand CreateImm(int64_t Val)
Definition: MachineOperand.h:773
llvm::GCNSubtarget::useVGPRIndexMode
bool useVGPRIndexMode() const
Definition: AMDGPUSubtarget.cpp:762
llvm::GCNSubtarget::hasAddr64
bool hasAddr64() const
Definition: GCNSubtarget.h:323
llvm::AMDGPUInstructionSelector::select
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
Definition: AMDGPUInstructionSelector.cpp:3125
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::MachineMemOperand::getAddrSpace
unsigned getAddrSpace() const
Definition: MachineMemOperand.h:229
llvm::MachineOperand::getImm
int64_t getImm() const
Definition: MachineOperand.h:537
llvm::GCNSubtarget::useFlatForGlobal
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:471
llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Definition: MachineFunction.h:720
llvm::AMDGPU::MIMGBaseOpcodeInfo
Definition: AMDGPUBaseInfo.h:280
Intr
unsigned Intr
Definition: AMDGPUBaseInfo.cpp:1987
llvm::MachineMemOperand::getValue
const Value * getValue() const
Return the base address of the memory access.
Definition: MachineMemOperand.h:211
isVCmpResult
static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI)
Definition: AMDGPUInstructionSelector.cpp:2468
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:499
llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:746
llvm::AMDGPUISD::DS_ORDERED_COUNT
@ DS_ORDERED_COUNT
Definition: AMDGPUISelLowering.h:491
llvm::LLT::getSizeInBits
TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelTypeImpl.h:153
llvm::AMDGPU::MIMGDimInfo::Encoding
uint8_t Encoding
Definition: AMDGPUBaseInfo.h:306
llvm::RegState::Dead
@ Dead
Unused definition.
Definition: MachineInstrBuilder.h:50
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::SIInstrFlags::WQM
@ WQM
Definition: SIDefines.h:63
llvm::SIRegisterInfo::getReturnAddressReg
MCRegister getReturnAddressReg(const MachineFunction &MF) const
Definition: SIRegisterInfo.cpp:2398
llvm::InstructionSelector::CoverageInfo
CodeGenCoverage * CoverageInfo
Definition: InstructionSelector.h:439
gwsIntrinToOpcode
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Definition: AMDGPUInstructionSelector.cpp:1321
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::MachineOperand::ChangeToImmediate
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
Definition: MachineOperand.cpp:156
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:195
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:49
llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
Definition: MIPatternMatch.h:109
llvm::M0
unsigned M0(unsigned Val)
Definition: VE.h:371
llvm::AMDGPU::MIMGLZMappingInfo
Definition: AMDGPUBaseInfo.h:319
llvm::Instruction
Definition: Instruction.h:45
llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1142
llvm::AMDGPUTargetMachine::getNullPointerValue
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Definition: AMDGPUTargetMachine.cpp:786
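This matters because on AMDGPU the null pointer is not the all-zero bit pattern in every address space; the scratch-like spaces (private, local, region/GDS) use an all-ones value. A hedged sketch of the idea, assuming the AMDGPUAS enumerators from the backend headers; it is illustrative, not this function's exact body:

// Sketch: scratch/LDS/GDS null pointers are -1; flat, global and constant
// address spaces use 0.
static int64_t nullPointerValueSketch(unsigned AddrSpace) {
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return -1;
  return 0;
}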
llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition: TargetRegisterInfo.h:136
llvm::report_fatal_error
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:383
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition: APInt.h:655
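getHighBitsSet, together with getZExtValue above and ashr near the end of this index, covers the usual constant bit-pattern manipulations. A small self-contained example, not taken from this file:

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  // A 32-bit value with its top 16 bits set: 0xFFFF0000.
  llvm::APInt HighMask = llvm::APInt::getHighBitsSet(32, 16);
  assert(HighMask.getZExtValue() == 0xFFFF0000u);

  // Arithmetic right shift replicates the sign bit, so shifting by 16
  // yields 0xFFFFFFFF rather than 0x0000FFFF.
  assert(HighMask.ashr(16).getZExtValue() == 0xFFFFFFFFu);
  return 0;
}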
llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:47
llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition: SIRegisterInfo.cpp:427
llvm::AMDGPU::getMIMGLZMappingInfo
const LLVM_READONLY MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
matchZeroExtendFromS32
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
Definition: AMDGPUInstructionSelector.cpp:3574
llvm::MIPatternMatch::m_SpecificICst
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
Definition: MIPatternMatch.h:103
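The MIPatternMatch helpers indexed here (m_GZExt, m_ZeroInt, m_SpecificICst) let a routine such as matchZeroExtendFromS32 above recognize instruction shapes declaratively instead of walking operands by hand. Below is a rough sketch of a zero-extend-from-s32 check written with these matchers; it is illustrative, not this file's exact implementation:

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;
using namespace llvm::MIPatternMatch;

// Sketch: if Reg is defined by G_ZEXT of an s32 value, return that source
// register; otherwise return an invalid Register().
static Register matchZExtFromS32Sketch(MachineRegisterInfo &MRI, Register Reg) {
  Register Src;
  if (mi_match(Reg, MRI, m_GZExt(m_Reg(Src))) &&
      MRI.getType(Src) == LLT::scalar(32))
    return Src;
  return Register();
}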
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:29
llvm::APFloat::bitcastToAPInt
APInt bitcastToAPInt() const
Definition: APFloat.h:1132
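bitcastToAPInt is the usual way to get at a floating-point constant's raw encoding, for instance to check whether it can be encoded as an inline immediate. A small self-contained example:

#include "llvm/ADT/APFloat.h"
#include <cassert>

int main() {
  // 1.0f is encoded as 0x3F800000 in IEEE-754 single precision.
  llvm::APFloat One(1.0f);
  assert(One.bitcastToAPInt().getZExtValue() == 0x3F800000u);
  return 0;
}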
llvm::AMDGPU::getMIMGOpcode
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
Definition: AMDGPUBaseInfo.cpp:138
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
GET_GLOBALISEL_TEMPORARIES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
llvm::MachineRegisterInfo::getVRegDef
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is found.
Definition: MachineRegisterInfo.cpp:400
llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition: MachineOperand.h:238
llvm::None
const NoneType None
Definition: None.h:23
llvm::RegisterBankInfo::getSizeInBits
unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition: RegisterBankInfo.cpp:493
llvm::MachineRegisterInfo::use_empty
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
Definition: MachineRegisterInfo.h:506
llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition: RegisterBankInfo.cpp:132
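Unlike constrainSelectedInstRegOperands earlier in this index, constrainGenericRegister pins a single virtual register to a concrete register class; selectors reach for it when they rewrite an instruction in place rather than building a new one. A tiny sketch, with a made-up helper name:

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBankInfo.h"

using namespace llvm;

// Sketch: constrain a generic vreg to RC; returns false (so the caller can
// bail out of selection) if the register cannot be put in that class.
static bool constrainToClass(Register Reg, const TargetRegisterClass &RC,
                             MachineRegisterInfo &MRI) {
  return RegisterBankInfo::constrainGenericRegister(Reg, RC, MRI) != nullptr;
}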
llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition: AMDGPUBaseInfo.cpp:1455
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:95
llvm::Hi_32
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:348
llvm::MachineInstrBuilder::cloneMemRefs
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Definition: MachineInstrBuilder.h:213
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
llvm::SIInstrInfo::MO_ABS32_LO
@ MO_ABS32_LO
Definition: SIInstrInfo.h:166
llvm::MachineRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
Definition: MachineRegisterInfo.h:634
llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:963
llvm::AMDGPUSubtarget::getWavefrontSize
unsigned getWavefrontSize() const
Definition: AMDGPUSubtarget.h:185