1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/IR/DiagnosticInfo.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 
29 #define DEBUG_TYPE "amdgpu-isel"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
35  "amdgpu-global-isel-risky-select",
36  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
37  cl::init(false),
39 
40 #define GET_GLOBALISEL_IMPL
41 #define AMDGPUSubtarget GCNSubtarget
42 #include "AMDGPUGenGlobalISel.inc"
43 #undef GET_GLOBALISEL_IMPL
44 #undef AMDGPUSubtarget
45 
46 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
47  const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
48  const AMDGPUTargetMachine &TM)
49  : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
50  STI(STI),
51  EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
52 #define GET_GLOBALISEL_PREDICATES_INIT
53 #include "AMDGPUGenGlobalISel.inc"
54 #undef GET_GLOBALISEL_PREDICATES_INIT
55 #define GET_GLOBALISEL_TEMPORARIES_INIT
56 #include "AMDGPUGenGlobalISel.inc"
57 #undef GET_GLOBALISEL_TEMPORARIES_INIT
58 {
59 }
60 
61 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
62 
63 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
64  CodeGenCoverage &CoverageInfo,
65  ProfileSummaryInfo *PSI,
66  BlockFrequencyInfo *BFI) {
67  MRI = &MF.getRegInfo();
68  Subtarget = &MF.getSubtarget<GCNSubtarget>();
69  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
70 }
71 
72 bool AMDGPUInstructionSelector::isVCC(Register Reg,
73  const MachineRegisterInfo &MRI) const {
74  // The verifier is oblivious to s1 being a valid value for wavesize registers.
75  if (Reg.isPhysical())
76  return false;
77 
78  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
79  const TargetRegisterClass *RC =
80  RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
81  if (RC) {
82  const LLT Ty = MRI.getType(Reg);
83  return RC->hasSuperClassEq(TRI.getBoolRC()) &&
84  Ty.isValid() && Ty.getSizeInBits() == 1;
85  }
86 
87  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
88  return RB->getID() == AMDGPU::VCCRegBankID;
89 }
90 
91 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
92  unsigned NewOpc) const {
93  MI.setDesc(TII.get(NewOpc));
94  MI.RemoveOperand(1); // Remove intrinsic ID.
95  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
96 
97  MachineOperand &Dst = MI.getOperand(0);
98  MachineOperand &Src = MI.getOperand(1);
99 
100  // TODO: This should be legalized to s32 if needed
101  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
102  return false;
103 
104  const TargetRegisterClass *DstRC
105  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
106  const TargetRegisterClass *SrcRC
107  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
108  if (!DstRC || DstRC != SrcRC)
109  return false;
110 
111  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
112  RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
113 }
114 
115 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
116  const DebugLoc &DL = I.getDebugLoc();
117  MachineBasicBlock *BB = I.getParent();
118  I.setDesc(TII.get(TargetOpcode::COPY));
119 
120  const MachineOperand &Src = I.getOperand(1);
121  MachineOperand &Dst = I.getOperand(0);
122  Register DstReg = Dst.getReg();
123  Register SrcReg = Src.getReg();
124 
125  if (isVCC(DstReg, *MRI)) {
126  if (SrcReg == AMDGPU::SCC) {
127  const TargetRegisterClass *RC
128  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
129  if (!RC)
130  return true;
131  return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
132  }
133 
134  if (!isVCC(SrcReg, *MRI)) {
135  // TODO: Should probably leave the copy and let copyPhysReg expand it.
136  if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
137  return false;
138 
139  const TargetRegisterClass *SrcRC
140  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
141 
142  Optional<ValueAndVReg> ConstVal =
143  getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
144  if (ConstVal) {
145  unsigned MovOpc =
146  STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
147  BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
148  .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
149  } else {
150  Register MaskedReg = MRI->createVirtualRegister(SrcRC);
151 
152  // We can't trust the high bits at this point, so clear them.
153 
154  // TODO: Skip masking high bits if def is known boolean.
155 
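 // For example, copying a non-VCC s1 value %src into a VCC-bank register
 // %dst is expanded by the two BuildMI calls below into (roughly):
 //   %masked = V_AND_B32_e32 1, %src       ; clear the untrusted high bits
 //   %dst    = V_CMP_NE_U32_e64 0, %masked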
156  unsigned AndOpc =
157  TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
158  BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
159  .addImm(1)
160  .addReg(SrcReg);
161  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
162  .addImm(0)
163  .addReg(MaskedReg);
164  }
165 
166  if (!MRI->getRegClassOrNull(SrcReg))
167  MRI->setRegClass(SrcReg, SrcRC);
168  I.eraseFromParent();
169  return true;
170  }
171 
172  const TargetRegisterClass *RC =
173  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
174  if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
175  return false;
176 
177  return true;
178  }
179 
180  for (const MachineOperand &MO : I.operands()) {
181  if (MO.getReg().isPhysical())
182  continue;
183 
184  const TargetRegisterClass *RC =
185  TRI.getConstrainedRegClassForOperand(MO, *MRI);
186  if (!RC)
187  continue;
188  RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
189  }
190  return true;
191 }
192 
193 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
194  const Register DefReg = I.getOperand(0).getReg();
195  const LLT DefTy = MRI->getType(DefReg);
196  if (DefTy == LLT::scalar(1)) {
197  if (!AllowRiskySelect) {
198  LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
199  return false;
200  }
201 
202  LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
203  }
204 
205  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
206 
207  const RegClassOrRegBank &RegClassOrBank =
208  MRI->getRegClassOrRegBank(DefReg);
209 
210  const TargetRegisterClass *DefRC
211  = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
212  if (!DefRC) {
213  if (!DefTy.isValid()) {
214  LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
215  return false;
216  }
217 
218  const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
219  DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
220  if (!DefRC) {
221  LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
222  return false;
223  }
224  }
225 
226  // TODO: Verify that all registers have the same bank
227  I.setDesc(TII.get(TargetOpcode::PHI));
228  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
229 }
230 
231 MachineOperand
232 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
233  const TargetRegisterClass &SubRC,
234  unsigned SubIdx) const {
235 
236  MachineInstr *MI = MO.getParent();
237  MachineBasicBlock *BB = MI->getParent();
238  Register DstReg = MRI->createVirtualRegister(&SubRC);
239 
240  if (MO.isReg()) {
241  unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
242  Register Reg = MO.getReg();
243  BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
244  .addReg(Reg, 0, ComposedSubIdx);
245 
246  return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
247  MO.isKill(), MO.isDead(), MO.isUndef(),
248  MO.isEarlyClobber(), 0, MO.isDebug(),
249  MO.isInternalRead());
250  }
251 
252  assert(MO.isImm());
253 
254  APInt Imm(64, MO.getImm());
255 
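 // E.g. splitting the 64-bit immediate 0x1111111122222222 yields
 // 0x22222222 for sub0 (low half) and 0x11111111 for sub1 (high half),
 // each returned as a sign-extended 32-bit immediate operand.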
256  switch (SubIdx) {
257  default:
258  llvm_unreachable("do not know how to split immediate with this sub index.");
259  case AMDGPU::sub0:
260  return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
261  case AMDGPU::sub1:
262  return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
263  }
264 }
265 
266 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
267  switch (Opc) {
268  case AMDGPU::G_AND:
269  return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
270  case AMDGPU::G_OR:
271  return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
272  case AMDGPU::G_XOR:
273  return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
274  default:
275  llvm_unreachable("not a bit op");
276  }
277 }
278 
279 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
280  Register DstReg = I.getOperand(0).getReg();
281  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
282 
283  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
284  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
285  DstRB->getID() != AMDGPU::VCCRegBankID)
286  return false;
287 
288  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
289  STI.isWave64());
290  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
291 
292  // Dead implicit-def of scc
293  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
294  true, // isImp
295  false, // isKill
296  true)); // isDead
297  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
298 }
299 
300 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
301  MachineBasicBlock *BB = I.getParent();
302  MachineFunction *MF = BB->getParent();
303  Register DstReg = I.getOperand(0).getReg();
304  const DebugLoc &DL = I.getDebugLoc();
305  LLT Ty = MRI->getType(DstReg);
306  if (Ty.isVector())
307  return false;
308 
309  unsigned Size = Ty.getSizeInBits();
310  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
311  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
312  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
313 
314  if (Size == 32) {
315  if (IsSALU) {
316  const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
317  MachineInstr *Add =
318  BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
319  .add(I.getOperand(1))
320  .add(I.getOperand(2));
321  I.eraseFromParent();
322  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
323  }
324 
325  if (STI.hasAddNoCarry()) {
326  const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
327  I.setDesc(TII.get(Opc));
328  I.addOperand(*MF, MachineOperand::CreateImm(0));
329  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
330  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
331  }
332 
333  const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
334 
335  Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
336  MachineInstr *Add
337  = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
338  .addDef(UnusedCarry, RegState::Dead)
339  .add(I.getOperand(1))
340  .add(I.getOperand(2))
341  .addImm(0);
342  I.eraseFromParent();
343  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
344  }
345 
346  assert(!Sub && "illegal sub should not reach here");
347 
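 // A 64-bit add is lowered here by splitting both operands into 32-bit
 // halves, adding the low halves, propagating the carry into the high-half
 // add (S_ADDC_U32 / V_ADDC_U32_e64), and recombining the two results with
 // a REG_SEQUENCE.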
348  const TargetRegisterClass &RC
349  = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
350  const TargetRegisterClass &HalfRC
351  = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
352 
353  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
354  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
355  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
356  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
357 
358  Register DstLo = MRI->createVirtualRegister(&HalfRC);
359  Register DstHi = MRI->createVirtualRegister(&HalfRC);
360 
361  if (IsSALU) {
362  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
363  .add(Lo1)
364  .add(Lo2);
365  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
366  .add(Hi1)
367  .add(Hi2);
368  } else {
369  const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
370  Register CarryReg = MRI->createVirtualRegister(CarryRC);
371  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
372  .addDef(CarryReg)
373  .add(Lo1)
374  .add(Lo2)
375  .addImm(0);
376  MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
377  .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
378  .add(Hi1)
379  .add(Hi2)
380  .addReg(CarryReg, RegState::Kill)
381  .addImm(0);
382 
383  if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
384  return false;
385  }
386 
387  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
388  .addReg(DstLo)
389  .addImm(AMDGPU::sub0)
390  .addReg(DstHi)
391  .addImm(AMDGPU::sub1);
392 
393 
394  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
395  return false;
396 
397  I.eraseFromParent();
398  return true;
399 }
400 
401 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
402  MachineInstr &I) const {
403  MachineBasicBlock *BB = I.getParent();
404  MachineFunction *MF = BB->getParent();
405  const DebugLoc &DL = I.getDebugLoc();
406  Register Dst0Reg = I.getOperand(0).getReg();
407  Register Dst1Reg = I.getOperand(1).getReg();
408  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
409  I.getOpcode() == AMDGPU::G_UADDE;
410  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
411  I.getOpcode() == AMDGPU::G_USUBE;
412 
413  if (isVCC(Dst1Reg, *MRI)) {
414  unsigned NoCarryOpc =
415  IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
416  unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
417  I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
418  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
419  I.addOperand(*MF, MachineOperand::CreateImm(0));
420  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
421  }
422 
423  Register Src0Reg = I.getOperand(2).getReg();
424  Register Src1Reg = I.getOperand(3).getReg();
425 
426  if (HasCarryIn) {
427  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
428  .addReg(I.getOperand(4).getReg());
429  }
430 
431  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
432  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
433 
434  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
435  .add(I.getOperand(2))
436  .add(I.getOperand(3));
437  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
438  .addReg(AMDGPU::SCC);
439 
440  if (!MRI->getRegClassOrNull(Dst1Reg))
441  MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
442 
443  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
444  !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
445  !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
446  return false;
447 
448  if (HasCarryIn &&
449  !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
450  AMDGPU::SReg_32RegClass, *MRI))
451  return false;
452 
453  I.eraseFromParent();
454  return true;
455 }
456 
457 // TODO: We should probably legalize these to only using 32-bit results.
458 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
459  MachineBasicBlock *BB = I.getParent();
460  Register DstReg = I.getOperand(0).getReg();
461  Register SrcReg = I.getOperand(1).getReg();
462  LLT DstTy = MRI->getType(DstReg);
463  LLT SrcTy = MRI->getType(SrcReg);
464  const unsigned SrcSize = SrcTy.getSizeInBits();
465  unsigned DstSize = DstTy.getSizeInBits();
466 
467  // TODO: Should handle any multiple of 32 offset.
468  unsigned Offset = I.getOperand(2).getImm();
469  if (Offset % 32 != 0 || DstSize > 128)
470  return false;
471 
472  // 16-bit operations really use 32-bit registers.
473  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
474  if (DstSize == 16)
475  DstSize = 32;
476 
477  const TargetRegisterClass *DstRC =
478  TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
479  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
480  return false;
481 
482  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
483  const TargetRegisterClass *SrcRC =
484  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
485  if (!SrcRC)
486  return false;
487  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
488  DstSize / 32);
489  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
490  if (!SrcRC)
491  return false;
492 
493  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
494  *SrcRC, I.getOperand(1));
495  const DebugLoc &DL = I.getDebugLoc();
496  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
497  .addReg(SrcReg, 0, SubReg);
498 
499  I.eraseFromParent();
500  return true;
501 }
502 
503 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
504  MachineBasicBlock *BB = MI.getParent();
505  Register DstReg = MI.getOperand(0).getReg();
506  LLT DstTy = MRI->getType(DstReg);
507  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
508 
509  const unsigned SrcSize = SrcTy.getSizeInBits();
510  if (SrcSize < 32)
511  return selectImpl(MI, *CoverageInfo);
512 
513  const DebugLoc &DL = MI.getDebugLoc();
514  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
515  const unsigned DstSize = DstTy.getSizeInBits();
516  const TargetRegisterClass *DstRC =
517  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
518  if (!DstRC)
519  return false;
520 
521  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
522  MachineInstrBuilder MIB =
523  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
524  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
525  MachineOperand &Src = MI.getOperand(I + 1);
526  MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
527  MIB.addImm(SubRegs[I]);
528 
529  const TargetRegisterClass *SrcRC
530  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
531  if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
532  return false;
533  }
534 
535  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
536  return false;
537 
538  MI.eraseFromParent();
539  return true;
540 }
541 
542 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
543  MachineBasicBlock *BB = MI.getParent();
544  const int NumDst = MI.getNumOperands() - 1;
545 
546  MachineOperand &Src = MI.getOperand(NumDst);
547 
548  Register SrcReg = Src.getReg();
549  Register DstReg0 = MI.getOperand(0).getReg();
550  LLT DstTy = MRI->getType(DstReg0);
551  LLT SrcTy = MRI->getType(SrcReg);
552 
553  const unsigned DstSize = DstTy.getSizeInBits();
554  const unsigned SrcSize = SrcTy.getSizeInBits();
555  const DebugLoc &DL = MI.getDebugLoc();
556  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
557 
558  const TargetRegisterClass *SrcRC =
559  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
560  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
561  return false;
562 
563  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
564  // source, and this relies on the fact that the same subregister indices are
565  // used for both.
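 // E.g. unmerging a 64-bit source into two 32-bit destinations simply emits
 // a COPY from sub0 and a COPY from sub1; a VGPR-bank destination reuses the
 // same sub-register indices as an SGPR one.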
566  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
567  for (int I = 0, E = NumDst; I != E; ++I) {
568  MachineOperand &Dst = MI.getOperand(I);
569  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
570  .addReg(SrcReg, 0, SubRegs[I]);
571 
572  // Make sure the subregister index is valid for the source register.
573  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
574  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
575  return false;
576 
577  const TargetRegisterClass *DstRC =
578  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
579  if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
580  return false;
581  }
582 
583  MI.eraseFromParent();
584  return true;
585 }
586 
587 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
588  MachineInstr &MI) const {
589  if (selectImpl(MI, *CoverageInfo))
590  return true;
591 
592  const LLT S32 = LLT::scalar(32);
593  const LLT V2S16 = LLT::fixed_vector(2, 16);
594 
595  Register Dst = MI.getOperand(0).getReg();
596  if (MRI->getType(Dst) != V2S16)
597  return false;
598 
599  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
600  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
601  return false;
602 
603  Register Src0 = MI.getOperand(1).getReg();
604  Register Src1 = MI.getOperand(2).getReg();
605  if (MRI->getType(Src0) != S32)
606  return false;
607 
608  const DebugLoc &DL = MI.getDebugLoc();
609  MachineBasicBlock *BB = MI.getParent();
610 
611  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
612  if (ConstSrc1) {
613  auto ConstSrc0 =
614  getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
615  if (ConstSrc0) {
616  const int64_t K0 = ConstSrc0->Value.getSExtValue();
617  const int64_t K1 = ConstSrc1->Value.getSExtValue();
618  uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
619  uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
620 
621  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
622  .addImm(Lo16 | (Hi16 << 16));
623  MI.eraseFromParent();
624  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
625  }
626  }
627 
628  // TODO: This should probably be a combine somewhere
629  // (build_vector_trunc $src0, undef -> copy $src0
630  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
631  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
632  MI.setDesc(TII.get(AMDGPU::COPY));
633  MI.RemoveOperand(2);
634  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
635  RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
636  }
637 
638  Register ShiftSrc0;
639  Register ShiftSrc1;
640 
641  // With multiple uses of the shift, this will duplicate the shift and
642  // increase register pressure.
643  //
644  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
645  // => (S_PACK_HH_B32_B16 $src0, $src1)
646  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
647  // => (S_PACK_LH_B32_B16 $src0, $src1)
648  // (build_vector_trunc $src0, $src1)
649  // => (S_PACK_LL_B32_B16 $src0, $src1)
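 // S_PACK_LL_B32_B16 places src0[15:0] in the low half and src1[15:0] in the
 // high half of the result; _LH uses src1[31:16] instead, and _HH uses the
 // high halves of both sources, so a matched lshr-by-16 can be folded into
 // the pack opcode chosen below.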
650 
651  bool Shift0 = mi_match(
652  Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
653 
654  bool Shift1 = mi_match(
655  Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
656 
657  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
658  if (Shift0 && Shift1) {
659  Opc = AMDGPU::S_PACK_HH_B32_B16;
660  MI.getOperand(1).setReg(ShiftSrc0);
661  MI.getOperand(2).setReg(ShiftSrc1);
662  } else if (Shift1) {
663  Opc = AMDGPU::S_PACK_LH_B32_B16;
664  MI.getOperand(2).setReg(ShiftSrc1);
665  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
666  // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
667  auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
668  .addReg(ShiftSrc0)
669  .addImm(16);
670 
671  MI.eraseFromParent();
672  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
673  }
674 
675  MI.setDesc(TII.get(Opc));
676  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
677 }
678 
679 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
680  return selectG_ADD_SUB(I);
681 }
682 
683 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
684  const MachineOperand &MO = I.getOperand(0);
685 
686  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
687  // regbank check here is to know why getConstrainedRegClassForOperand failed.
688  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
689  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
690  (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
691  I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
692  return true;
693  }
694 
695  return false;
696 }
697 
698 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
699  MachineBasicBlock *BB = I.getParent();
700 
701  Register DstReg = I.getOperand(0).getReg();
702  Register Src0Reg = I.getOperand(1).getReg();
703  Register Src1Reg = I.getOperand(2).getReg();
704  LLT Src1Ty = MRI->getType(Src1Reg);
705 
706  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
707  unsigned InsSize = Src1Ty.getSizeInBits();
708 
709  int64_t Offset = I.getOperand(3).getImm();
710 
711  // FIXME: These cases should have been illegal and unnecessary to check here.
712  if (Offset % 32 != 0 || InsSize % 32 != 0)
713  return false;
714 
715  // Currently not handled by getSubRegFromChannel.
716  if (InsSize > 128)
717  return false;
718 
719  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
720  if (SubReg == AMDGPU::NoSubRegister)
721  return false;
722 
723  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
724  const TargetRegisterClass *DstRC =
725  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
726  if (!DstRC)
727  return false;
728 
729  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
730  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
731  const TargetRegisterClass *Src0RC =
732  TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
733  const TargetRegisterClass *Src1RC =
734  TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
735 
736  // Deal with weird cases where the class only partially supports the subreg
737  // index.
738  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
739  if (!Src0RC || !Src1RC)
740  return false;
741 
742  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
743  !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
744  !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
745  return false;
746 
747  const DebugLoc &DL = I.getDebugLoc();
748  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
749  .addReg(Src0Reg)
750  .addReg(Src1Reg)
751  .addImm(SubReg);
752 
753  I.eraseFromParent();
754  return true;
755 }
756 
757 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
758  Register DstReg = MI.getOperand(0).getReg();
759  Register SrcReg = MI.getOperand(1).getReg();
760  Register OffsetReg = MI.getOperand(2).getReg();
761  Register WidthReg = MI.getOperand(3).getReg();
762 
763  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
764  "scalar BFX instructions are expanded in regbankselect");
765  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
766  "64-bit vector BFX instructions are expanded in regbankselect");
767 
768  const DebugLoc &DL = MI.getDebugLoc();
769  MachineBasicBlock *MBB = MI.getParent();
770 
771  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
772  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
773  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
774  .addReg(SrcReg)
775  .addReg(OffsetReg)
776  .addReg(WidthReg);
777  MI.eraseFromParent();
778  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
779 }
780 
781 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
782  if (STI.getLDSBankCount() != 16)
783  return selectImpl(MI, *CoverageInfo);
784 
785  Register Dst = MI.getOperand(0).getReg();
786  Register Src0 = MI.getOperand(2).getReg();
787  Register M0Val = MI.getOperand(6).getReg();
788  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
789  !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
790  !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
791  return false;
792 
793  // This requires 2 instructions. It is possible to write a pattern to support
794  // this, but the generated isel emitter doesn't correctly deal with multiple
795  // output instructions using the same physical register input. The copy to m0
796  // is incorrectly placed before the second instruction.
797  //
798  // TODO: Match source modifiers.
799 
800  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
801  const DebugLoc &DL = MI.getDebugLoc();
802  MachineBasicBlock *MBB = MI.getParent();
803 
804  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
805  .addReg(M0Val);
806  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
807  .addImm(2)
808  .addImm(MI.getOperand(4).getImm()) // $attr
809  .addImm(MI.getOperand(3).getImm()); // $attrchan
810 
811  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
812  .addImm(0) // $src0_modifiers
813  .addReg(Src0) // $src0
814  .addImm(MI.getOperand(4).getImm()) // $attr
815  .addImm(MI.getOperand(3).getImm()) // $attrchan
816  .addImm(0) // $src2_modifiers
817  .addReg(InterpMov) // $src2 - 2 f16 values selected by high
818  .addImm(MI.getOperand(5).getImm()) // $high
819  .addImm(0) // $clamp
820  .addImm(0); // $omod
821 
822  MI.eraseFromParent();
823  return true;
824 }
825 
826 // Writelane is special in that it can use SGPR and M0 (which would normally
827 // count as using the constant bus twice - but in this case it is allowed since
828 // the lane selector doesn't count as a use of the constant bus). However, it is
829 // still required to abide by the 1 SGPR rule. Fix this up if we might have
830 // multiple SGPRs.
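// Concretely, the selection below keeps at most one SGPR on the instruction:
// a constant lane selector becomes an inline immediate, an inlinable value
// immediate avoids the m0 copy entirely, and otherwise the lane selector is
// routed through m0.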
831 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
832  // With a constant bus limit of at least 2, there's no issue.
833  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
834  return selectImpl(MI, *CoverageInfo);
835 
836  MachineBasicBlock *MBB = MI.getParent();
837  const DebugLoc &DL = MI.getDebugLoc();
838  Register VDst = MI.getOperand(0).getReg();
839  Register Val = MI.getOperand(2).getReg();
840  Register LaneSelect = MI.getOperand(3).getReg();
841  Register VDstIn = MI.getOperand(4).getReg();
842 
843  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
844 
845  Optional<ValueAndVReg> ConstSelect =
846  getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
847  if (ConstSelect) {
848  // The selector has to be an inline immediate, so we can use whatever for
849  // the other operands.
850  MIB.addReg(Val);
851  MIB.addImm(ConstSelect->Value.getSExtValue() &
852  maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
853  } else {
854  Optional<ValueAndVReg> ConstVal =
855  getIConstantVRegValWithLookThrough(Val, *MRI);
856 
857  // If the value written is an inline immediate, we can get away without a
858  // copy to m0.
859  if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
860  STI.hasInv2PiInlineImm())) {
861  MIB.addImm(ConstVal->Value.getSExtValue());
862  MIB.addReg(LaneSelect);
863  } else {
864  MIB.addReg(Val);
865 
866  // If the lane selector was originally in a VGPR and copied with
867  // readfirstlane, there's a hazard to read the same SGPR from the
868  // VALU. Constrain to a different SGPR to help avoid needing a nop later.
869  RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
870 
871  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
872  .addReg(LaneSelect);
873  MIB.addReg(AMDGPU::M0);
874  }
875  }
876 
877  MIB.addReg(VDstIn);
878 
879  MI.eraseFromParent();
880  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
881 }
882 
883 // We need to handle this here because tablegen doesn't support matching
884 // instructions with multiple outputs.
885 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
886  Register Dst0 = MI.getOperand(0).getReg();
887  Register Dst1 = MI.getOperand(1).getReg();
888 
889  LLT Ty = MRI->getType(Dst0);
890  unsigned Opc;
891  if (Ty == LLT::scalar(32))
892  Opc = AMDGPU::V_DIV_SCALE_F32_e64;
893  else if (Ty == LLT::scalar(64))
894  Opc = AMDGPU::V_DIV_SCALE_F64_e64;
895  else
896  return false;
897 
898  // TODO: Match source modifiers.
899 
900  const DebugLoc &DL = MI.getDebugLoc();
901  MachineBasicBlock *MBB = MI.getParent();
902 
903  Register Numer = MI.getOperand(3).getReg();
904  Register Denom = MI.getOperand(4).getReg();
905  unsigned ChooseDenom = MI.getOperand(5).getImm();
906 
907  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
908 
909  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
910  .addDef(Dst1)
911  .addImm(0) // $src0_modifiers
912  .addUse(Src0) // $src0
913  .addImm(0) // $src1_modifiers
914  .addUse(Denom) // $src1
915  .addImm(0) // $src2_modifiers
916  .addUse(Numer) // $src2
917  .addImm(0) // $clamp
918  .addImm(0); // $omod
919 
920  MI.eraseFromParent();
921  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
922 }
923 
924 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
925  unsigned IntrinsicID = I.getIntrinsicID();
926  switch (IntrinsicID) {
927  case Intrinsic::amdgcn_if_break: {
928  MachineBasicBlock *BB = I.getParent();
929 
930  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
931  // SelectionDAG uses for wave32 vs wave64.
932  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
933  .add(I.getOperand(0))
934  .add(I.getOperand(2))
935  .add(I.getOperand(3));
936 
937  Register DstReg = I.getOperand(0).getReg();
938  Register Src0Reg = I.getOperand(2).getReg();
939  Register Src1Reg = I.getOperand(3).getReg();
940 
941  I.eraseFromParent();
942 
943  for (Register Reg : { DstReg, Src0Reg, Src1Reg })
944  MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
945 
946  return true;
947  }
948  case Intrinsic::amdgcn_interp_p1_f16:
949  return selectInterpP1F16(I);
950  case Intrinsic::amdgcn_wqm:
951  return constrainCopyLikeIntrin(I, AMDGPU::WQM);
952  case Intrinsic::amdgcn_softwqm:
953  return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
954  case Intrinsic::amdgcn_strict_wwm:
955  case Intrinsic::amdgcn_wwm:
956  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
957  case Intrinsic::amdgcn_strict_wqm:
958  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
959  case Intrinsic::amdgcn_writelane:
960  return selectWritelane(I);
961  case Intrinsic::amdgcn_div_scale:
962  return selectDivScale(I);
963  case Intrinsic::amdgcn_icmp:
964  return selectIntrinsicIcmp(I);
965  case Intrinsic::amdgcn_ballot:
966  return selectBallot(I);
967  case Intrinsic::amdgcn_reloc_constant:
968  return selectRelocConstant(I);
969  case Intrinsic::amdgcn_groupstaticsize:
970  return selectGroupStaticSize(I);
971  case Intrinsic::returnaddress:
972  return selectReturnAddress(I);
973  default:
974  return selectImpl(I, *CoverageInfo);
975  }
976 }
977 
978 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
979  if (Size != 32 && Size != 64)
980  return -1;
981  switch (P) {
982  default:
983  llvm_unreachable("Unknown condition code!");
984  case CmpInst::ICMP_NE:
985  return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
986  case CmpInst::ICMP_EQ:
987  return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
988  case CmpInst::ICMP_SGT:
989  return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
990  case CmpInst::ICMP_SGE:
991  return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
992  case CmpInst::ICMP_SLT:
993  return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
994  case CmpInst::ICMP_SLE:
995  return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
996  case CmpInst::ICMP_UGT:
997  return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
998  case CmpInst::ICMP_UGE:
999  return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
1000  case CmpInst::ICMP_ULT:
1001  return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
1002  case CmpInst::ICMP_ULE:
1003  return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
1004  }
1005 }
1006 
1007 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1008  unsigned Size) const {
1009  if (Size == 64) {
1010  if (!STI.hasScalarCompareEq64())
1011  return -1;
1012 
1013  switch (P) {
1014  case CmpInst::ICMP_NE:
1015  return AMDGPU::S_CMP_LG_U64;
1016  case CmpInst::ICMP_EQ:
1017  return AMDGPU::S_CMP_EQ_U64;
1018  default:
1019  return -1;
1020  }
1021  }
1022 
1023  if (Size != 32)
1024  return -1;
1025 
1026  switch (P) {
1027  case CmpInst::ICMP_NE:
1028  return AMDGPU::S_CMP_LG_U32;
1029  case CmpInst::ICMP_EQ:
1030  return AMDGPU::S_CMP_EQ_U32;
1031  case CmpInst::ICMP_SGT:
1032  return AMDGPU::S_CMP_GT_I32;
1033  case CmpInst::ICMP_SGE:
1034  return AMDGPU::S_CMP_GE_I32;
1035  case CmpInst::ICMP_SLT:
1036  return AMDGPU::S_CMP_LT_I32;
1037  case CmpInst::ICMP_SLE:
1038  return AMDGPU::S_CMP_LE_I32;
1039  case CmpInst::ICMP_UGT:
1040  return AMDGPU::S_CMP_GT_U32;
1041  case CmpInst::ICMP_UGE:
1042  return AMDGPU::S_CMP_GE_U32;
1043  case CmpInst::ICMP_ULT:
1044  return AMDGPU::S_CMP_LT_U32;
1045  case CmpInst::ICMP_ULE:
1046  return AMDGPU::S_CMP_LE_U32;
1047  default:
1048  llvm_unreachable("Unknown condition code!");
1049  }
1050 }
1051 
1052 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1053  MachineBasicBlock *BB = I.getParent();
1054  const DebugLoc &DL = I.getDebugLoc();
1055 
1056  Register SrcReg = I.getOperand(2).getReg();
1057  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1058 
1059  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1060 
1061  Register CCReg = I.getOperand(0).getReg();
1062  if (!isVCC(CCReg, *MRI)) {
1063  int Opcode = getS_CMPOpcode(Pred, Size);
1064  if (Opcode == -1)
1065  return false;
1066  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1067  .add(I.getOperand(2))
1068  .add(I.getOperand(3));
1069  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1070  .addReg(AMDGPU::SCC);
1071  bool Ret =
1072  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1073  RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1074  I.eraseFromParent();
1075  return Ret;
1076  }
1077 
1078  int Opcode = getV_CMPOpcode(Pred, Size);
1079  if (Opcode == -1)
1080  return false;
1081 
1082  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1083  I.getOperand(0).getReg())
1084  .add(I.getOperand(2))
1085  .add(I.getOperand(3));
1086  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1087  *TRI.getBoolRC(), *MRI);
1088  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1089  I.eraseFromParent();
1090  return Ret;
1091 }
1092 
1093 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1094  Register Dst = I.getOperand(0).getReg();
1095  if (isVCC(Dst, *MRI))
1096  return false;
1097 
1098  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1099  return false;
1100 
1101  MachineBasicBlock *BB = I.getParent();
1102  const DebugLoc &DL = I.getDebugLoc();
1103  Register SrcReg = I.getOperand(2).getReg();
1104  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1105  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1106 
1107  int Opcode = getV_CMPOpcode(Pred, Size);
1108  if (Opcode == -1)
1109  return false;
1110 
1111  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1112  .add(I.getOperand(2))
1113  .add(I.getOperand(3));
1114  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1115  *MRI);
1116  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1117  I.eraseFromParent();
1118  return Ret;
1119 }
1120 
1121 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1122  MachineBasicBlock *BB = I.getParent();
1123  const DebugLoc &DL = I.getDebugLoc();
1124  Register DstReg = I.getOperand(0).getReg();
1125  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1126  const bool Is64 = Size == 64;
1127 
1128  if (Size != STI.getWavefrontSize())
1129  return false;
1130 
1131  Optional<ValueAndVReg> Arg =
1132  getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1133 
1134  if (Arg.hasValue()) {
1135  const int64_t Value = Arg.getValue().Value.getSExtValue();
1136  if (Value == 0) {
1137  unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1138  BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1139  } else if (Value == -1) { // all ones
1140  Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1141  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1142  } else
1143  return false;
1144  } else {
1145  Register SrcReg = I.getOperand(2).getReg();
1146  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1147  }
1148 
1149  I.eraseFromParent();
1150  return true;
1151 }
1152 
1153 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1154  Register DstReg = I.getOperand(0).getReg();
1155  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1156  const TargetRegisterClass *DstRC =
1157  TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1158  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1159  return false;
1160 
1161  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1162 
1163  Module *M = MF->getFunction().getParent();
1164  const MDNode *Metadata = I.getOperand(2).getMetadata();
1165  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1166  auto RelocSymbol = cast<GlobalVariable>(
1167  M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1168 
1169  MachineBasicBlock *BB = I.getParent();
1170  BuildMI(*BB, &I, I.getDebugLoc(),
1171  TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1172  .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1173 
1174  I.eraseFromParent();
1175  return true;
1176 }
1177 
1178 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1179  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1180 
1181  Register DstReg = I.getOperand(0).getReg();
1182  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1183  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1184  AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1185 
1186  MachineBasicBlock *MBB = I.getParent();
1187  const DebugLoc &DL = I.getDebugLoc();
1188 
1189  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1190 
1191  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1192  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1193  MIB.addImm(MFI->getLDSSize());
1194  } else {
1195  Module *M = MF->getFunction().getParent();
1196  const GlobalValue *GV
1197  = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1198  MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1199  }
1200 
1201  I.eraseFromParent();
1202  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1203 }
1204 
1205 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1206  MachineBasicBlock *MBB = I.getParent();
1207  MachineFunction &MF = *MBB->getParent();
1208  const DebugLoc &DL = I.getDebugLoc();
1209 
1210  MachineOperand &Dst = I.getOperand(0);
1211  Register DstReg = Dst.getReg();
1212  unsigned Depth = I.getOperand(2).getImm();
1213 
1214  const TargetRegisterClass *RC
1215  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1216  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1217  !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1218  return false;
1219 
1220  // Check for kernel and shader functions
1221  if (Depth != 0 ||
1222  MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1223  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1224  .addImm(0);
1225  I.eraseFromParent();
1226  return true;
1227  }
1228 
1229  MachineFrameInfo &MFI = MF.getFrameInfo();
1230  // There is a call to @llvm.returnaddress in this function
1231  MFI.setReturnAddressIsTaken(true);
1232 
1233  // Get the return address reg and mark it as an implicit live-in
1234  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1235  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1236  AMDGPU::SReg_64RegClass, DL);
1237  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1238  .addReg(LiveIn);
1239  I.eraseFromParent();
1240  return true;
1241 }
1242 
1243 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1244  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1245  // SelectionDAG uses for wave32 vs wave64.
1246  MachineBasicBlock *BB = MI.getParent();
1247  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1248  .add(MI.getOperand(1));
1249 
1250  Register Reg = MI.getOperand(1).getReg();
1251  MI.eraseFromParent();
1252 
1253  if (!MRI->getRegClassOrNull(Reg))
1254  MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1255  return true;
1256 }
1257 
1258 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1259  MachineInstr &MI, Intrinsic::ID IntrID) const {
1260  MachineBasicBlock *MBB = MI.getParent();
1261  MachineFunction *MF = MBB->getParent();
1262  const DebugLoc &DL = MI.getDebugLoc();
1263 
1264  unsigned IndexOperand = MI.getOperand(7).getImm();
1265  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1266  bool WaveDone = MI.getOperand(9).getImm() != 0;
1267 
1268  if (WaveDone && !WaveRelease)
1269  report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1270 
1271  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1272  IndexOperand &= ~0x3f;
1273  unsigned CountDw = 0;
1274 
1275  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1276  CountDw = (IndexOperand >> 24) & 0xf;
1277  IndexOperand &= ~(0xf << 24);
1278 
1279  if (CountDw < 1 || CountDw > 4) {
1281  "ds_ordered_count: dword count must be between 1 and 4");
1282  }
1283  }
1284 
1285  if (IndexOperand)
1286  report_fatal_error("ds_ordered_count: bad index operand");
1287 
1288  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1289  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1290 
1291  unsigned Offset0 = OrderedCountIndex << 2;
1292  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1293  (Instruction << 4);
1294 
1295  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1296  Offset1 |= (CountDw - 1) << 6;
1297 
1298  unsigned Offset = Offset0 | (Offset1 << 8);
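 // E.g. with OrderedCountIndex = 1, WaveRelease = 1, WaveDone = 0,
 // ShaderType = 0 and Instruction = 0 (ds_ordered_add), Offset0 = 0x4 and
 // Offset1 = 0x1, so the combined offset field is 0x104 (plus the encoded
 // dword count on GFX10+).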
1299 
1300  Register M0Val = MI.getOperand(2).getReg();
1301  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1302  .addReg(M0Val);
1303 
1304  Register DstReg = MI.getOperand(0).getReg();
1305  Register ValReg = MI.getOperand(3).getReg();
1306  MachineInstr *DS =
1307  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1308  .addReg(ValReg)
1309  .addImm(Offset)
1310  .cloneMemRefs(MI);
1311 
1312  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1313  return false;
1314 
1315  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1316  MI.eraseFromParent();
1317  return Ret;
1318 }
1319 
1320 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1321  switch (IntrID) {
1322  case Intrinsic::amdgcn_ds_gws_init:
1323  return AMDGPU::DS_GWS_INIT;
1324  case Intrinsic::amdgcn_ds_gws_barrier:
1325  return AMDGPU::DS_GWS_BARRIER;
1326  case Intrinsic::amdgcn_ds_gws_sema_v:
1327  return AMDGPU::DS_GWS_SEMA_V;
1328  case Intrinsic::amdgcn_ds_gws_sema_br:
1329  return AMDGPU::DS_GWS_SEMA_BR;
1330  case Intrinsic::amdgcn_ds_gws_sema_p:
1331  return AMDGPU::DS_GWS_SEMA_P;
1332  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1333  return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1334  default:
1335  llvm_unreachable("not a gws intrinsic");
1336  }
1337 }
1338 
1339 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1340  Intrinsic::ID IID) const {
1341  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1342  !STI.hasGWSSemaReleaseAll())
1343  return false;
1344 
1345  // intrinsic ID, vsrc, offset
1346  const bool HasVSrc = MI.getNumOperands() == 3;
1347  assert(HasVSrc || MI.getNumOperands() == 2);
1348 
1349  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1350  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1351  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1352  return false;
1353 
1354  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1355  assert(OffsetDef);
1356 
1357  unsigned ImmOffset;
1358 
1359  MachineBasicBlock *MBB = MI.getParent();
1360  const DebugLoc &DL = MI.getDebugLoc();
1361 
1362  MachineInstr *Readfirstlane = nullptr;
1363 
1364  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1365  // incoming offset, in case there's an add of a constant. We'll have to put it
1366  // back later.
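 // I.e. for (readfirstlane (add %x, C)) the constant C is peeled off as the
 // immediate offset below, and the readfirstlane is re-attached to the
 // remaining variable part %x.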
1367  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1368  Readfirstlane = OffsetDef;
1369  BaseOffset = OffsetDef->getOperand(1).getReg();
1370  OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1371  }
1372 
1373  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1374  // If we have a constant offset, try to use the 0 in m0 as the base.
1375  // TODO: Look into changing the default m0 initialization value. If the
1376  // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1377  // the immediate offset.
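 // So a purely constant GWS offset is selected as m0 = 0 with the constant
 // carried entirely in the instruction's immediate offset field.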
1378 
1379  ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1380  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1381  .addImm(0);
1382  } else {
1383  std::tie(BaseOffset, ImmOffset) =
1384  AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1385 
1386  if (Readfirstlane) {
1387  // We have the constant offset now, so put the readfirstlane back on the
1388  // variable component.
1389  if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1390  return false;
1391 
1392  Readfirstlane->getOperand(1).setReg(BaseOffset);
1393  BaseOffset = Readfirstlane->getOperand(0).getReg();
1394  } else {
1395  if (!RBI.constrainGenericRegister(BaseOffset,
1396  AMDGPU::SReg_32RegClass, *MRI))
1397  return false;
1398  }
1399 
1400  Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1401  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1402  .addReg(BaseOffset)
1403  .addImm(16);
1404 
1405  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1406  .addReg(M0Base);
1407  }
1408 
1409  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1410  // offset field) % 64. Some versions of the programming guide omit the m0
1411  // part, or claim it's from offset 0.
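 // In the variable-offset path above, the base was shifted into M0[21:16]
 // (the S_LSHL_B32 by 16), so the hardware effectively adds M0[21:16] to the
 // immediate offset field emitted below, modulo 64.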
1412  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1413 
1414  if (HasVSrc) {
1415  Register VSrc = MI.getOperand(1).getReg();
1416 
1417  if (STI.needsAlignedVGPRs()) {
1418  // Add implicit aligned super-reg to force alignment on the data operand.
1419  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1420  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1421  Register NewVR =
1422  MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
1423  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
1424  .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
1425  .addImm(AMDGPU::sub0)
1426  .addReg(Undef)
1427  .addImm(AMDGPU::sub1);
1428  MIB.addReg(NewVR, 0, AMDGPU::sub0);
1429  MIB.addReg(NewVR, RegState::Implicit);
1430  } else {
1431  MIB.addReg(VSrc);
1432  }
1433 
1434  if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1435  return false;
1436  }
1437 
1438  MIB.addImm(ImmOffset)
1439  .cloneMemRefs(MI);
1440 
1441  MI.eraseFromParent();
1442  return true;
1443 }
1444 
1445 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1446  bool IsAppend) const {
1447  Register PtrBase = MI.getOperand(2).getReg();
1448  LLT PtrTy = MRI->getType(PtrBase);
1449  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1450 
1451  unsigned Offset;
1452  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1453 
1454  // TODO: Should this try to look through readfirstlane like GWS?
1455  if (!isDSOffsetLegal(PtrBase, Offset)) {
1456  PtrBase = MI.getOperand(2).getReg();
1457  Offset = 0;
1458  }
1459 
1460  MachineBasicBlock *MBB = MI.getParent();
1461  const DebugLoc &DL = MI.getDebugLoc();
1462  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1463 
1464  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1465  .addReg(PtrBase);
1466  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1467  return false;
1468 
1469  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1470  .addImm(Offset)
1471  .addImm(IsGDS ? -1 : 0)
1472  .cloneMemRefs(MI);
1473  MI.eraseFromParent();
1474  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1475 }
1476 
1477 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1478  if (TM.getOptLevel() > CodeGenOpt::None) {
1479  unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1480  if (WGSize <= STI.getWavefrontSize()) {
1481  MachineBasicBlock *MBB = MI.getParent();
1482  const DebugLoc &DL = MI.getDebugLoc();
1483  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1484  MI.eraseFromParent();
1485  return true;
1486  }
1487  }
1488  return selectImpl(MI, *CoverageInfo);
1489 }
1490 
1491 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1492  bool &IsTexFail) {
1493  if (TexFailCtrl)
1494  IsTexFail = true;
1495 
1496  TFE = (TexFailCtrl & 0x1) ? true : false;
1497  TexFailCtrl &= ~(uint64_t)0x1;
1498  LWE = (TexFailCtrl & 0x2) ? true : false;
1499  TexFailCtrl &= ~(uint64_t)0x2;
1500 
1501  return TexFailCtrl == 0;
1502 }
1503 
1504 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1505  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1506  MachineBasicBlock *MBB = MI.getParent();
1507  const DebugLoc &DL = MI.getDebugLoc();
1508 
1509  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1510  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1511 
1512  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1513  unsigned IntrOpcode = Intr->BaseOpcode;
1514  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1515 
1516  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1517 
1518  Register VDataIn, VDataOut;
1519  LLT VDataTy;
1520  int NumVDataDwords = -1;
1521  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1522  MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1523 
1524  bool Unorm;
1525  if (!BaseOpcode->Sampler)
1526  Unorm = true;
1527  else
1528  Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1529 
1530  bool TFE;
1531  bool LWE;
1532  bool IsTexFail = false;
1533  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1534  TFE, LWE, IsTexFail))
1535  return false;
1536 
1537  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1538  const bool IsA16 = (Flags & 1) != 0;
1539  const bool IsG16 = (Flags & 2) != 0;
1540 
1541  // A16 implies 16 bit gradients if subtarget doesn't support G16
1542  if (IsA16 && !STI.hasG16() && !IsG16)
1543  return false;
1544 
1545  unsigned DMask = 0;
1546  unsigned DMaskLanes = 0;
1547 
1548  if (BaseOpcode->Atomic) {
1549  VDataOut = MI.getOperand(0).getReg();
1550  VDataIn = MI.getOperand(2).getReg();
1551  LLT Ty = MRI->getType(VDataIn);
1552 
1553  // Be careful to allow atomic swap on 16-bit element vectors.
1554  const bool Is64Bit = BaseOpcode->AtomicX2 ?
1555  Ty.getSizeInBits() == 128 :
1556  Ty.getSizeInBits() == 64;
1557 
1558  if (BaseOpcode->AtomicX2) {
1559  assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1560 
1561  DMask = Is64Bit ? 0xf : 0x3;
1562  NumVDataDwords = Is64Bit ? 4 : 2;
1563  } else {
1564  DMask = Is64Bit ? 0x3 : 0x1;
1565  NumVDataDwords = Is64Bit ? 2 : 1;
1566  }
1567  } else {
1568  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1569  DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1570 
1571  if (BaseOpcode->Store) {
1572  VDataIn = MI.getOperand(1).getReg();
1573  VDataTy = MRI->getType(VDataIn);
1574  NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1575  } else {
1576  VDataOut = MI.getOperand(0).getReg();
1577  VDataTy = MRI->getType(VDataOut);
1578  NumVDataDwords = DMaskLanes;
1579 
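 // With packed d16 memory, two 16-bit components share one dword, so e.g. a
 // d16 load with dmask 0x7 (3 lanes) needs only (3 + 1) / 2 = 2 result dwords.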
1580  if (IsD16 && !STI.hasUnpackedD16VMem())
1581  NumVDataDwords = (DMaskLanes + 1) / 2;
1582  }
1583  }
1584 
1585  // Set G16 opcode
1586  if (IsG16 && !IsA16) {
1587  const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1588  AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1589  assert(G16MappingInfo);
1590  IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1591  }
1592 
1593  // TODO: Check this in verifier.
1594  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1595 
1596  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1597  if (BaseOpcode->Atomic)
1598  CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1599  if (CPol & ~AMDGPU::CPol::ALL)
1600  return false;
1601 
1602  int NumVAddrRegs = 0;
1603  int NumVAddrDwords = 0;
1604  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1605  // Skip the $noregs and 0s inserted during legalization.
1606  MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1607  if (!AddrOp.isReg())
1608  continue; // XXX - Break?
1609 
1610  Register Addr = AddrOp.getReg();
1611  if (!Addr)
1612  break;
1613 
1614  ++NumVAddrRegs;
1615  NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1616  }
1617 
1618  // The legalizer preprocessed the intrinsic arguments. If we aren't using
1619  // NSA, these should have been packed into a single value in the first
1620  // address register
1621  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1622  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1623  LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1624  return false;
1625  }
1626 
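// TFE/LWE returns an extra status dword in addition to the image data.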
1627  if (IsTexFail)
1628  ++NumVDataDwords;
1629 
1630  int Opcode = -1;
1631  if (IsGFX10Plus) {
1632  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1633  UseNSA ? AMDGPU::MIMGEncGfx10NSA
1634  : AMDGPU::MIMGEncGfx10Default,
1635  NumVDataDwords, NumVAddrDwords);
1636  } else {
1638  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1639  NumVDataDwords, NumVAddrDwords);
1640  if (Opcode == -1)
1641  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1642  NumVDataDwords, NumVAddrDwords);
1643  }
1644  assert(Opcode != -1);
1645 
1646  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1647  .cloneMemRefs(MI);
1648 
1649  if (VDataOut) {
1650  if (BaseOpcode->AtomicX2) {
1651  const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1652 
1653  Register TmpReg = MRI->createVirtualRegister(
1654  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1655  unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1656 
1657  MIB.addDef(TmpReg);
1658  if (!MRI->use_empty(VDataOut)) {
1659  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1660  .addReg(TmpReg, RegState::Kill, SubReg);
1661  }
1662 
1663  } else {
1664  MIB.addDef(VDataOut); // vdata output
1665  }
1666  }
1667 
1668  if (VDataIn)
1669  MIB.addReg(VDataIn); // vdata input
1670 
1671  for (int I = 0; I != NumVAddrRegs; ++I) {
1672  MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1673  if (SrcOp.isReg()) {
1674  assert(SrcOp.getReg() != 0);
1675  MIB.addReg(SrcOp.getReg());
1676  }
1677  }
1678 
1679  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1680  if (BaseOpcode->Sampler)
1681  MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1682 
1683  MIB.addImm(DMask); // dmask
1684 
1685  if (IsGFX10Plus)
1686  MIB.addImm(DimInfo->Encoding);
1687  MIB.addImm(Unorm);
1688 
1689  MIB.addImm(CPol);
1690  MIB.addImm(IsA16 && // a16 or r128
1691  STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1692  if (IsGFX10Plus)
1693  MIB.addImm(IsA16 ? -1 : 0);
1694 
1695  MIB.addImm(TFE); // tfe
1696  MIB.addImm(LWE); // lwe
1697  if (!IsGFX10Plus)
1698  MIB.addImm(DimInfo->DA ? -1 : 0);
1699  if (BaseOpcode->HasD16)
1700  MIB.addImm(IsD16 ? -1 : 0);
1701 
1702  if (IsTexFail) {
1703  // An image load instruction with TFE/LWE only conditionally writes to its
1704  // result registers. Initialize them to zero so that we always get well
1705  // defined result values.
1706  assert(VDataOut && !VDataIn);
1707  Register Tied = MRI->cloneVirtualRegister(VDataOut);
1708  Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1709  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1710  .addImm(0);
1711  auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1712  if (STI.usePRTStrictNull()) {
1713  // With enable-prt-strict-null enabled, initialize all result registers to
1714  // zero.
1715  auto RegSeq =
1716  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1717  for (auto Sub : Parts)
1718  RegSeq.addReg(Zero).addImm(Sub);
1719  } else {
1720  // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1721  // result register.
1722  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1723  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1724  auto RegSeq =
1725  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1726  for (auto Sub : Parts.drop_back(1))
1727  RegSeq.addReg(Undef).addImm(Sub);
1728  RegSeq.addReg(Zero).addImm(Parts.back());
1729  }
1730  MIB.addReg(Tied, RegState::Implicit);
1731  MIB->tieOperands(0, MIB->getNumOperands() - 1);
1732  }
1733 
1734  MI.eraseFromParent();
1735  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1736 }
1737 
1738 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1739  MachineInstr &I) const {
1740  unsigned IntrinsicID = I.getIntrinsicID();
1741  switch (IntrinsicID) {
1742  case Intrinsic::amdgcn_end_cf:
1743  return selectEndCfIntrinsic(I);
1744  case Intrinsic::amdgcn_ds_ordered_add:
1745  case Intrinsic::amdgcn_ds_ordered_swap:
1746  return selectDSOrderedIntrinsic(I, IntrinsicID);
1747  case Intrinsic::amdgcn_ds_gws_init:
1748  case Intrinsic::amdgcn_ds_gws_barrier:
1749  case Intrinsic::amdgcn_ds_gws_sema_v:
1750  case Intrinsic::amdgcn_ds_gws_sema_br:
1751  case Intrinsic::amdgcn_ds_gws_sema_p:
1752  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1753  return selectDSGWSIntrinsic(I, IntrinsicID);
1754  case Intrinsic::amdgcn_ds_append:
1755  return selectDSAppendConsume(I, true);
1756  case Intrinsic::amdgcn_ds_consume:
1757  return selectDSAppendConsume(I, false);
1758  case Intrinsic::amdgcn_s_barrier:
1759  return selectSBarrier(I);
1760  case Intrinsic::amdgcn_global_atomic_fadd:
1761  return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
1762  default: {
1763  return selectImpl(I, *CoverageInfo);
1764  }
1765  }
1766 }
1767 
1768 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1769  if (selectImpl(I, *CoverageInfo))
1770  return true;
1771 
1772  MachineBasicBlock *BB = I.getParent();
1773  const DebugLoc &DL = I.getDebugLoc();
1774 
1775  Register DstReg = I.getOperand(0).getReg();
1776  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1777  assert(Size <= 32 || Size == 64);
1778  const MachineOperand &CCOp = I.getOperand(1);
1779  Register CCReg = CCOp.getReg();
1780  if (!isVCC(CCReg, *MRI)) {
1781  unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1782  AMDGPU::S_CSELECT_B32;
1783  MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1784  .addReg(CCReg);
1785 
1786  // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1787  // bank, because it does not cover the register class we use to represent it.
1788  // So we need to set the register class manually here.
1789  if (!MRI->getRegClassOrNull(CCReg))
1790  MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1791  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1792  .add(I.getOperand(2))
1793  .add(I.getOperand(3));
1794 
1795  bool Ret = false;
1796  Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1797  Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1798  I.eraseFromParent();
1799  return Ret;
1800  }
1801 
1802  // Wide VGPR select should have been split in RegBankSelect.
1803  if (Size > 32)
1804  return false;
1805 
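// V_CNDMASK_B32 returns src1 when the condition bit is set, so the false
// value (operand 3) goes in src0 and the true value (operand 2) in src1;
// the zero immediates are the source modifiers.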
1806  MachineInstr *Select =
1807  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1808  .addImm(0)
1809  .add(I.getOperand(3))
1810  .addImm(0)
1811  .add(I.getOperand(2))
1812  .add(I.getOperand(1));
1813 
1814  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1815  I.eraseFromParent();
1816  return Ret;
1817 }
1818 
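/// \returns the subregister index covering the low \p Size bits of a
/// register, or -1 if the size is too wide to map.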
1819 static int sizeToSubRegIndex(unsigned Size) {
1820  switch (Size) {
1821  case 32:
1822  return AMDGPU::sub0;
1823  case 64:
1824  return AMDGPU::sub0_sub1;
1825  case 96:
1826  return AMDGPU::sub0_sub1_sub2;
1827  case 128:
1828  return AMDGPU::sub0_sub1_sub2_sub3;
1829  case 256:
1830  return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1831  default:
1832  if (Size < 32)
1833  return AMDGPU::sub0;
1834  if (Size > 256)
1835  return -1;
1836  return sizeToSubRegIndex(PowerOf2Ceil(Size));
1837  }
1838 }
1839 
1840 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1841  Register DstReg = I.getOperand(0).getReg();
1842  Register SrcReg = I.getOperand(1).getReg();
1843  const LLT DstTy = MRI->getType(DstReg);
1844  const LLT SrcTy = MRI->getType(SrcReg);
1845  const LLT S1 = LLT::scalar(1);
1846 
1847  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1848  const RegisterBank *DstRB;
1849  if (DstTy == S1) {
1850  // This is a special case. We don't treat s1 for legalization artifacts as
1851  // vcc booleans.
1852  DstRB = SrcRB;
1853  } else {
1854  DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1855  if (SrcRB != DstRB)
1856  return false;
1857  }
1858 
1859  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1860 
1861  unsigned DstSize = DstTy.getSizeInBits();
1862  unsigned SrcSize = SrcTy.getSizeInBits();
1863 
1864  const TargetRegisterClass *SrcRC
1865  = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1866  const TargetRegisterClass *DstRC
1867  = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1868  if (!SrcRC || !DstRC)
1869  return false;
1870 
1871  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1872  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1873  LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1874  return false;
1875  }
1876 
1877  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
1878  MachineBasicBlock *MBB = I.getParent();
1879  const DebugLoc &DL = I.getDebugLoc();
1880 
1881  Register LoReg = MRI->createVirtualRegister(DstRC);
1882  Register HiReg = MRI->createVirtualRegister(DstRC);
1883  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1884  .addReg(SrcReg, 0, AMDGPU::sub0);
1885  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1886  .addReg(SrcReg, 0, AMDGPU::sub1);
1887 
1888  if (IsVALU && STI.hasSDWA()) {
1889  // Write the low 16-bits of the high element into the high 16-bits of the
1890  // low element.
1891  MachineInstr *MovSDWA =
1892  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1893  .addImm(0) // $src0_modifiers
1894  .addReg(HiReg) // $src0
1895  .addImm(0) // $clamp
1896  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1897  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1898  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1899  .addReg(LoReg, RegState::Implicit);
1900  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1901  } else {
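// Without SDWA, assemble the result manually: shift the high element into
// the upper 16 bits, mask the low element to 16 bits, and OR the halves.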
1902  Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1903  Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1904  Register ImmReg = MRI->createVirtualRegister(DstRC);
1905  if (IsVALU) {
1906  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1907  .addImm(16)
1908  .addReg(HiReg);
1909  } else {
1910  BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1911  .addReg(HiReg)
1912  .addImm(16);
1913  }
1914 
1915  unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1916  unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1917  unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1918 
1919  BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1920  .addImm(0xffff);
1921  BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1922  .addReg(LoReg)
1923  .addReg(ImmReg);
1924  BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1925  .addReg(TmpReg0)
1926  .addReg(TmpReg1);
1927  }
1928 
1929  I.eraseFromParent();
1930  return true;
1931  }
1932 
1933  if (!DstTy.isScalar())
1934  return false;
1935 
1936  if (SrcSize > 32) {
1937  int SubRegIdx = sizeToSubRegIndex(DstSize);
1938  if (SubRegIdx == -1)
1939  return false;
1940 
1941  // Deal with weird cases where the class only partially supports the subreg
1942  // index.
1943  const TargetRegisterClass *SrcWithSubRC
1944  = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1945  if (!SrcWithSubRC)
1946  return false;
1947 
1948  if (SrcWithSubRC != SrcRC) {
1949  if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1950  return false;
1951  }
1952 
1953  I.getOperand(1).setSubReg(SubRegIdx);
1954  }
1955 
1956  I.setDesc(TII.get(TargetOpcode::COPY));
1957  return true;
1958 }
1959 
1960 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1961 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1962  Mask = maskTrailingOnes<unsigned>(Size);
1963  int SignedMask = static_cast<int>(Mask);
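// AMDGPU integer inline constants span [-16, 64], so masks in that range
// need no extra literal.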
1964  return SignedMask >= -16 && SignedMask <= 64;
1965 }
1966 
1967 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1968 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1969  Register Reg, const MachineRegisterInfo &MRI,
1970  const TargetRegisterInfo &TRI) const {
1971  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1972  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1973  return RB;
1974 
1975  // Ignore the type, since we don't use vcc in artifacts.
1976  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1977  return &RBI.getRegBankFromRegClass(*RC, LLT());
1978  return nullptr;
1979 }
1980 
1981 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1982  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1983  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1984  const DebugLoc &DL = I.getDebugLoc();
1985  MachineBasicBlock &MBB = *I.getParent();
1986  const Register DstReg = I.getOperand(0).getReg();
1987  const Register SrcReg = I.getOperand(1).getReg();
1988 
1989  const LLT DstTy = MRI->getType(DstReg);
1990  const LLT SrcTy = MRI->getType(SrcReg);
1991  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1992  I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1993  const unsigned DstSize = DstTy.getSizeInBits();
1994  if (!DstTy.isScalar())
1995  return false;
1996 
1997  // Artifact casts should never use vcc.
1998  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1999 
2000  // FIXME: This should probably be illegal and split earlier.
2001  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2002  if (DstSize <= 32)
2003  return selectCOPY(I);
2004 
2005  const TargetRegisterClass *SrcRC =
2006  TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
2007  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2008  const TargetRegisterClass *DstRC =
2009  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
2010 
2011  Register UndefReg = MRI->createVirtualRegister(SrcRC);
2012  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2013  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2014  .addReg(SrcReg)
2015  .addImm(AMDGPU::sub0)
2016  .addReg(UndefReg)
2017  .addImm(AMDGPU::sub1);
2018  I.eraseFromParent();
2019 
2020  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2021  RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2022  }
2023 
2024  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2025  // 64-bit should have been split up in RegBankSelect
2026 
2027  // Try to use an and with a mask if it will save code size.
2028  unsigned Mask;
2029  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2030  MachineInstr *ExtI =
2031  BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2032  .addImm(Mask)
2033  .addReg(SrcReg);
2034  I.eraseFromParent();
2035  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2036  }
2037 
2038  const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2039  MachineInstr *ExtI =
2040  BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2041  .addReg(SrcReg)
2042  .addImm(0) // Offset
2043  .addImm(SrcSize); // Width
2044  I.eraseFromParent();
2045  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2046  }
2047 
2048  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2049  const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2050  AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2051  if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2052  return false;
2053 
2054  if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2055  const unsigned SextOpc = SrcSize == 8 ?
2056  AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2057  BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2058  .addReg(SrcReg);
2059  I.eraseFromParent();
2060  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2061  }
2062 
2063  const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2064  const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2065 
2066  // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2067  if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2068  // We need a 64-bit register source, but the high bits don't matter.
2069  Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2070  Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2071  unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2072 
2073  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2074  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2075  .addReg(SrcReg, 0, SubReg)
2076  .addImm(AMDGPU::sub0)
2077  .addReg(UndefReg)
2078  .addImm(AMDGPU::sub1);
2079 
2080  BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2081  .addReg(ExtReg)
2082  .addImm(SrcSize << 16);
2083 
2084  I.eraseFromParent();
2085  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2086  }
2087 
2088  unsigned Mask;
2089  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2090  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2091  .addReg(SrcReg)
2092  .addImm(Mask);
2093  } else {
2094  BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2095  .addReg(SrcReg)
2096  .addImm(SrcSize << 16);
2097  }
2098 
2099  I.eraseFromParent();
2100  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2101  }
2102 
2103  return false;
2104 }
2105 
2106 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2107  MachineBasicBlock *BB = I.getParent();
2108  MachineOperand &ImmOp = I.getOperand(1);
2109  Register DstReg = I.getOperand(0).getReg();
2110  unsigned Size = MRI->getType(DstReg).getSizeInBits();
2111 
2112  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2113  if (ImmOp.isFPImm()) {
2114  const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2115  ImmOp.ChangeToImmediate(Imm.getZExtValue());
2116  } else if (ImmOp.isCImm()) {
2117  ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2118  } else {
2119  llvm_unreachable("Not supported by g_constants");
2120  }
2121 
2122  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2123  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2124 
2125  unsigned Opcode;
2126  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2127  Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2128  } else {
2129  Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2130 
2131  // We should never produce s1 values on banks other than VCC. If the user of
2132  // this already constrained the register, we may incorrectly think it's VCC
2133  // if it wasn't originally.
2134  if (Size == 1)
2135  return false;
2136  }
2137 
2138  if (Size != 64) {
2139  I.setDesc(TII.get(Opcode));
2140  I.addImplicitDefUseOperands(*MF);
2141  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2142  }
2143 
2144  const DebugLoc &DL = I.getDebugLoc();
2145 
2146  APInt Imm(Size, I.getOperand(1).getImm());
2147 
2148  MachineInstr *ResInst;
2149  if (IsSgpr && TII.isInlineConstant(Imm)) {
2150  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2151  .addImm(I.getOperand(1).getImm());
2152  } else {
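// Non-inline 64-bit constants are built from two 32-bit moves combined
// with a REG_SEQUENCE.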
2153  const TargetRegisterClass *RC = IsSgpr ?
2154  &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2155  Register LoReg = MRI->createVirtualRegister(RC);
2156  Register HiReg = MRI->createVirtualRegister(RC);
2157 
2158  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2159  .addImm(Imm.trunc(32).getZExtValue());
2160 
2161  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2162  .addImm(Imm.ashr(32).getZExtValue());
2163 
2164  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2165  .addReg(LoReg)
2166  .addImm(AMDGPU::sub0)
2167  .addReg(HiReg)
2168  .addImm(AMDGPU::sub1);
2169  }
2170 
2171  // We can't call constrainSelectedInstRegOperands here, because it doesn't
2172  // work for target independent opcodes
2173  I.eraseFromParent();
2174  const TargetRegisterClass *DstRC =
2175  TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2176  if (!DstRC)
2177  return true;
2178  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2179 }
2180 
2181 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2182  // Only manually handle the f64 SGPR case.
2183  //
2184  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2185  // the bit ops theoretically have a second result due to the implicit def of
2186  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2187  // that is easy by disabling the check. The result works, but uses a
2188  // nonsensical sreg32orlds_and_sreg_1 regclass.
2189  //
2190  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2191  // the variadic REG_SEQUENCE operands.
2192 
2193  Register Dst = MI.getOperand(0).getReg();
2194  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2195  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2196  MRI->getType(Dst) != LLT::scalar(64))
2197  return false;
2198 
2199  Register Src = MI.getOperand(1).getReg();
2200  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2201  if (Fabs)
2202  Src = Fabs->getOperand(1).getReg();
2203 
2204  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2205  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2206  return false;
2207 
2208  MachineBasicBlock *BB = MI.getParent();
2209  const DebugLoc &DL = MI.getDebugLoc();
2210  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2211  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2212  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2213  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2214 
2215  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2216  .addReg(Src, 0, AMDGPU::sub0);
2217  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2218  .addReg(Src, 0, AMDGPU::sub1);
2219  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2220  .addImm(0x80000000);
2221 
2222  // Set or toggle sign bit.
2223  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2224  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2225  .addReg(HiReg)
2226  .addReg(ConstReg);
2227  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2228  .addReg(LoReg)
2229  .addImm(AMDGPU::sub0)
2230  .addReg(OpReg)
2231  .addImm(AMDGPU::sub1);
2232  MI.eraseFromParent();
2233  return true;
2234 }
2235 
2236 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2237 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2238  Register Dst = MI.getOperand(0).getReg();
2239  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2240  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2241  MRI->getType(Dst) != LLT::scalar(64))
2242  return false;
2243 
2244  Register Src = MI.getOperand(1).getReg();
2245  MachineBasicBlock *BB = MI.getParent();
2246  const DebugLoc &DL = MI.getDebugLoc();
2247  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2248  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2249  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2250  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2251 
2252  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2253  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2254  return false;
2255 
2256  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2257  .addReg(Src, 0, AMDGPU::sub0);
2258  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2259  .addReg(Src, 0, AMDGPU::sub1);
2260  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2261  .addImm(0x7fffffff);
2262 
2263  // Clear sign bit.
2264  // TODO: Should this use S_BITSET0_*?
2265  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2266  .addReg(HiReg)
2267  .addReg(ConstReg);
2268  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2269  .addReg(LoReg)
2270  .addImm(AMDGPU::sub0)
2271  .addReg(OpReg)
2272  .addImm(AMDGPU::sub1);
2273 
2274  MI.eraseFromParent();
2275  return true;
2276 }
2277 
2278 static bool isConstant(const MachineInstr &MI) {
2279  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2280 }
2281 
2282 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2283  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2284 
2285  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2286 
2287  assert(PtrMI);
2288 
2289  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2290  return;
2291 
2292  GEPInfo GEPInfo(*PtrMI);
2293 
2294  for (unsigned i = 1; i != 3; ++i) {
2295  const MachineOperand &GEPOp = PtrMI->getOperand(i);
2296  const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2297  assert(OpDef);
2298  if (i == 2 && isConstant(*OpDef)) {
2299  // TODO: Could handle constant base + variable offset, but a combine
2300  // probably should have commuted it.
2301  assert(GEPInfo.Imm == 0);
2302  GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2303  continue;
2304  }
2305  const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2306  if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2307  GEPInfo.SgprParts.push_back(GEPOp.getReg());
2308  else
2309  GEPInfo.VgprParts.push_back(GEPOp.getReg());
2310  }
2311 
2312  AddrInfo.push_back(GEPInfo);
2313  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2314 }
2315 
2316 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2317  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2318 }
2319 
2320 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2321  if (!MI.hasOneMemOperand())
2322  return false;
2323 
2324  const MachineMemOperand *MMO = *MI.memoperands_begin();
2325  const Value *Ptr = MMO->getValue();
2326 
2327  // UndefValue means this is a load of a kernel input. These are uniform.
2328  // Sometimes LDS instructions have constant pointers.
2329  // If Ptr is null, then that means this mem operand contains a
2330  // PseudoSourceValue like GOT.
2331  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2332  isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2333  return true;
2334 
2335  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2336  return true;
2337 
2338  const Instruction *I = dyn_cast<Instruction>(Ptr);
2339  return I && I->getMetadata("amdgpu.uniform");
2340 }
2341 
2342 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2343  for (const GEPInfo &GEPInfo : AddrInfo) {
2344  if (!GEPInfo.VgprParts.empty())
2345  return true;
2346  }
2347  return false;
2348 }
2349 
2350 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2351  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2352  unsigned AS = PtrTy.getAddressSpace();
2353  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2354  STI.ldsRequiresM0Init()) {
2355  MachineBasicBlock *BB = I.getParent();
2356 
2357  // If DS instructions require M0 initialization, insert it before selecting.
2358  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2359  .addImm(-1);
2360  }
2361 }
2362 
2363 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2364  MachineInstr &I) const {
2365  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
2366  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2367  unsigned AS = PtrTy.getAddressSpace();
2368  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2369  return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
2370  }
2371 
2372  initM0(I);
2373  return selectImpl(I, *CoverageInfo);
2374 }
2375 
2376 // TODO: No rtn optimization.
2377 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2378  MachineInstr &MI) const {
2379  Register PtrReg = MI.getOperand(1).getReg();
2380  const LLT PtrTy = MRI->getType(PtrReg);
2381  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2382  STI.useFlatForGlobal())
2383  return selectImpl(MI, *CoverageInfo);
2384 
2385  Register DstReg = MI.getOperand(0).getReg();
2386  const LLT Ty = MRI->getType(DstReg);
2387  const bool Is64 = Ty.getSizeInBits() == 64;
2388  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2389  Register TmpReg = MRI->createVirtualRegister(
2390  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2391 
2392  const DebugLoc &DL = MI.getDebugLoc();
2393  MachineBasicBlock *BB = MI.getParent();
2394 
2395  Register VAddr, RSrcReg, SOffset;
2396  int64_t Offset = 0;
2397 
2398  unsigned Opcode;
2399  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2400  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2401  AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2402  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2403  RSrcReg, SOffset, Offset)) {
2404  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2405  AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2406  } else
2407  return selectImpl(MI, *CoverageInfo);
2408 
2409  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2410  .addReg(MI.getOperand(2).getReg());
2411 
2412  if (VAddr)
2413  MIB.addReg(VAddr);
2414 
2415  MIB.addReg(RSrcReg);
2416  if (SOffset)
2417  MIB.addReg(SOffset);
2418  else
2419  MIB.addImm(0);
2420 
2421  MIB.addImm(Offset);
2422  MIB.addImm(AMDGPU::CPol::GLC);
2423  MIB.cloneMemRefs(MI);
2424 
2425  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2426  .addReg(TmpReg, RegState::Kill, SubReg);
2427 
2428  MI.eraseFromParent();
2429 
2430  MRI->setRegClass(
2431  DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2432  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2433 }
2434 
2435 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2436  if (Reg.isPhysical())
2437  return false;
2438 
2439  MachineInstr &MI = *MRI.getVRegDef(Reg);
2440  const unsigned Opcode = MI.getOpcode();
2441 
2442  if (Opcode == AMDGPU::COPY)
2443  return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2444 
2445  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2446  Opcode == AMDGPU::G_XOR)
2447  return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2448  isVCmpResult(MI.getOperand(2).getReg(), MRI);
2449 
2450  if (Opcode == TargetOpcode::G_INTRINSIC)
2451  return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
2452 
2453  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2454 }
2455 
2456 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2457  MachineBasicBlock *BB = I.getParent();
2458  MachineOperand &CondOp = I.getOperand(0);
2459  Register CondReg = CondOp.getReg();
2460  const DebugLoc &DL = I.getDebugLoc();
2461 
2462  unsigned BrOpcode;
2463  Register CondPhysReg;
2464  const TargetRegisterClass *ConstrainRC;
2465 
2466  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2467  // whether the branch is uniform when selecting the instruction. In
2468  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2469  // RegBankSelect knows what it's doing if the branch condition is scc, even
2470  // though it currently does not.
2471  if (!isVCC(CondReg, *MRI)) {
2472  if (MRI->getType(CondReg) != LLT::scalar(32))
2473  return false;
2474 
2475  CondPhysReg = AMDGPU::SCC;
2476  BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2477  ConstrainRC = &AMDGPU::SReg_32RegClass;
2478  } else {
2479  // FIXME: Should scc->vcc copies be ANDed with exec?
2480 
2481  // Unless the value of CondReg is a result of a V_CMP* instruction then we
2482  // need to insert an and with exec.
2483  if (!isVCmpResult(CondReg, *MRI)) {
2484  const bool Is64 = STI.isWave64();
2485  const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2486  const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2487 
2488  Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2489  BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2490  .addReg(CondReg)
2491  .addReg(Exec);
2492  CondReg = TmpReg;
2493  }
2494 
2495  CondPhysReg = TRI.getVCC();
2496  BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2497  ConstrainRC = TRI.getBoolRC();
2498  }
2499 
2500  if (!MRI->getRegClassOrNull(CondReg))
2501  MRI->setRegClass(CondReg, ConstrainRC);
2502 
2503  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2504  .addReg(CondReg);
2505  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2506  .addMBB(I.getOperand(1).getMBB());
2507 
2508  I.eraseFromParent();
2509  return true;
2510 }
2511 
2512 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2513  MachineInstr &I) const {
2514  Register DstReg = I.getOperand(0).getReg();
2515  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2516  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2517  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2518  if (IsVGPR)
2519  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2520 
2521  return RBI.constrainGenericRegister(
2522  DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2523 }
2524 
2525 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2526  Register DstReg = I.getOperand(0).getReg();
2527  Register SrcReg = I.getOperand(1).getReg();
2528  Register MaskReg = I.getOperand(2).getReg();
2529  LLT Ty = MRI->getType(DstReg);
2530  LLT MaskTy = MRI->getType(MaskReg);
2531  MachineBasicBlock *BB = I.getParent();
2532  const DebugLoc &DL = I.getDebugLoc();
2533 
2534  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2535  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2536  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2537  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2538  if (DstRB != SrcRB) // Should only happen for hand written MIR.
2539  return false;
2540 
2541  // Try to avoid emitting a bit operation when we only need to touch half of
2542  // the 64-bit pointer.
2543  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2544  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2545  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2546 
2547  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2548  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2549 
2550  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2551  !CanCopyLow32 && !CanCopyHi32) {
2552  auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2553  .addReg(SrcReg)
2554  .addReg(MaskReg);
2555  I.eraseFromParent();
2556  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2557  }
2558 
2559  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2560  const TargetRegisterClass &RegRC
2561  = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2562 
2563  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2564  *MRI);
2565  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2566  *MRI);
2567  const TargetRegisterClass *MaskRC =
2568  TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2569 
2570  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2571  !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2572  !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2573  return false;
2574 
2575  if (Ty.getSizeInBits() == 32) {
2576  assert(MaskTy.getSizeInBits() == 32 &&
2577  "ptrmask should have been narrowed during legalize");
2578 
2579  BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2580  .addReg(SrcReg)
2581  .addReg(MaskReg);
2582  I.eraseFromParent();
2583  return true;
2584  }
2585 
2586  Register HiReg = MRI->createVirtualRegister(&RegRC);
2587  Register LoReg = MRI->createVirtualRegister(&RegRC);
2588 
2589  // Extract the subregisters from the source pointer.
2590  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2591  .addReg(SrcReg, 0, AMDGPU::sub0);
2592  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2593  .addReg(SrcReg, 0, AMDGPU::sub1);
2594 
2595  Register MaskedLo, MaskedHi;
2596 
2597  if (CanCopyLow32) {
2598  // If all the bits in the low half are 1, we only need a copy for it.
2599  MaskedLo = LoReg;
2600  } else {
2601  // Extract the mask subregister and apply the and.
2602  Register MaskLo = MRI->createVirtualRegister(&RegRC);
2603  MaskedLo = MRI->createVirtualRegister(&RegRC);
2604 
2605  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2606  .addReg(MaskReg, 0, AMDGPU::sub0);
2607  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2608  .addReg(LoReg)
2609  .addReg(MaskLo);
2610  }
2611 
2612  if (CanCopyHi32) {
2613  // If all the bits in the high half are 1, we only need a copy for it.
2614  MaskedHi = HiReg;
2615  } else {
2616  Register MaskHi = MRI->createVirtualRegister(&RegRC);
2617  MaskedHi = MRI->createVirtualRegister(&RegRC);
2618 
2619  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2620  .addReg(MaskReg, 0, AMDGPU::sub1);
2621  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2622  .addReg(HiReg)
2623  .addReg(MaskHi);
2624  }
2625 
2626  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2627  .addReg(MaskedLo)
2628  .addImm(AMDGPU::sub0)
2629  .addReg(MaskedHi)
2630  .addImm(AMDGPU::sub1);
2631  I.eraseFromParent();
2632  return true;
2633 }
2634 
2635 /// Return the register to use for the index value, and the subregister to use
2636 /// for the indirectly accessed register.
2637 static std::pair<Register, unsigned>
2638 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2639  const SIRegisterInfo &TRI,
2640  const TargetRegisterClass *SuperRC,
2641  Register IdxReg,
2642  unsigned EltSize) {
2643  Register IdxBaseReg;
2644  int Offset;
2645 
2646  std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2647  if (IdxBaseReg == AMDGPU::NoRegister) {
2648  // This will happen if the index is a known constant. This should ordinarily
2649  // be legalized out, but handle it as a register just in case.
2650  assert(Offset == 0);
2651  IdxBaseReg = IdxReg;
2652  }
2653 
2654  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2655 
2656  // Skip out of bounds offsets, or else we would end up using an undefined
2657  // register.
2658  if (static_cast<unsigned>(Offset) >= SubRegs.size())
2659  return std::make_pair(IdxReg, SubRegs[0]);
2660  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2661 }
2662 
2663 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2664  MachineInstr &MI) const {
2665  Register DstReg = MI.getOperand(0).getReg();
2666  Register SrcReg = MI.getOperand(1).getReg();
2667  Register IdxReg = MI.getOperand(2).getReg();
2668 
2669  LLT DstTy = MRI->getType(DstReg);
2670  LLT SrcTy = MRI->getType(SrcReg);
2671 
2672  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2673  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2674  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2675 
2676  // The index must be scalar. If it wasn't RegBankSelect should have moved this
2677  // into a waterfall loop.
2678  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2679  return false;
2680 
2681  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2682  *MRI);
2683  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2684  *MRI);
2685  if (!SrcRC || !DstRC)
2686  return false;
2687  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2688  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2689  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2690  return false;
2691 
2692  MachineBasicBlock *BB = MI.getParent();
2693  const DebugLoc &DL = MI.getDebugLoc();
2694  const bool Is64 = DstTy.getSizeInBits() == 64;
2695 
2696  unsigned SubReg;
2697  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2698  DstTy.getSizeInBits() / 8);
2699 
2700  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2701  if (DstTy.getSizeInBits() != 32 && !Is64)
2702  return false;
2703 
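// S_MOVRELS reads the dynamic index from M0.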
2704  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2705  .addReg(IdxReg);
2706 
2707  unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2708  BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2709  .addReg(SrcReg, 0, SubReg)
2710  .addReg(SrcReg, RegState::Implicit);
2711  MI.eraseFromParent();
2712  return true;
2713  }
2714 
2715  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2716  return false;
2717 
2718  if (!STI.useVGPRIndexMode()) {
2719  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2720  .addReg(IdxReg);
2721  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2722  .addReg(SrcReg, 0, SubReg)
2723  .addReg(SrcReg, RegState::Implicit);
2724  MI.eraseFromParent();
2725  return true;
2726  }
2727 
2728  const MCInstrDesc &GPRIDXDesc =
2729  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2730  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2731  .addReg(SrcReg)
2732  .addReg(IdxReg)
2733  .addImm(SubReg);
2734 
2735  MI.eraseFromParent();
2736  return true;
2737 }
2738 
2739 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2740 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2741  MachineInstr &MI) const {
2742  Register DstReg = MI.getOperand(0).getReg();
2743  Register VecReg = MI.getOperand(1).getReg();
2744  Register ValReg = MI.getOperand(2).getReg();
2745  Register IdxReg = MI.getOperand(3).getReg();
2746 
2747  LLT VecTy = MRI->getType(DstReg);
2748  LLT ValTy = MRI->getType(ValReg);
2749  unsigned VecSize = VecTy.getSizeInBits();
2750  unsigned ValSize = ValTy.getSizeInBits();
2751 
2752  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2753  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2754  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2755 
2756  assert(VecTy.getElementType() == ValTy);
2757 
2758  // The index must be scalar. If it wasn't RegBankSelect should have moved this
2759  // into a waterfall loop.
2760  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2761  return false;
2762 
2763  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2764  *MRI);
2765  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2766  *MRI);
2767 
2768  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2769  !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2770  !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2771  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2772  return false;
2773 
2774  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2775  return false;
2776 
2777  unsigned SubReg;
2778  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2779  ValSize / 8);
2780 
2781  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2782  STI.useVGPRIndexMode();
2783 
2784  MachineBasicBlock *BB = MI.getParent();
2785  const DebugLoc &DL = MI.getDebugLoc();
2786 
2787  if (!IndexMode) {
2788  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2789  .addReg(IdxReg);
2790 
2791  const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2792  VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2793  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2794  .addReg(VecReg)
2795  .addReg(ValReg)
2796  .addImm(SubReg);
2797  MI.eraseFromParent();
2798  return true;
2799  }
2800 
2801  const MCInstrDesc &GPRIDXDesc =
2802  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2803  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2804  .addReg(VecReg)
2805  .addReg(ValReg)
2806  .addReg(IdxReg)
2807  .addImm(SubReg);
2808 
2809  MI.eraseFromParent();
2810  return true;
2811 }
2812 
2813 static bool isZeroOrUndef(int X) {
2814  return X == 0 || X == -1;
2815 }
2816 
2817 static bool isOneOrUndef(int X) {
2818  return X == 1 || X == -1;
2819 }
2820 
2821 static bool isZeroOrOneOrUndef(int X) {
2822  return X == 0 || X == 1 || X == -1;
2823 }
2824 
2825 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2826 // 32-bit register.
2827 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2828  ArrayRef<int> Mask) {
2829  NewMask[0] = Mask[0];
2830  NewMask[1] = Mask[1];
2831  if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2832  return Src0;
2833 
2834  assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2835  assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2836 
2837  // Shift the mask inputs to be 0/1;
2838  NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2839  NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2840  return Src1;
2841 }
2842 
2843 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2844 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2845  MachineInstr &MI) const {
2846  Register DstReg = MI.getOperand(0).getReg();
2847  Register Src0Reg = MI.getOperand(1).getReg();
2848  Register Src1Reg = MI.getOperand(2).getReg();
2849  ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2850 
2851  const LLT V2S16 = LLT::fixed_vector(2, 16);
2852  if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2853  return false;
2854 
2855  if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2856  return false;
2857 
2858  assert(ShufMask.size() == 2);
2859  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2860 
2861  MachineBasicBlock *MBB = MI.getParent();
2862  const DebugLoc &DL = MI.getDebugLoc();
2863 
2864  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2865  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2866  const TargetRegisterClass &RC = IsVALU ?
2867  AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2868 
2869  // Handle the degenerate case which should have folded out.
2870  if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2871  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2872 
2873  MI.eraseFromParent();
2874  return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2875  }
2876 
2877  // A legal VOP3P mask only reads one of the sources.
2878  int Mask[2];
2879  Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2880 
2881  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2882  !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2883  return false;
2884 
2885  // TODO: This also should have been folded out
2886  if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2887  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2888  .addReg(SrcVec);
2889 
2890  MI.eraseFromParent();
2891  return true;
2892  }
2893 
2894  if (Mask[0] == 1 && Mask[1] == -1) {
2895  if (IsVALU) {
2896  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2897  .addImm(16)
2898  .addReg(SrcVec);
2899  } else {
2900  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2901  .addReg(SrcVec)
2902  .addImm(16);
2903  }
2904  } else if (Mask[0] == -1 && Mask[1] == 0) {
2905  if (IsVALU) {
2906  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2907  .addImm(16)
2908  .addReg(SrcVec);
2909  } else {
2910  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2911  .addReg(SrcVec)
2912  .addImm(16);
2913  }
2914  } else if (Mask[0] == 0 && Mask[1] == 0) {
2915  if (IsVALU) {
2916  // Write low half of the register into the high half.
2917  MachineInstr *MovSDWA =
2918  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2919  .addImm(0) // $src0_modifiers
2920  .addReg(SrcVec) // $src0
2921  .addImm(0) // $clamp
2922  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2923  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2924  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2925  .addReg(SrcVec, RegState::Implicit);
2926  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2927  } else {
2928  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2929  .addReg(SrcVec)
2930  .addReg(SrcVec);
2931  }
2932  } else if (Mask[0] == 1 && Mask[1] == 1) {
2933  if (IsVALU) {
2934  // Write high half of the register into the low half.
2935  MachineInstr *MovSDWA =
2936  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2937  .addImm(0) // $src0_modifiers
2938  .addReg(SrcVec) // $src0
2939  .addImm(0) // $clamp
2940  .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2941  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2942  .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2943  .addReg(SrcVec, RegState::Implicit);
2944  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2945  } else {
2946  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2947  .addReg(SrcVec)
2948  .addReg(SrcVec);
2949  }
2950  } else if (Mask[0] == 1 && Mask[1] == 0) {
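// Swapping the two halves is a 16-bit rotate.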
2951  if (IsVALU) {
2952  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2953  .addReg(SrcVec)
2954  .addReg(SrcVec)
2955  .addImm(16);
2956  } else {
2957  Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2958  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2959  .addReg(SrcVec)
2960  .addImm(16);
2961  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2962  .addReg(TmpReg)
2963  .addReg(SrcVec);
2964  }
2965  } else
2966  llvm_unreachable("all shuffle masks should be handled");
2967 
2968  MI.eraseFromParent();
2969  return true;
2970 }
2971 
2972 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2973  MachineInstr &MI) const {
2974  if (STI.hasGFX90AInsts())
2975  return selectImpl(MI, *CoverageInfo);
2976 
2977  MachineBasicBlock *MBB = MI.getParent();
2978  const DebugLoc &DL = MI.getDebugLoc();
2979 
2980  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2981  Function &F = MBB->getParent()->getFunction();
2982  DiagnosticInfoUnsupported
2983  NoFpRet(F, "return versions of fp atomics not supported",
2984  MI.getDebugLoc(), DS_Error);
2985  F.getContext().diagnose(NoFpRet);
2986  return false;
2987  }
2988 
2989  // FIXME: This is only needed because tablegen requires number of dst operands
2990  // in match and replace pattern to be the same. Otherwise patterns can be
2991  // exported from SDag path.
2992  MachineOperand &VDataIn = MI.getOperand(1);
2993  MachineOperand &VIndex = MI.getOperand(3);
2994  MachineOperand &VOffset = MI.getOperand(4);
2995  MachineOperand &SOffset = MI.getOperand(5);
2996  int16_t Offset = MI.getOperand(6).getImm();
2997 
2998  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2999  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
3000 
3001  unsigned Opcode;
3002  if (HasVOffset) {
3003  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
3004  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
3005  } else {
3006  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
3007  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
3008  }
3009 
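// Vector (v2f16) data uses the packed f16 atomic add variants.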
3010  if (MRI->getType(VDataIn.getReg()).isVector()) {
3011  switch (Opcode) {
3012  case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
3013  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
3014  break;
3015  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
3016  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
3017  break;
3018  case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
3019  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
3020  break;
3021  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
3022  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
3023  break;
3024  }
3025  }
3026 
3027  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
3028  I.add(VDataIn);
3029 
3030  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
3031  Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
3032  Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3033  BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3034  .addReg(VIndex.getReg())
3035  .addImm(AMDGPU::sub0)
3036  .addReg(VOffset.getReg())
3037  .addImm(AMDGPU::sub1);
3038 
3039  I.addReg(IdxReg);
3040  } else if (HasVIndex) {
3041  I.add(VIndex);
3042  } else if (HasVOffset) {
3043  I.add(VOffset);
3044  }
3045 
3046  I.add(MI.getOperand(2)); // rsrc
3047  I.add(SOffset);
3048  I.addImm(Offset);
3049  I.addImm(MI.getOperand(7).getImm()); // cpol
3050  I.cloneMemRefs(MI);
3051 
3052  MI.eraseFromParent();
3053 
3054  return true;
3055 }
3056 
3057 bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
3058  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
3059 
3060  if (STI.hasGFX90AInsts()) {
3061  // gfx90a adds return versions of the global atomic fadd instructions so no
3062  // special handling is required.
3063  return selectImpl(MI, *CoverageInfo);
3064  }
3065 
3066  MachineBasicBlock *MBB = MI.getParent();
3067  const DebugLoc &DL = MI.getDebugLoc();
3068 
3069  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3070  Function &F = MBB->getParent()->getFunction();
3071  DiagnosticInfoUnsupported
3072  NoFpRet(F, "return versions of fp atomics not supported",
3073  MI.getDebugLoc(), DS_Error);
3074  F.getContext().diagnose(NoFpRet);
3075  return false;
3076  }
3077 
3078  // FIXME: This is only needed because tablegen requires number of dst operands
3079  // in match and replace pattern to be the same. Otherwise patterns can be
3080  // exported from SDag path.
3081  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
3082 
3083  Register Data = DataOp.getReg();
3084  const unsigned Opc = MRI->getType(Data).isVector() ?
3085  AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3086  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3087  .addReg(Addr.first)
3088  .addReg(Data)
3089  .addImm(Addr.second)
3090  .addImm(0) // cpol
3091  .cloneMemRefs(MI);
3092 
3093  MI.eraseFromParent();
3094  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3095 }
3096 
3097 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3098  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3099  MI.RemoveOperand(1);
3100  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3101  return true;
3102 }
3103 
3104 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3105  Register DstReg = MI.getOperand(0).getReg();
3106  Register SrcReg = MI.getOperand(1).getReg();
3107  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3108  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3109  MachineBasicBlock *MBB = MI.getParent();
3110  const DebugLoc &DL = MI.getDebugLoc();
3111 
3112  if (IsVALU) {
3113  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3114  .addImm(Subtarget->getWavefrontSizeLog2())
3115  .addReg(SrcReg);
3116  } else {
3117  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3118  .addReg(SrcReg)
3119  .addImm(Subtarget->getWavefrontSizeLog2());
3120  }
3121 
3122  const TargetRegisterClass &RC =
3123  IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3124  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3125  return false;
3126 
3127  MI.eraseFromParent();
3128  return true;
3129 }
3130 
3131 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3132  if (I.isPHI())
3133  return selectPHI(I);
3134 
3135  if (!I.isPreISelOpcode()) {
3136  if (I.isCopy())
3137  return selectCOPY(I);
3138  return true;
3139  }
3140 
3141  switch (I.getOpcode()) {
3142  case TargetOpcode::G_AND:
3143  case TargetOpcode::G_OR:
3144  case TargetOpcode::G_XOR:
3145  if (selectImpl(I, *CoverageInfo))
3146  return true;
3147  return selectG_AND_OR_XOR(I);
3148  case TargetOpcode::G_ADD:
3149  case TargetOpcode::G_SUB:
3150  if (selectImpl(I, *CoverageInfo))
3151  return true;
3152  return selectG_ADD_SUB(I);
3153  case TargetOpcode::G_UADDO:
3154  case TargetOpcode::G_USUBO:
3155  case TargetOpcode::G_UADDE:
3156  case TargetOpcode::G_USUBE:
3157  return selectG_UADDO_USUBO_UADDE_USUBE(I);
3158  case TargetOpcode::G_INTTOPTR:
3159  case TargetOpcode::G_BITCAST:
3160  case TargetOpcode::G_PTRTOINT:
3161  return selectCOPY(I);
3162  case TargetOpcode::G_CONSTANT:
3163  case TargetOpcode::G_FCONSTANT:
3164  return selectG_CONSTANT(I);
3165  case TargetOpcode::G_FNEG:
3166  if (selectImpl(I, *CoverageInfo))
3167  return true;
3168  return selectG_FNEG(I);
3169  case TargetOpcode::G_FABS:
3170  if (selectImpl(I, *CoverageInfo))
3171  return true;
3172  return selectG_FABS(I);
3173  case TargetOpcode::G_EXTRACT:
3174  return selectG_EXTRACT(I);
3175  case TargetOpcode::G_MERGE_VALUES:
3176  case TargetOpcode::G_BUILD_VECTOR:
3177  case TargetOpcode::G_CONCAT_VECTORS:
3178  return selectG_MERGE_VALUES(I);
3179  case TargetOpcode::G_UNMERGE_VALUES:
3180  return selectG_UNMERGE_VALUES(I);
3181  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3182  return selectG_BUILD_VECTOR_TRUNC(I);
3183  case TargetOpcode::G_PTR_ADD:
3184  return selectG_PTR_ADD(I);
3185  case TargetOpcode::G_IMPLICIT_DEF:
3186  return selectG_IMPLICIT_DEF(I);
3187  case TargetOpcode::G_FREEZE:
3188  return selectCOPY(I);
3189  case TargetOpcode::G_INSERT:
3190  return selectG_INSERT(I);
3191  case TargetOpcode::G_INTRINSIC:
3192  return selectG_INTRINSIC(I);
3193  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3194  return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3195  case TargetOpcode::G_ICMP:
3196  if (selectG_ICMP(I))
3197  return true;
3198  return selectImpl(I, *CoverageInfo);
3199  case TargetOpcode::G_LOAD:
3200  case TargetOpcode::G_STORE:
3201  case TargetOpcode::G_ATOMIC_CMPXCHG:
3202  case TargetOpcode::G_ATOMICRMW_XCHG:
3203  case TargetOpcode::G_ATOMICRMW_ADD:
3204  case TargetOpcode::G_ATOMICRMW_SUB:
3205  case TargetOpcode::G_ATOMICRMW_AND:
3206  case TargetOpcode::G_ATOMICRMW_OR:
3207  case TargetOpcode::G_ATOMICRMW_XOR:
3208  case TargetOpcode::G_ATOMICRMW_MIN:
3209  case TargetOpcode::G_ATOMICRMW_MAX:
3210  case TargetOpcode::G_ATOMICRMW_UMIN:
3211  case TargetOpcode::G_ATOMICRMW_UMAX:
3212  case TargetOpcode::G_ATOMICRMW_FADD:
3213  case AMDGPU::G_AMDGPU_ATOMIC_INC:
3214  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3215  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3216  case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3217  return selectG_LOAD_STORE_ATOMICRMW(I);
3218  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3219  return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3220  case TargetOpcode::G_SELECT:
3221  return selectG_SELECT(I);
3222  case TargetOpcode::G_TRUNC:
3223  return selectG_TRUNC(I);
3224  case TargetOpcode::G_SEXT:
3225  case TargetOpcode::G_ZEXT:
3226  case TargetOpcode::G_ANYEXT:
3227  case TargetOpcode::G_SEXT_INREG:
3228  if (selectImpl(I, *CoverageInfo))
3229  return true;
3230  return selectG_SZA_EXT(I);
3231  case TargetOpcode::G_BRCOND:
3232  return selectG_BRCOND(I);
3233  case TargetOpcode::G_GLOBAL_VALUE:
3234  return selectG_GLOBAL_VALUE(I);
3235  case TargetOpcode::G_PTRMASK:
3236  return selectG_PTRMASK(I);
3237  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3238  return selectG_EXTRACT_VECTOR_ELT(I);
3239  case TargetOpcode::G_INSERT_VECTOR_ELT:
3240  return selectG_INSERT_VECTOR_ELT(I);
3241  case TargetOpcode::G_SHUFFLE_VECTOR:
3242  return selectG_SHUFFLE_VECTOR(I);
3243  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3244  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3245  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3246  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3247  const AMDGPU::ImageDimIntrinsicInfo *Intr
3248  = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3249  assert(Intr && "not an image intrinsic with image pseudo");
3250  return selectImageIntrinsic(I, Intr);
3251  }
3252  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3253  return selectBVHIntrinsic(I);
3254  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3255  return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3256  case AMDGPU::G_SBFX:
3257  case AMDGPU::G_UBFX:
3258  return selectG_SBFX_UBFX(I);
3259  case AMDGPU::G_SI_CALL:
3260  I.setDesc(TII.get(AMDGPU::SI_CALL));
3261  return true;
3262  case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3263  return selectWaveAddress(I);
3264  default:
3265  return selectImpl(I, *CoverageInfo);
3266  }
3267  return false;
3268 }
3269 
3270 InstructionSelector::ComplexRendererFns
3271 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3272  return {{
3273  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3274  }};
3275 
3276 }
3277 
3278 std::pair<Register, unsigned>
3279 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3280  bool AllowAbs) const {
3281  Register Src = Root.getReg();
3282  Register OrigSrc = Src;
3283  unsigned Mods = 0;
3284  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3285 
3286  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3287  Src = MI->getOperand(1).getReg();
3288  Mods |= SISrcMods::NEG;
3289  MI = getDefIgnoringCopies(Src, *MRI);
3290  }
3291 
3292  if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3293  Src = MI->getOperand(1).getReg();
3294  Mods |= SISrcMods::ABS;
3295  }
3296 
3297  if (Mods != 0 &&
3298  RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3299  MachineInstr *UseMI = Root.getParent();
3300 
3301  // If we looked through copies to find source modifiers on an SGPR operand,
3302  // we now have an SGPR register source. To avoid potentially violating the
3303  // constant bus restriction, we need to insert a copy to a VGPR.
3304  Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3305  BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3306  TII.get(AMDGPU::COPY), VGPRSrc)
3307  .addReg(Src);
3308  Src = VGPRSrc;
3309  }
3310 
3311  return std::make_pair(Src, Mods);
3312 }
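// --- Editorial sketch (not part of the upstream file) ---
// Standalone model of the modifier folding in selectVOP3ModsImpl: peeling a
// G_FNEG sets the NEG bit and a following G_FABS (when allowed) sets the ABS
// bit, so the selected operand is the innermost source plus a mods mask. The
// type and bit values below are placeholders, not the SISrcMods encoding.
namespace vop3_mods_sketch {
enum : unsigned { NEG = 1u << 0, ABS = 1u << 1 };
struct Node { const Node *Src; bool IsFNeg; bool IsFAbs; };

static const Node *foldMods(const Node *N, unsigned &Mods, bool AllowAbs) {
  Mods = 0;
  if (N && N->IsFNeg) {               // fneg(x) -> x, record NEG
    Mods |= NEG;
    N = N->Src;
  }
  if (AllowAbs && N && N->IsFAbs) {   // fabs(x) -> x, record ABS
    Mods |= ABS;
    N = N->Src;
  }
  return N;
}
} // namespace vop3_mods_sketch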
3313 
3314 ///
3315 /// This will select either an SGPR or VGPR operand and will save us from
3316 /// having to write an extra tablegen pattern.
3317 InstructionSelector::ComplexRendererFns
3318 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3319  return {{
3320  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3321  }};
3322 }
3323 
3324 InstructionSelector::ComplexRendererFns
3325 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3326  Register Src;
3327  unsigned Mods;
3328  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3329 
3330  return {{
3331  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3332  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3333  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3334  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3335  }};
3336 }
3337 
3338 InstructionSelector::ComplexRendererFns
3339 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3340  Register Src;
3341  unsigned Mods;
3342  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3343 
3344  return {{
3345  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3346  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3347  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3348  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3349  }};
3350 }
3351 
3352 InstructionSelector::ComplexRendererFns
3353 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3354  return {{
3355  [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3356  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3357  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3358  }};
3359 }
3360 
3361 InstructionSelector::ComplexRendererFns
3362 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3363  Register Src;
3364  unsigned Mods;
3365  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3366 
3367  return {{
3368  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3369  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3370  }};
3371 }
3372 
3373 InstructionSelector::ComplexRendererFns
3374 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3375  Register Src;
3376  unsigned Mods;
3377  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3378 
3379  return {{
3380  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3381  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3382  }};
3383 }
3384 
3385 InstructionSelector::ComplexRendererFns
3386 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3387  Register Reg = Root.getReg();
3388  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3389  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3390  Def->getOpcode() == AMDGPU::G_FABS))
3391  return {};
3392  return {{
3393  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3394  }};
3395 }
3396 
3397 std::pair<Register, unsigned>
3398 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3399  Register Src, const MachineRegisterInfo &MRI) const {
3400  unsigned Mods = 0;
3401  MachineInstr *MI = MRI.getVRegDef(Src);
3402 
3403  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3404  // It's possible to see an f32 fneg here, but unlikely.
3405  // TODO: Treat f32 fneg as only high bit.
3406  MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3407  Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3408  Src = MI->getOperand(1).getReg();
3409  MI = MRI.getVRegDef(Src);
3410  }
3411 
3412  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3413 
3414  // Packed instructions do not have abs modifiers.
3415  Mods |= SISrcMods::OP_SEL_1;
3416 
3417  return std::make_pair(Src, Mods);
3418 }
3419 
3420 InstructionSelector::ComplexRendererFns
3421 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3422  MachineRegisterInfo &MRI
3423  = Root.getParent()->getParent()->getParent()->getRegInfo();
3424 
3425  Register Src;
3426  unsigned Mods;
3427  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3428 
3429  return {{
3430  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3431  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3432  }};
3433 }
3434 
3435 InstructionSelector::ComplexRendererFns
3436 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3437  Register Src;
3438  unsigned Mods;
3439  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3440  if (!isKnownNeverNaN(Src, *MRI))
3441  return None;
3442 
3443  return {{
3444  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3445  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3446  }};
3447 }
3448 
3449 InstructionSelector::ComplexRendererFns
3450 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3451  // FIXME: Handle op_sel
3452  return {{
3453  [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3454  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3455  }};
3456 }
3457 
3458 InstructionSelector::ComplexRendererFns
3459 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3460  SmallVector<GEPInfo, 4> AddrInfo;
3461  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3462 
3463  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3464  return None;
3465 
3466  const GEPInfo &GEPInfo = AddrInfo[0];
3467  Optional<int64_t> EncodedImm =
3468  AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3469  if (!EncodedImm)
3470  return None;
3471 
3472  unsigned PtrReg = GEPInfo.SgprParts[0];
3473  return {{
3474  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3475  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3476  }};
3477 }
3478 
3479 InstructionSelector::ComplexRendererFns
3480 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3481  SmallVector<GEPInfo, 4> AddrInfo;
3482  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3483 
3484  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3485  return None;
3486 
3487  const GEPInfo &GEPInfo = AddrInfo[0];
3488  Register PtrReg = GEPInfo.SgprParts[0];
3489  Optional<int64_t> EncodedImm =
3490  AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3491  if (!EncodedImm)
3492  return None;
3493 
3494  return {{
3495  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3496  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3497  }};
3498 }
3499 
3500 InstructionSelector::ComplexRendererFns
3501 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3502  MachineInstr *MI = Root.getParent();
3503  MachineBasicBlock *MBB = MI->getParent();
3504 
3505  SmallVector<GEPInfo, 4> AddrInfo;
3506  getAddrModeInfo(*MI, *MRI, AddrInfo);
3507 
3508  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3509  // then we can select all ptr + 32-bit offsets not just immediate offsets.
3510  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3511  return None;
3512 
3513  const GEPInfo &GEPInfo = AddrInfo[0];
3514  // SGPR offset is unsigned.
3515  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3516  return None;
3517 
3518  // If we make it this far we have a load with an 32-bit immediate offset.
3519  // It is OK to select this using a sgpr offset, because we have already
3520  // failed trying to select this load into one of the _IMM variants since
3521  // the _IMM Patterns are considered before the _SGPR patterns.
3522  Register PtrReg = GEPInfo.SgprParts[0];
3523  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3524  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3525  .addImm(GEPInfo.Imm);
3526  return {{
3527  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3528  [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3529  }};
3530 }
3531 
3532 std::pair<Register, int>
3533 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3534  uint64_t FlatVariant) const {
3535  MachineInstr *MI = Root.getParent();
3536 
3537  auto Default = std::make_pair(Root.getReg(), 0);
3538 
3539  if (!STI.hasFlatInstOffsets())
3540  return Default;
3541 
3542  Register PtrBase;
3543  int64_t ConstOffset;
3544  std::tie(PtrBase, ConstOffset) =
3545  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3546  if (ConstOffset == 0)
3547  return Default;
3548 
3549  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3550  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3551  return Default;
3552 
3553  return std::make_pair(PtrBase, ConstOffset);
3554 }
3555 
3556 InstructionSelector::ComplexRendererFns
3557 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3558  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3559 
3560  return {{
3561  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3562  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3563  }};
3564 }
3565 
3566 InstructionSelector::ComplexRendererFns
3567 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3568  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3569 
3570  return {{
3571  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3572  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3573  }};
3574 }
3575 
3576 InstructionSelector::ComplexRendererFns
3577 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3578  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3579 
3580  return {{
3581  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3582  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3583  }};
3584 }
3585 
3586 /// Match a zero extend from a 32-bit value to 64-bits.
3587 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3588  Register ZExtSrc;
3589  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3590  return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3591 
3592  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3594  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3595  return false;
3596 
3597  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3598  return Def->getOperand(1).getReg();
3599  }
3600 
3601  return Register();
3602 }
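// --- Editorial sketch (not part of the upstream file) ---
// matchZeroExtendFromS32 accepts either G_ZEXT of an s32 value or its
// legalized form G_MERGE_VALUES(x, 0). The two are interchangeable because a
// zero extend simply places the 32-bit value in the low half and zero in the
// high half, as this scalar model shows:
static unsigned long long zext32To64Sketch(unsigned Lo32) {
  unsigned long long FromZext = (unsigned long long)Lo32;              // G_ZEXT
  unsigned long long FromMerge = ((unsigned long long)0 << 32) | Lo32; // merge(x, 0)
  return FromZext == FromMerge ? FromZext : ~0ull;                     // always equal
}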
3603 
3604 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3605 InstructionSelector::ComplexRendererFns
3606 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3607  Register Addr = Root.getReg();
3608  Register PtrBase;
3609  int64_t ConstOffset;
3610  int64_t ImmOffset = 0;
3611 
3612  // Match the immediate offset first, which canonically is moved as low as
3613  // possible.
3614  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3615 
3616  if (ConstOffset != 0) {
3617  if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3618  SIInstrFlags::FlatGlobal)) {
3619  Addr = PtrBase;
3620  ImmOffset = ConstOffset;
3621  } else {
3622  auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3623  if (!PtrBaseDef)
3624  return None;
3625 
3626  if (isSGPR(PtrBaseDef->Reg)) {
3627  if (ConstOffset > 0) {
3628  // Offset is too large.
3629  //
3630  // saddr + large_offset -> saddr +
3631  // (voffset = large_offset & ~MaxOffset) +
3632  // (large_offset & MaxOffset);
3633  int64_t SplitImmOffset, RemainderOffset;
3634  std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3635  ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3636 
3637  if (isUInt<32>(RemainderOffset)) {
3638  MachineInstr *MI = Root.getParent();
3639  MachineBasicBlock *MBB = MI->getParent();
3640  Register HighBits =
3641  MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3642 
3643  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3644  HighBits)
3645  .addImm(RemainderOffset);
3646 
3647  return {{
3648  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3649  [=](MachineInstrBuilder &MIB) {
3650  MIB.addReg(HighBits);
3651  }, // voffset
3652  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3653  }};
3654  }
3655  }
3656 
3657  // We are adding a 64 bit SGPR and a constant. If constant bus limit
3658  // is 1 we would need to perform 1 or 2 extra moves for each half of
3659  // the constant and it is better to do a scalar add and then issue a
3660  // single VALU instruction to materialize zero. Otherwise it is less
3661  // instructions to perform VALU adds with immediates or inline literals.
3662  unsigned NumLiterals =
3663  !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
3664  !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
3665  if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
3666  return None;
3667  }
3668  }
3669  }
3670 
3671  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3672  if (!AddrDef)
3673  return None;
3674 
3675  // Match the variable offset.
3676  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3677  // Look through the SGPR->VGPR copy.
3678  Register SAddr =
3679  getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3680 
3681  if (SAddr && isSGPR(SAddr)) {
3682  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3683 
3684  // It's possible voffset is an SGPR here, but the copy to VGPR will be
3685  // inserted later.
3686  if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3687  return {{[=](MachineInstrBuilder &MIB) { // saddr
3688  MIB.addReg(SAddr);
3689  },
3690  [=](MachineInstrBuilder &MIB) { // voffset
3691  MIB.addReg(VOffset);
3692  },
3693  [=](MachineInstrBuilder &MIB) { // offset
3694  MIB.addImm(ImmOffset);
3695  }}};
3696  }
3697  }
3698  }
3699 
3700  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3701  // drop this.
3702  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3703  AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
3704  return None;
3705 
3706  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3707  // moves required to copy a 64-bit SGPR to VGPR.
3708  MachineInstr *MI = Root.getParent();
3709  MachineBasicBlock *MBB = MI->getParent();
3710  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3711 
3712  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3713  .addImm(0);
3714 
3715  return {{
3716  [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
3717  [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3718  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3719  }};
3720 }
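// --- Editorial sketch (not part of the upstream file) ---
// Model of the saddr + large_offset split performed above: the bits that fit
// the immediate field stay as the instruction offset and the remainder is
// materialized into voffset with V_MOV_B32. The field width is subtarget
// dependent; MaxImmMask below is an illustrative stand-in for that limit.
static void splitLargeOffsetSketch(unsigned long long LargeOffset,
                                   unsigned long long MaxImmMask,
                                   unsigned long long &ImmPart,
                                   unsigned long long &VOffsetPart) {
  ImmPart = LargeOffset & MaxImmMask;       // kept as the immediate offset
  VOffsetPart = LargeOffset & ~MaxImmMask;  // moved into the voffset register
  // ImmPart + VOffsetPart == LargeOffset, so the address is unchanged.
}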
3721 
3722 InstructionSelector::ComplexRendererFns
3723 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3724  Register Addr = Root.getReg();
3725  Register PtrBase;
3726  int64_t ConstOffset;
3727  int64_t ImmOffset = 0;
3728 
3729  // Match the immediate offset first, which canonically is moved as low as
3730  // possible.
3731  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3732 
3733  if (ConstOffset != 0 &&
3734  TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
3735  SIInstrFlags::FlatScratch)) {
3736  Addr = PtrBase;
3737  ImmOffset = ConstOffset;
3738  }
3739 
3740  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3741  if (!AddrDef)
3742  return None;
3743 
3744  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3745  int FI = AddrDef->MI->getOperand(1).getIndex();
3746  return {{
3747  [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3748  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3749  }};
3750  }
3751 
3752  Register SAddr = AddrDef->Reg;
3753 
3754  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3755  Register LHS = AddrDef->MI->getOperand(1).getReg();
3756  Register RHS = AddrDef->MI->getOperand(2).getReg();
3757  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3758  auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3759 
3760  if (LHSDef && RHSDef &&
3761  LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3762  isSGPR(RHSDef->Reg)) {
3763  int FI = LHSDef->MI->getOperand(1).getIndex();
3764  MachineInstr &I = *Root.getParent();
3765  MachineBasicBlock *BB = I.getParent();
3766  const DebugLoc &DL = I.getDebugLoc();
3767  SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3768 
3769  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
3770  .addFrameIndex(FI)
3771  .addReg(RHSDef->Reg);
3772  }
3773  }
3774 
3775  if (!isSGPR(SAddr))
3776  return None;
3777 
3778  return {{
3779  [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3780  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3781  }};
3782 }
3783 
3784 InstructionSelector::ComplexRendererFns
3785 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3786  MachineInstr *MI = Root.getParent();
3787  MachineBasicBlock *MBB = MI->getParent();
3788  MachineFunction *MF = MBB->getParent();
3789  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3790 
3791  int64_t Offset = 0;
3792  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3793  !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
3794  Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3795 
3796  // TODO: Should this be inside the render function? The iterator seems to
3797  // move.
3798  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3799  HighBits)
3800  .addImm(Offset & ~4095);
3801 
3802  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3803  MIB.addReg(Info->getScratchRSrcReg());
3804  },
3805  [=](MachineInstrBuilder &MIB) { // vaddr
3806  MIB.addReg(HighBits);
3807  },
3808  [=](MachineInstrBuilder &MIB) { // soffset
3809  // Use constant zero for soffset and rely on eliminateFrameIndex
3810  // to choose the appropriate frame register if need be.
3811  MIB.addImm(0);
3812  },
3813  [=](MachineInstrBuilder &MIB) { // offset
3814  MIB.addImm(Offset & 4095);
3815  }}};
3816  }
3817 
3818  assert(Offset == 0 || Offset == -1);
3819 
3820  // Try to fold a frame index directly into the MUBUF vaddr field, and any
3821  // offsets.
3822  Optional<int> FI;
3823  Register VAddr = Root.getReg();
3824  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3825  Register PtrBase;
3826  int64_t ConstOffset;
3827  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
3828  if (ConstOffset != 0) {
3829  if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
3830  (!STI.privateMemoryResourceIsRangeChecked() ||
3831  KnownBits->signBitIsZero(PtrBase))) {
3832  const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
3833  if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3834  FI = PtrBaseDef->getOperand(1).getIndex();
3835  else
3836  VAddr = PtrBase;
3837  Offset = ConstOffset;
3838  }
3839  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3840  FI = RootDef->getOperand(1).getIndex();
3841  }
3842  }
3843 
3844  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3845  MIB.addReg(Info->getScratchRSrcReg());
3846  },
3847  [=](MachineInstrBuilder &MIB) { // vaddr
3848  if (FI.hasValue())
3849  MIB.addFrameIndex(FI.getValue());
3850  else
3851  MIB.addReg(VAddr);
3852  },
3853  [=](MachineInstrBuilder &MIB) { // soffset
3854  // Use constant zero for soffset and rely on eliminateFrameIndex
3855  // to choose the appropriate frame register if need be.
3856  MIB.addImm(0);
3857  },
3858  [=](MachineInstrBuilder &MIB) { // offset
3859  MIB.addImm(Offset);
3860  }}};
3861 }
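// --- Editorial sketch (not part of the upstream file) ---
// selectMUBUFScratchOffen splits a constant address with the 4095 mask used
// above: the low 12 bits ride in the MUBUF immediate offset field and the
// remaining high bits are moved into vaddr via V_MOV_B32. A scalar
// restatement of that split (helper name is illustrative only):
static void mubufConstSplitSketch(long long Offset, long long &VAddrHighBits,
                                  long long &ImmOffset) {
  VAddrHighBits = Offset & ~4095;   // materialized into vaddr
  ImmOffset = Offset & 4095;        // fits the immediate offset field
  // VAddrHighBits + ImmOffset reassembles the original Offset.
}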
3862 
3863 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3864  int64_t Offset) const {
3865  if (!isUInt<16>(Offset))
3866  return false;
3867 
3868  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3869  return true;
3870 
3871  // On Southern Islands, instructions with a negative base value and an
3872  // offset don't seem to work.
3873  return KnownBits->signBitIsZero(Base);
3874 }
3875 
3876 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3877  int64_t Offset1,
3878  unsigned Size) const {
3879  if (Offset0 % Size != 0 || Offset1 % Size != 0)
3880  return false;
3881  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3882  return false;
3883 
3884  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3885  return true;
3886 
3887  // On Southern Islands, instructions with a negative base value and an
3888  // offset don't seem to work.
3889  return KnownBits->signBitIsZero(Base);
3890 }
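// --- Editorial sketch (not part of the upstream file) ---
// isDSOffset2Legal above encodes the two-offset DS constraint: both offsets
// must be multiples of the element size and, once divided by it, fit in an
// 8-bit field, so each can reach at most 255 * Size bytes. A standalone
// restatement of that check:
static bool dsOffset2LegalSketch(long long Offset0, long long Offset1,
                                 long long Size) {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;                       // must be element-size aligned
  long long Enc0 = Offset0 / Size;
  long long Enc1 = Offset1 / Size;
  return Enc0 >= 0 && Enc0 <= 255 &&    // each encoded offset is 8 bits wide
         Enc1 >= 0 && Enc1 <= 255;
}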
3891 
3892 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
3893  unsigned ShAmtBits) const {
3894  assert(MI.getOpcode() == TargetOpcode::G_AND);
3895 
3896  Optional<APInt> RHS = getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
3897  if (!RHS)
3898  return false;
3899 
3900  if (RHS->countTrailingOnes() >= ShAmtBits)
3901  return true;
3902 
3903  const APInt &LHSKnownZeros =
3904  KnownBits->getKnownZeroes(MI.getOperand(1).getReg());
3905  return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
3906 }
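// --- Editorial sketch (not part of the upstream file) ---
// isUnneededShiftMask relies on the shift reading only the low ShAmtBits bits
// of its amount operand, so an AND whose mask has at least that many trailing
// ones (or whose remaining bits are already known zero) cannot change the
// result. For a 32-bit shift, where ShAmtBits == 5:
static bool shiftMaskIsRedundantSketch(unsigned Amt, unsigned Mask) {
  unsigned Used = Amt & 31;                    // bits the shifter consumes
  unsigned MaskedThenUsed = (Amt & Mask) & 31; // same bits after the G_AND
  // Whenever Mask ends in at least five one bits (31, 63, 0xff, ...), the two
  // values agree for every Amt, which is why the G_AND can be dropped.
  return Used == MaskedThenUsed;
}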
3907 
3908 InstructionSelector::ComplexRendererFns
3909 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3910  MachineOperand &Root) const {
3911  MachineInstr *MI = Root.getParent();
3912  MachineBasicBlock *MBB = MI->getParent();
3913 
3914  int64_t Offset = 0;
3915  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3916  !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3917  return {};
3918 
3919  const MachineFunction *MF = MBB->getParent();
3920  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3921 
3922  return {{
3923  [=](MachineInstrBuilder &MIB) { // rsrc
3924  MIB.addReg(Info->getScratchRSrcReg());
3925  },
3926  [=](MachineInstrBuilder &MIB) { // soffset
3927  MIB.addImm(0);
3928  },
3929  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3930  }};
3931 }
3932 
3933 std::pair<Register, unsigned>
3934 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3935  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3936  if (!RootDef)
3937  return std::make_pair(Root.getReg(), 0);
3938 
3939  int64_t ConstAddr = 0;
3940 
3941  Register PtrBase;
3942  int64_t Offset;
3943  std::tie(PtrBase, Offset) =
3944  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3945 
3946  if (Offset) {
3947  if (isDSOffsetLegal(PtrBase, Offset)) {
3948  // (add n0, c0)
3949  return std::make_pair(PtrBase, Offset);
3950  }
3951  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3952  // TODO
3953 
3954 
3955  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3956  // TODO
3957 
3958  }
3959 
3960  return std::make_pair(Root.getReg(), 0);
3961 }
3962 
3963 InstructionSelector::ComplexRendererFns
3964 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3965  Register Reg;
3966  unsigned Offset;
3967  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3968  return {{
3969  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3970  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3971  }};
3972 }
3973 
3974 InstructionSelector::ComplexRendererFns
3975 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3976  return selectDSReadWrite2(Root, 4);
3977 }
3978 
3979 InstructionSelector::ComplexRendererFns
3980 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3981  return selectDSReadWrite2(Root, 8);
3982 }
3983 
3984 InstructionSelector::ComplexRendererFns
3985 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3986  unsigned Size) const {
3987  Register Reg;
3988  unsigned Offset;
3989  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3990  return {{
3991  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3992  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3993  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3994  }};
3995 }
3996 
3997 std::pair<Register, unsigned>
3998 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3999  unsigned Size) const {
4000  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4001  if (!RootDef)
4002  return std::make_pair(Root.getReg(), 0);
4003 
4004  int64_t ConstAddr = 0;
4005 
4006  Register PtrBase;
4007  int64_t Offset;
4008  std::tie(PtrBase, Offset) =
4009  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4010 
4011  if (Offset) {
4012  int64_t OffsetValue0 = Offset;
4013  int64_t OffsetValue1 = Offset + Size;
4014  if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4015  // (add n0, c0)
4016  return std::make_pair(PtrBase, OffsetValue0 / Size);
4017  }
4018  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4019  // TODO
4020 
4021  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4022  // TODO
4023 
4024  }
4025 
4026  return std::make_pair(Root.getReg(), 0);
4027 }
4028 
4029 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4030 /// the base value with the constant offset. There may be intervening copies
4031 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4032 /// not match the pattern.
4033 std::pair<Register, int64_t>
4034 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4035  Register Root, const MachineRegisterInfo &MRI) const {
4036  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4037  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4038  return {Root, 0};
4039 
4040  MachineOperand &RHS = RootI->getOperand(2);
4041  Optional<ValueAndVReg> MaybeOffset =
4042  getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4043  if (!MaybeOffset)
4044  return {Root, 0};
4045  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4046 }
4047 
4048 static void addZeroImm(MachineInstrBuilder &MIB) {
4049  MIB.addImm(0);
4050 }
4051 
4052 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4053 /// BasePtr is not valid, a null base pointer will be used.
4054 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4055  uint32_t FormatLo, uint32_t FormatHi,
4056  Register BasePtr) {
4057  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4058  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4059  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4060  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4061 
4062  B.buildInstr(AMDGPU::S_MOV_B32)
4063  .addDef(RSrc2)
4064  .addImm(FormatLo);
4065  B.buildInstr(AMDGPU::S_MOV_B32)
4066  .addDef(RSrc3)
4067  .addImm(FormatHi);
4068 
4069  // Build the half of the subregister with the constants before building the
4070  // full 128-bit register. If we are building multiple resource descriptors,
4071  // this will allow CSEing of the 2-component register.
4072  B.buildInstr(AMDGPU::REG_SEQUENCE)
4073  .addDef(RSrcHi)
4074  .addReg(RSrc2)
4075  .addImm(AMDGPU::sub0)
4076  .addReg(RSrc3)
4077  .addImm(AMDGPU::sub1);
4078 
4079  Register RSrcLo = BasePtr;
4080  if (!BasePtr) {
4081  RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4082  B.buildInstr(AMDGPU::S_MOV_B64)
4083  .addDef(RSrcLo)
4084  .addImm(0);
4085  }
4086 
4087  B.buildInstr(AMDGPU::REG_SEQUENCE)
4088  .addDef(RSrc)
4089  .addReg(RSrcLo)
4090  .addImm(AMDGPU::sub0_sub1)
4091  .addReg(RSrcHi)
4092  .addImm(AMDGPU::sub2_sub3);
4093 
4094  return RSrc;
4095 }
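// --- Editorial sketch (not part of the upstream file) ---
// buildRSRC assembles the 128-bit descriptor as four dwords: the 64-bit base
// pointer in sub0_sub1 and the two format words in sub2/sub3. A plain-data
// model of that placement (no further meaning of the fields is implied):
struct RsrcLayoutSketch {
  unsigned Dword[4];
};
static RsrcLayoutSketch buildRsrcSketch(unsigned long long BasePtr,
                                        unsigned FormatLo, unsigned FormatHi) {
  RsrcLayoutSketch R;
  R.Dword[0] = (unsigned)(BasePtr & 0xffffffffu);  // base pointer, low half
  R.Dword[1] = (unsigned)(BasePtr >> 32);          // base pointer, high half
  R.Dword[2] = FormatLo;                           // sub2
  R.Dword[3] = FormatHi;                           // sub3
  return R;
}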
4096 
4097 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4098  const SIInstrInfo &TII, Register BasePtr) {
4099  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4100 
4101  // FIXME: Why are half the "default" bits ignored based on the addressing
4102  // mode?
4103  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4104 }
4105 
4106 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4107  const SIInstrInfo &TII, Register BasePtr) {
4108  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4109 
4110  // FIXME: Why are half the "default" bits ignored based on the addressing
4111  // mode?
4112  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4113 }
4114 
4115 AMDGPUInstructionSelector::MUBUFAddressData
4116 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4117  MUBUFAddressData Data;
4118  Data.N0 = Src;
4119 
4120  Register PtrBase;
4121  int64_t Offset;
4122 
4123  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4124  if (isUInt<32>(Offset)) {
4125  Data.N0 = PtrBase;
4126  Data.Offset = Offset;
4127  }
4128 
4129  if (MachineInstr *InputAdd
4130  = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4131  Data.N2 = InputAdd->getOperand(1).getReg();
4132  Data.N3 = InputAdd->getOperand(2).getReg();
4133 
4134  // FIXME: Need to fix extra SGPR->VGPR copies inserted
4135  // FIXME: Don't know this was defined by operand 0
4136  //
4137  // TODO: Remove this when we have copy folding optimizations after
4138  // RegBankSelect.
4139  Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4140  Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4141  }
4142 
4143  return Data;
4144 }
4145 
4146 /// Return if the addr64 mubuf mode should be used for the given address.
4147 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4148  // (ptr_add N2, N3) -> addr64, or
4149  // (ptr_add (ptr_add N2, N3), C1) -> addr64
4150  if (Addr.N2)
4151  return true;
4152 
4153  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4154  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4155 }
4156 
4157 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4158 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4159 /// component.
4160 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4161  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4162  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4163  return;
4164 
4165  // Illegal offset, store it in soffset.
4166  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4167  B.buildInstr(AMDGPU::S_MOV_B32)
4168  .addDef(SOffset)
4169  .addImm(ImmOffset);
4170  ImmOffset = 0;
4171 }
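// --- Editorial sketch (not part of the upstream file) ---
// splitIllegalMUBUFOffset does not split bit by bit: when the immediate does
// not fit the MUBUF offset field it moves the whole value into an soffset
// register via S_MOV_B32 and leaves 0 as the immediate. A scalar restatement
// (helper name and the bool flag are illustrative only):
static void splitIllegalMubufOffsetSketch(bool ImmIsLegal, long long &ImmOffset,
                                          long long &SOffsetValue,
                                          bool &UsesSOffset) {
  if (ImmIsLegal) {
    UsesSOffset = false;          // keep the immediate untouched
    return;
  }
  SOffsetValue = ImmOffset;       // entire offset goes to the soffset register
  ImmOffset = 0;
  UsesSOffset = true;
}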
4172 
4173 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4174  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4175  Register &SOffset, int64_t &Offset) const {
4176  // FIXME: Predicates should stop this from reaching here.
4177  // addr64 bit was removed for volcanic islands.
4178  if (!STI.hasAddr64() || STI.useFlatForGlobal())
4179  return false;
4180 
4181  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4182  if (!shouldUseAddr64(AddrData))
4183  return false;
4184 
4185  Register N0 = AddrData.N0;
4186  Register N2 = AddrData.N2;
4187  Register N3 = AddrData.N3;
4188  Offset = AddrData.Offset;
4189 
4190  // Base pointer for the SRD.
4191  Register SRDPtr;
4192 
4193  if (N2) {
4194  if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4195  assert(N3);
4196  if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4197  // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4198  // addr64, and construct the default resource from a 0 address.
4199  VAddr = N0;
4200  } else {
4201  SRDPtr = N3;
4202  VAddr = N2;
4203  }
4204  } else {
4205  // N2 is not divergent.
4206  SRDPtr = N2;
4207  VAddr = N3;
4208  }
4209  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4210  // Use the default null pointer in the resource
4211  VAddr = N0;
4212  } else {
4213  // N0 -> offset, or
4214  // (N0 + C1) -> offset
4215  SRDPtr = N0;
4216  }
4217 
4218  MachineIRBuilder B(*Root.getParent());
4219  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4220  splitIllegalMUBUFOffset(B, SOffset, Offset);
4221  return true;
4222 }
4223 
4224 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4225  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4226  int64_t &Offset) const {
4227 
4228  // FIXME: Pattern should not reach here.
4229  if (STI.useFlatForGlobal())
4230  return false;
4231 
4232  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4233  if (shouldUseAddr64(AddrData))
4234  return false;
4235 
4236  // N0 -> offset, or
4237  // (N0 + C1) -> offset
4238  Register SRDPtr = AddrData.N0;
4239  Offset = AddrData.Offset;
4240 
4241  // TODO: Look through extensions for 32-bit soffset.
4242  MachineIRBuilder B(*Root.getParent());
4243 
4244  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4245  splitIllegalMUBUFOffset(B, SOffset, Offset);
4246  return true;
4247 }
4248 
4249 InstructionSelector::ComplexRendererFns
4250 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4251  Register VAddr;
4252  Register RSrcReg;
4253  Register SOffset;
4254  int64_t Offset = 0;
4255 
4256  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4257  return {};
4258 
4259  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4260  // pattern.
4261  return {{
4262  [=](MachineInstrBuilder &MIB) { // rsrc
4263  MIB.addReg(RSrcReg);
4264  },
4265  [=](MachineInstrBuilder &MIB) { // vaddr
4266  MIB.addReg(VAddr);
4267  },
4268  [=](MachineInstrBuilder &MIB) { // soffset
4269  if (SOffset)
4270  MIB.addReg(SOffset);
4271  else
4272  MIB.addImm(0);
4273  },
4274  [=](MachineInstrBuilder &MIB) { // offset
4275  MIB.addImm(Offset);
4276  },
4277  addZeroImm, // cpol
4278  addZeroImm, // tfe
4279  addZeroImm // swz
4280  }};
4281 }
4282 
4283 InstructionSelector::ComplexRendererFns
4284 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4285  Register RSrcReg;
4286  Register SOffset;
4287  int64_t Offset = 0;
4288 
4289  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4290  return {};
4291 
4292  return {{
4293  [=](MachineInstrBuilder &MIB) { // rsrc
4294  MIB.addReg(RSrcReg);
4295  },
4296  [=](MachineInstrBuilder &MIB) { // soffset
4297  if (SOffset)
4298  MIB.addReg(SOffset);
4299  else
4300  MIB.addImm(0);
4301  },
4302  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4303  addZeroImm, // cpol
4304  addZeroImm, // tfe
4305  addZeroImm, // swz
4306  }};
4307 }
4308 
4309 InstructionSelector::ComplexRendererFns
4310 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4311  Register VAddr;
4312  Register RSrcReg;
4313  Register SOffset;
4314  int64_t Offset = 0;
4315 
4316  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4317  return {};
4318 
4319  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4320  // pattern.
4321  return {{
4322  [=](MachineInstrBuilder &MIB) { // rsrc
4323  MIB.addReg(RSrcReg);
4324  },
4325  [=](MachineInstrBuilder &MIB) { // vaddr
4326  MIB.addReg(VAddr);
4327  },
4328  [=](MachineInstrBuilder &MIB) { // soffset
4329  if (SOffset)
4330  MIB.addReg(SOffset);
4331  else
4332  MIB.addImm(0);
4333  },
4334  [=](MachineInstrBuilder &MIB) { // offset
4335  MIB.addImm(Offset);
4336  },
4337  [=](MachineInstrBuilder &MIB) {
4338  MIB.addImm(AMDGPU::CPol::GLC); // cpol
4339  }
4340  }};
4341 }
4342 
4343 InstructionSelector::ComplexRendererFns
4344 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4345  Register RSrcReg;
4346  Register SOffset;
4347  int64_t Offset = 0;
4348 
4349  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4350  return {};
4351 
4352  return {{
4353  [=](MachineInstrBuilder &MIB) { // rsrc
4354  MIB.addReg(RSrcReg);
4355  },
4356  [=](MachineInstrBuilder &MIB) { // soffset
4357  if (SOffset)
4358  MIB.addReg(SOffset);
4359  else
4360  MIB.addImm(0);
4361  },
4362  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4363  [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4364  }};
4365 }
4366 
4367 /// Get an immediate that must be 32-bits, and treated as zero extended.
4368 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4369  const MachineRegisterInfo &MRI) {
4370  // getIConstantVRegVal sexts any values, so see if that matters.
4371  Optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
4372  if (!OffsetVal || !isInt<32>(*OffsetVal))
4373  return None;
4374  return Lo_32(*OffsetVal);
4375 }
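// --- Editorial sketch (not part of the upstream file) ---
// getConstantZext32Val guards against the sign-extending constant lookup: the
// value is only usable as a zero-extended 32-bit immediate if the
// sign-extended 64-bit constant already fits in [INT32_MIN, INT32_MAX], after
// which only the low 32 bits (Lo_32) are kept. A scalar model of that check:
static bool constantZext32Sketch(long long SExtVal, unsigned &ZExt32) {
  if (SExtVal < -2147483647LL - 1 || SExtVal > 2147483647LL)
    return false;                                 // does not fit in 32 bits
  ZExt32 = (unsigned)(SExtVal & 0xffffffffLL);    // Lo_32 of the constant
  return true;
}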
4376 
4377 InstructionSelector::ComplexRendererFns
4378 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4379  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4380  if (!OffsetVal)
4381  return {};
4382 
4383  Optional<int64_t> EncodedImm =
4384  AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4385  if (!EncodedImm)
4386  return {};
4387 
4388  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4389 }
4390 
4391 InstructionSelector::ComplexRendererFns
4392 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4393  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4394 
4395  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4396  if (!OffsetVal)
4397  return {};
4398 
4399  Optional<int64_t> EncodedImm
4400  = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4401  if (!EncodedImm)
4402  return {};
4403 
4404  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4405 }
4406 
4407 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4408  const MachineInstr &MI,
4409  int OpIdx) const {
4410  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4411  "Expected G_CONSTANT");
4412  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4413 }
4414 
4415 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4416  const MachineInstr &MI,
4417  int OpIdx) const {
4418  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4419  "Expected G_CONSTANT");
4420  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4421 }
4422 
4423 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4424  const MachineInstr &MI,
4425  int OpIdx) const {
4426  assert(OpIdx == -1);
4427 
4428  const MachineOperand &Op = MI.getOperand(1);
4429  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4430  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4431  else {
4432  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4433  MIB.addImm(Op.getCImm()->getSExtValue());
4434  }
4435 }
4436 
4437 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4438  const MachineInstr &MI,
4439  int OpIdx) const {
4440  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4441  "Expected G_CONSTANT");
4442  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4443 }
4444 
4445 /// This only really exists to satisfy DAG type checking machinery, so is a
4446 /// no-op here.
4447 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4448  const MachineInstr &MI,
4449  int OpIdx) const {
4450  MIB.addImm(MI.getOperand(OpIdx).getImm());
4451 }
4452 
4453 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
4454  const MachineInstr &MI,
4455  int OpIdx) const {
4456  assert(OpIdx >= 0 && "expected to match an immediate operand");
4457  MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
4458 }
4459 
4460 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4461  const MachineInstr &MI,
4462  int OpIdx) const {
4463  assert(OpIdx >= 0 && "expected to match an immediate operand");
4464  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4465 }
4466 
4467 void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
4468  const MachineInstr &MI,
4469  int OpIdx) const {
4470  assert(OpIdx >= 0 && "expected to match an immediate operand");
4471  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
4472 }
4473 
4474 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4475  const MachineInstr &MI,
4476  int OpIdx) const {
4477  MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4478 }
4479 
4480 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4481  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4482 }
4483 
4484 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4485  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4486 }
4487 
4488 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4489  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4490 }
4491 
4492 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4493  return TII.isInlineConstant(Imm);
4494 }
Definition: LowLevelTypeImpl.h:152
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::SIRegisterInfo::getReturnAddressReg
MCRegister getReturnAddressReg(const MachineFunction &MF) const
Definition: SIRegisterInfo.cpp:2661
llvm::InstructionSelector::CoverageInfo
CodeGenCoverage * CoverageInfo
Definition: InstructionSelector.h:439
gwsIntrinToOpcode
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Definition: AMDGPUInstructionSelector.cpp:1320
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:127
llvm::MachineOperand::ChangeToImmediate
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
Definition: MachineOperand.cpp:156
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:195
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:49
llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
{ Convenience matchers for specific integer values.
Definition: MIPatternMatch.h:192
llvm::M0
unsigned M0(unsigned Val)
Definition: VE.h:370
llvm::Instruction
Definition: Instruction.h:45
llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1130
llvm::AMDGPUTargetMachine::getNullPointerValue
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Definition: AMDGPUTargetMachine.cpp:744
llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition: TargetRegisterInfo.h:138
llvm::report_fatal_error
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:143
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1467
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:360
llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:47
llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition: SIRegisterInfo.cpp:505
matchZeroExtendFromS32
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
Definition: AMDGPUInstructionSelector.cpp:3587
llvm::MIPatternMatch::m_SpecificICst
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
Definition: MIPatternMatch.h:149
llvm::SIRegisterInfo::isSGPRClass
static bool isSGPRClass(const TargetRegisterClass *RC)
Definition: SIRegisterInfo.h:174
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::APFloat::bitcastToAPInt
APInt bitcastToAPInt() const
Definition: APFloat.h:1130
llvm::AMDGPU::getMIMGOpcode
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
Definition: AMDGPUBaseInfo.cpp:140
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
GET_GLOBALISEL_TEMPORARIES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
llvm::MachineRegisterInfo::getVRegDef
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
Definition: MachineRegisterInfo.cpp:398
llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition: MachineOperand.h:238
llvm::None
const NoneType None
Definition: None.h:23
llvm::RegisterBankInfo::getSizeInBits
unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition: RegisterBankInfo.cpp:493
llvm::MachineRegisterInfo::use_empty
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
Definition: MachineRegisterInfo.h:506
llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition: RegisterBankInfo.cpp:132
llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition: AMDGPUBaseInfo.cpp:1468
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::MachineBasicBlock</