AMDGPUInstructionSelector.cpp
1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
24 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/IR/DiagnosticInfo.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 
29 #define DEBUG_TYPE "amdgpu-isel"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
34 static cl::opt<bool> AllowRiskySelect(
35  "amdgpu-global-isel-risky-select",
36  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
37  cl::init(false),
38  cl::ReallyHidden);
39 
40 #define GET_GLOBALISEL_IMPL
41 #define AMDGPUSubtarget GCNSubtarget
42 #include "AMDGPUGenGlobalISel.inc"
43 #undef GET_GLOBALISEL_IMPL
44 #undef AMDGPUSubtarget
45 
46 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
47  const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
48  const AMDGPUTargetMachine &TM)
49  : InstructionSelector(), TII(*STI.getInstrInfo()),
50  TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
51  STI(STI),
52  EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
53 #define GET_GLOBALISEL_PREDICATES_INIT
54 #include "AMDGPUGenGlobalISel.inc"
55 #undef GET_GLOBALISEL_PREDICATES_INIT
56 #define GET_GLOBALISEL_TEMPORARIES_INIT
57 #include "AMDGPUGenGlobalISel.inc"
58 #undef GET_GLOBALISEL_TEMPORARIES_INIT
59 {
60 }
61 
62 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
63 
64 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
65  CodeGenCoverage &CoverageInfo,
66  ProfileSummaryInfo *PSI,
67  BlockFrequencyInfo *BFI) {
68  MRI = &MF.getRegInfo();
69  Subtarget = &MF.getSubtarget<GCNSubtarget>();
70  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
71 }
72 
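// Returns true if Reg holds a wave-size lane mask: either it lives in the VCC
// register bank, or it has a boolean register class together with an s1 LLT.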
73 bool AMDGPUInstructionSelector::isVCC(Register Reg,
74  const MachineRegisterInfo &MRI) const {
75  // The verifier is oblivious to s1 being a valid value for wavesize registers.
76  if (Reg.isPhysical())
77  return false;
78 
79  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
80  const TargetRegisterClass *RC =
81  RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
82  if (RC) {
83  const LLT Ty = MRI.getType(Reg);
84  return RC->hasSuperClassEq(TRI.getBoolRC()) &&
85  Ty.isValid() && Ty.getSizeInBits() == 1;
86  }
87 
88  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
89  return RB->getID() == AMDGPU::VCCRegBankID;
90 }
91 
92 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
93  unsigned NewOpc) const {
94  MI.setDesc(TII.get(NewOpc));
95  MI.RemoveOperand(1); // Remove intrinsic ID.
96  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
97 
98  MachineOperand &Dst = MI.getOperand(0);
99  MachineOperand &Src = MI.getOperand(1);
100 
101  // TODO: This should be legalized to s32 if needed
102  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
103  return false;
104 
105  const TargetRegisterClass *DstRC
106  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
107  const TargetRegisterClass *SrcRC
108  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
109  if (!DstRC || DstRC != SrcRC)
110  return false;
111 
112  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
113  RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
114 }
115 
116 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
117  const DebugLoc &DL = I.getDebugLoc();
118  MachineBasicBlock *BB = I.getParent();
119  I.setDesc(TII.get(TargetOpcode::COPY));
120 
121  const MachineOperand &Src = I.getOperand(1);
122  MachineOperand &Dst = I.getOperand(0);
123  Register DstReg = Dst.getReg();
124  Register SrcReg = Src.getReg();
125 
126  if (isVCC(DstReg, *MRI)) {
127  if (SrcReg == AMDGPU::SCC) {
128  const TargetRegisterClass *RC
129  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
130  if (!RC)
131  return true;
132  return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
133  }
134 
135  if (!isVCC(SrcReg, *MRI)) {
136  // TODO: Should probably leave the copy and let copyPhysReg expand it.
137  if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
138  return false;
139 
140  const TargetRegisterClass *SrcRC
141  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
142 
143  Optional<ValueAndVReg> ConstVal =
144  getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
145  if (ConstVal) {
146  unsigned MovOpc =
147  STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
148  BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
149  .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
150  } else {
151  Register MaskedReg = MRI->createVirtualRegister(SrcRC);
152 
153  // We can't trust the high bits at this point, so clear them.
154 
155  // TODO: Skip masking high bits if def is known boolean.
156 
157  unsigned AndOpc =
158  TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
159  BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
160  .addImm(1)
161  .addReg(SrcReg);
162  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
163  .addImm(0)
164  .addReg(MaskedReg);
165  }
166 
167  if (!MRI->getRegClassOrNull(SrcReg))
168  MRI->setRegClass(SrcReg, SrcRC);
169  I.eraseFromParent();
170  return true;
171  }
172 
173  const TargetRegisterClass *RC =
174  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
175  if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
176  return false;
177 
178  return true;
179  }
180 
181  for (const MachineOperand &MO : I.operands()) {
182  if (MO.getReg().isPhysical())
183  continue;
184 
185  const TargetRegisterClass *RC =
186  TRI.getConstrainedRegClassForOperand(MO, *MRI);
187  if (!RC)
188  continue;
189  RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
190  }
191  return true;
192 }
193 
194 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
195  const Register DefReg = I.getOperand(0).getReg();
196  const LLT DefTy = MRI->getType(DefReg);
197  if (DefTy == LLT::scalar(1)) {
198  if (!AllowRiskySelect) {
199  LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
200  return false;
201  }
202 
203  LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
204  }
205 
206  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
207 
208  const RegClassOrRegBank &RegClassOrBank =
209  MRI->getRegClassOrRegBank(DefReg);
210 
211  const TargetRegisterClass *DefRC
212  = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
213  if (!DefRC) {
214  if (!DefTy.isValid()) {
215  LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
216  return false;
217  }
218 
219  const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
220  DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
221  if (!DefRC) {
222  LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
223  return false;
224  }
225  }
226 
227  // TODO: Verify that all registers have the same bank
228  I.setDesc(TII.get(TargetOpcode::PHI));
229  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
230 }
231 
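// Extracts the sub0 (low) or sub1 (high) 32-bit half of a 64-bit operand:
// register operands get a subregister COPY into a fresh vreg, while immediate
// operands are split into their low/high 32-bit pieces.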
232 MachineOperand
233 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
234  const TargetRegisterClass &SubRC,
235  unsigned SubIdx) const {
236 
237  MachineInstr *MI = MO.getParent();
238  MachineBasicBlock *BB = MO.getParent()->getParent();
239  Register DstReg = MRI->createVirtualRegister(&SubRC);
240 
241  if (MO.isReg()) {
242  unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
243  Register Reg = MO.getReg();
244  BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
245  .addReg(Reg, 0, ComposedSubIdx);
246 
247  return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
248  MO.isKill(), MO.isDead(), MO.isUndef(),
249  MO.isEarlyClobber(), 0, MO.isDebug(),
250  MO.isInternalRead());
251  }
252 
253  assert(MO.isImm());
254 
255  APInt Imm(64, MO.getImm());
256 
257  switch (SubIdx) {
258  default:
259  llvm_unreachable("do not know how to split immediate with this sub index.");
260  case AMDGPU::sub0:
261  return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
262  case AMDGPU::sub1:
263  return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
264  }
265 }
266 
267 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
268  switch (Opc) {
269  case AMDGPU::G_AND:
270  return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
271  case AMDGPU::G_OR:
272  return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
273  case AMDGPU::G_XOR:
274  return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
275  default:
276  llvm_unreachable("not a bit op");
277  }
278 }
279 
280 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
281  Register DstReg = I.getOperand(0).getReg();
282  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
283 
284  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
285  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
286  DstRB->getID() != AMDGPU::VCCRegBankID)
287  return false;
288 
289  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
290  STI.isWave64());
291  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
292 
293  // Dead implicit-def of scc
294  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
295  true, // isImp
296  false, // isKill
297  true)); // isDead
298  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
299 }
300 
301 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
302  MachineBasicBlock *BB = I.getParent();
303  MachineFunction *MF = BB->getParent();
304  Register DstReg = I.getOperand(0).getReg();
305  const DebugLoc &DL = I.getDebugLoc();
306  LLT Ty = MRI->getType(DstReg);
307  if (Ty.isVector())
308  return false;
309 
310  unsigned Size = Ty.getSizeInBits();
311  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
312  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
313  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
314 
315  if (Size == 32) {
316  if (IsSALU) {
317  const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
318  MachineInstr *Add =
319  BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
320  .add(I.getOperand(1))
321  .add(I.getOperand(2));
322  I.eraseFromParent();
323  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
324  }
325 
326  if (STI.hasAddNoCarry()) {
327  const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
328  I.setDesc(TII.get(Opc));
329  I.addOperand(*MF, MachineOperand::CreateImm(0));
330  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
331  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
332  }
333 
334  const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
335 
336  Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
337  MachineInstr *Add
338  = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
339  .addDef(UnusedCarry, RegState::Dead)
340  .add(I.getOperand(1))
341  .add(I.getOperand(2))
342  .addImm(0);
343  I.eraseFromParent();
344  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
345  }
346 
347  assert(!Sub && "illegal sub should not reach here");
348 
349  const TargetRegisterClass &RC
350  = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
351  const TargetRegisterClass &HalfRC
352  = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
353 
354  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
355  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
356  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
357  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
358 
359  Register DstLo = MRI->createVirtualRegister(&HalfRC);
360  Register DstHi = MRI->createVirtualRegister(&HalfRC);
361 
362  if (IsSALU) {
363  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
364  .add(Lo1)
365  .add(Lo2);
366  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
367  .add(Hi1)
368  .add(Hi2);
369  } else {
370  const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
371  Register CarryReg = MRI->createVirtualRegister(CarryRC);
372  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
373  .addDef(CarryReg)
374  .add(Lo1)
375  .add(Lo2)
376  .addImm(0);
377  MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
378  .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
379  .add(Hi1)
380  .add(Hi2)
381  .addReg(CarryReg, RegState::Kill)
382  .addImm(0);
383 
384  if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
385  return false;
386  }
387 
388  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
389  .addReg(DstLo)
390  .addImm(AMDGPU::sub0)
391  .addReg(DstHi)
392  .addImm(AMDGPU::sub1);
393 
394 
395  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
396  return false;
397 
398  I.eraseFromParent();
399  return true;
400 }
401 
402 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
403  MachineInstr &I) const {
404  MachineBasicBlock *BB = I.getParent();
405  MachineFunction *MF = BB->getParent();
406  const DebugLoc &DL = I.getDebugLoc();
407  Register Dst0Reg = I.getOperand(0).getReg();
408  Register Dst1Reg = I.getOperand(1).getReg();
409  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
410  I.getOpcode() == AMDGPU::G_UADDE;
411  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
412  I.getOpcode() == AMDGPU::G_USUBE;
413 
414  if (isVCC(Dst1Reg, *MRI)) {
415  unsigned NoCarryOpc =
416  IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
417  unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
418  I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
419  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
420  I.addOperand(*MF, MachineOperand::CreateImm(0));
421  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
422  }
423 
424  Register Src0Reg = I.getOperand(2).getReg();
425  Register Src1Reg = I.getOperand(3).getReg();
426 
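 // SALU path: the carry is modeled through SCC. Copy any incoming carry into
 // SCC before the add/sub, and copy SCC back out as the carry-out result.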
427  if (HasCarryIn) {
428  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
429  .addReg(I.getOperand(4).getReg());
430  }
431 
432  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
433  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
434 
435  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
436  .add(I.getOperand(2))
437  .add(I.getOperand(3));
438  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
439  .addReg(AMDGPU::SCC);
440 
441  if (!MRI->getRegClassOrNull(Dst1Reg))
442  MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
443 
444  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
445  !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
446  !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
447  return false;
448 
449  if (HasCarryIn &&
450  !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
451  AMDGPU::SReg_32RegClass, *MRI))
452  return false;
453 
454  I.eraseFromParent();
455  return true;
456 }
457 
458 // TODO: We should probably legalize these to only using 32-bit results.
459 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
460  MachineBasicBlock *BB = I.getParent();
461  Register DstReg = I.getOperand(0).getReg();
462  Register SrcReg = I.getOperand(1).getReg();
463  LLT DstTy = MRI->getType(DstReg);
464  LLT SrcTy = MRI->getType(SrcReg);
465  const unsigned SrcSize = SrcTy.getSizeInBits();
466  unsigned DstSize = DstTy.getSizeInBits();
467 
468  // TODO: Should handle any multiple of 32 offset.
469  unsigned Offset = I.getOperand(2).getImm();
470  if (Offset % 32 != 0 || DstSize > 128)
471  return false;
472 
473  // 16-bit operations really use 32-bit registers.
474  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
475  if (DstSize == 16)
476  DstSize = 32;
477 
478  const TargetRegisterClass *DstRC =
479  TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
480  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
481  return false;
482 
483  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
484  const TargetRegisterClass *SrcRC =
485  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
486  if (!SrcRC)
487  return false;
488  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
489  DstSize / 32);
490  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
491  if (!SrcRC)
492  return false;
493 
494  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
495  *SrcRC, I.getOperand(1));
496  const DebugLoc &DL = I.getDebugLoc();
497  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
498  .addReg(SrcReg, 0, SubReg);
499 
500  I.eraseFromParent();
501  return true;
502 }
503 
504 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
505  MachineBasicBlock *BB = MI.getParent();
506  Register DstReg = MI.getOperand(0).getReg();
507  LLT DstTy = MRI->getType(DstReg);
508  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
509 
510  const unsigned SrcSize = SrcTy.getSizeInBits();
511  if (SrcSize < 32)
512  return selectImpl(MI, *CoverageInfo);
513 
514  const DebugLoc &DL = MI.getDebugLoc();
515  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
516  const unsigned DstSize = DstTy.getSizeInBits();
517  const TargetRegisterClass *DstRC =
518  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
519  if (!DstRC)
520  return false;
521 
522  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
523  MachineInstrBuilder MIB =
524  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
525  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
526  MachineOperand &Src = MI.getOperand(I + 1);
527  MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
528  MIB.addImm(SubRegs[I]);
529 
530  const TargetRegisterClass *SrcRC
531  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
532  if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
533  return false;
534  }
535 
536  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
537  return false;
538 
539  MI.eraseFromParent();
540  return true;
541 }
542 
543 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
544  MachineBasicBlock *BB = MI.getParent();
545  const int NumDst = MI.getNumOperands() - 1;
546 
547  MachineOperand &Src = MI.getOperand(NumDst);
548 
549  Register SrcReg = Src.getReg();
550  Register DstReg0 = MI.getOperand(0).getReg();
551  LLT DstTy = MRI->getType(DstReg0);
552  LLT SrcTy = MRI->getType(SrcReg);
553 
554  const unsigned DstSize = DstTy.getSizeInBits();
555  const unsigned SrcSize = SrcTy.getSizeInBits();
556  const DebugLoc &DL = MI.getDebugLoc();
557  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
558 
559  const TargetRegisterClass *SrcRC =
560  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
561  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
562  return false;
563 
564  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
565  // source, and this relies on the fact that the same subregister indices are
566  // used for both.
567  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
568  for (int I = 0, E = NumDst; I != E; ++I) {
569  MachineOperand &Dst = MI.getOperand(I);
570  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
571  .addReg(SrcReg, 0, SubRegs[I]);
572 
573  // Make sure the subregister index is valid for the source register.
574  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
575  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
576  return false;
577 
578  const TargetRegisterClass *DstRC =
579  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
580  if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
581  return false;
582  }
583 
584  MI.eraseFromParent();
585  return true;
586 }
587 
588 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
589  MachineInstr &MI) const {
590  if (selectImpl(MI, *CoverageInfo))
591  return true;
592 
593  const LLT S32 = LLT::scalar(32);
594  const LLT V2S16 = LLT::fixed_vector(2, 16);
595 
596  Register Dst = MI.getOperand(0).getReg();
597  if (MRI->getType(Dst) != V2S16)
598  return false;
599 
600  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
601  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
602  return false;
603 
604  Register Src0 = MI.getOperand(1).getReg();
605  Register Src1 = MI.getOperand(2).getReg();
606  if (MRI->getType(Src0) != S32)
607  return false;
608 
609  const DebugLoc &DL = MI.getDebugLoc();
610  MachineBasicBlock *BB = MI.getParent();
611 
612  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
613  if (ConstSrc1) {
614  auto ConstSrc0 =
615  getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
616  if (ConstSrc0) {
617  const int64_t K0 = ConstSrc0->Value.getSExtValue();
618  const int64_t K1 = ConstSrc1->Value.getSExtValue();
619  uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
620  uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
621 
622  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
623  .addImm(Lo16 | (Hi16 << 16));
624  MI.eraseFromParent();
625  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
626  }
627  }
628 
629  // TODO: This should probably be a combine somewhere
630  // (build_vector_trunc $src0, undef -> copy $src0
631  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
632  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
633  MI.setDesc(TII.get(AMDGPU::COPY));
634  MI.RemoveOperand(2);
635  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
636  RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
637  }
638 
639  Register ShiftSrc0;
640  Register ShiftSrc1;
641 
642  // With multiple uses of the shift, this will duplicate the shift and
643  // increase register pressure.
644  //
645  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
646  // => (S_PACK_HH_B32_B16 $src0, $src1)
647  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
648  // => (S_PACK_LH_B32_B16 $src0, $src1)
649  // (build_vector_trunc $src0, $src1)
650  // => (S_PACK_LL_B32_B16 $src0, $src1)
651 
652  bool Shift0 = mi_match(
653  Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
654 
655  bool Shift1 = mi_match(
656  Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
657 
658  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
659  if (Shift0 && Shift1) {
660  Opc = AMDGPU::S_PACK_HH_B32_B16;
661  MI.getOperand(1).setReg(ShiftSrc0);
662  MI.getOperand(2).setReg(ShiftSrc1);
663  } else if (Shift1) {
664  Opc = AMDGPU::S_PACK_LH_B32_B16;
665  MI.getOperand(2).setReg(ShiftSrc1);
666  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
667  // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
668  auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
669  .addReg(ShiftSrc0)
670  .addImm(16);
671 
672  MI.eraseFromParent();
673  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
674  }
675 
676  MI.setDesc(TII.get(Opc));
677  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
678 }
679 
680 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
681  return selectG_ADD_SUB(I);
682 }
683 
684 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
685  const MachineOperand &MO = I.getOperand(0);
686 
687  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
688  // regbank check here is to know why getConstrainedRegClassForOperand failed.
689  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
690  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
691  (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
692  I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
693  return true;
694  }
695 
696  return false;
697 }
698 
699 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
700  MachineBasicBlock *BB = I.getParent();
701 
702  Register DstReg = I.getOperand(0).getReg();
703  Register Src0Reg = I.getOperand(1).getReg();
704  Register Src1Reg = I.getOperand(2).getReg();
705  LLT Src1Ty = MRI->getType(Src1Reg);
706 
707  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
708  unsigned InsSize = Src1Ty.getSizeInBits();
709 
710  int64_t Offset = I.getOperand(3).getImm();
711 
712  // FIXME: These cases should have been illegal and unnecessary to check here.
713  if (Offset % 32 != 0 || InsSize % 32 != 0)
714  return false;
715 
716  // Currently not handled by getSubRegFromChannel.
717  if (InsSize > 128)
718  return false;
719 
720  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
721  if (SubReg == AMDGPU::NoSubRegister)
722  return false;
723 
724  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
725  const TargetRegisterClass *DstRC =
726  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
727  if (!DstRC)
728  return false;
729 
730  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
731  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
732  const TargetRegisterClass *Src0RC =
733  TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
734  const TargetRegisterClass *Src1RC =
735  TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
736 
737  // Deal with weird cases where the class only partially supports the subreg
738  // index.
739  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
740  if (!Src0RC || !Src1RC)
741  return false;
742 
743  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
744  !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
745  !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
746  return false;
747 
748  const DebugLoc &DL = I.getDebugLoc();
749  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
750  .addReg(Src0Reg)
751  .addReg(Src1Reg)
752  .addImm(SubReg);
753 
754  I.eraseFromParent();
755  return true;
756 }
757 
758 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
759  Register DstReg = MI.getOperand(0).getReg();
760  Register SrcReg = MI.getOperand(1).getReg();
761  Register OffsetReg = MI.getOperand(2).getReg();
762  Register WidthReg = MI.getOperand(3).getReg();
763 
764  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
765  "scalar BFX instructions are expanded in regbankselect");
766  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
767  "64-bit vector BFX instructions are expanded in regbankselect");
768 
769  const DebugLoc &DL = MI.getDebugLoc();
770  MachineBasicBlock *MBB = MI.getParent();
771 
772  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
773  unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
774  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
775  .addReg(SrcReg)
776  .addReg(OffsetReg)
777  .addReg(WidthReg);
778  MI.eraseFromParent();
779  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
780 }
781 
782 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
783  if (STI.getLDSBankCount() != 16)
784  return selectImpl(MI, *CoverageInfo);
785 
786  Register Dst = MI.getOperand(0).getReg();
787  Register Src0 = MI.getOperand(2).getReg();
788  Register M0Val = MI.getOperand(6).getReg();
789  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
790  !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
791  !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
792  return false;
793 
794  // This requires 2 instructions. It is possible to write a pattern to support
795  // this, but the generated isel emitter doesn't correctly deal with multiple
796  // output instructions using the same physical register input. The copy to m0
797  // is incorrectly placed before the second instruction.
798  //
799  // TODO: Match source modifiers.
800 
801  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
802  const DebugLoc &DL = MI.getDebugLoc();
803  MachineBasicBlock *MBB = MI.getParent();
804 
805  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
806  .addReg(M0Val);
807  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
808  .addImm(2)
809  .addImm(MI.getOperand(4).getImm()) // $attr
810  .addImm(MI.getOperand(3).getImm()); // $attrchan
811 
812  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
813  .addImm(0) // $src0_modifiers
814  .addReg(Src0) // $src0
815  .addImm(MI.getOperand(4).getImm()) // $attr
816  .addImm(MI.getOperand(3).getImm()) // $attrchan
817  .addImm(0) // $src2_modifiers
818  .addReg(InterpMov) // $src2 - 2 f16 values selected by high
819  .addImm(MI.getOperand(5).getImm()) // $high
820  .addImm(0) // $clamp
821  .addImm(0); // $omod
822 
823  MI.eraseFromParent();
824  return true;
825 }
826 
827 // Writelane is special in that it can use SGPR and M0 (which would normally
828 // count as using the constant bus twice - but in this case it is allowed since
829 // the lane selector doesn't count as a use of the constant bus). However, it is
830 // still required to abide by the 1 SGPR rule. Fix this up if we might have
831 // multiple SGPRs.
832 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
833  // With a constant bus limit of at least 2, there's no issue.
834  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
835  return selectImpl(MI, *CoverageInfo);
836 
837  MachineBasicBlock *MBB = MI.getParent();
838  const DebugLoc &DL = MI.getDebugLoc();
839  Register VDst = MI.getOperand(0).getReg();
840  Register Val = MI.getOperand(2).getReg();
841  Register LaneSelect = MI.getOperand(3).getReg();
842  Register VDstIn = MI.getOperand(4).getReg();
843 
844  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
845 
846  Optional<ValueAndVReg> ConstSelect =
847  getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
848  if (ConstSelect) {
849  // The selector has to be an inline immediate, so we can use whatever for
850  // the other operands.
851  MIB.addReg(Val);
852  MIB.addImm(ConstSelect->Value.getSExtValue() &
853  maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
854  } else {
855  Optional<ValueAndVReg> ConstVal =
857 
858  // If the value written is an inline immediate, we can get away without a
859  // copy to m0.
860  if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
861  STI.hasInv2PiInlineImm())) {
862  MIB.addImm(ConstVal->Value.getSExtValue());
863  MIB.addReg(LaneSelect);
864  } else {
865  MIB.addReg(Val);
866 
867  // If the lane selector was originally in a VGPR and copied with
868  // readfirstlane, there's a hazard to read the same SGPR from the
869  // VALU. Constrain to a different SGPR to help avoid needing a nop later.
870  RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
871 
872  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
873  .addReg(LaneSelect);
874  MIB.addReg(AMDGPU::M0);
875  }
876  }
877 
878  MIB.addReg(VDstIn);
879 
880  MI.eraseFromParent();
881  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
882 }
883 
884 // We need to handle this here because tablegen doesn't support matching
885 // instructions with multiple outputs.
886 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
887  Register Dst0 = MI.getOperand(0).getReg();
888  Register Dst1 = MI.getOperand(1).getReg();
889 
890  LLT Ty = MRI->getType(Dst0);
891  unsigned Opc;
892  if (Ty == LLT::scalar(32))
893  Opc = AMDGPU::V_DIV_SCALE_F32_e64;
894  else if (Ty == LLT::scalar(64))
895  Opc = AMDGPU::V_DIV_SCALE_F64_e64;
896  else
897  return false;
898 
899  // TODO: Match source modifiers.
900 
901  const DebugLoc &DL = MI.getDebugLoc();
902  MachineBasicBlock *MBB = MI.getParent();
903 
904  Register Numer = MI.getOperand(3).getReg();
905  Register Denom = MI.getOperand(4).getReg();
906  unsigned ChooseDenom = MI.getOperand(5).getImm();
907 
908  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
909 
910  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
911  .addDef(Dst1)
912  .addImm(0) // $src0_modifiers
913  .addUse(Src0) // $src0
914  .addImm(0) // $src1_modifiers
915  .addUse(Denom) // $src1
916  .addImm(0) // $src2_modifiers
917  .addUse(Numer) // $src2
918  .addImm(0) // $clamp
919  .addImm(0); // $omod
920 
921  MI.eraseFromParent();
922  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
923 }
924 
925 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
926  unsigned IntrinsicID = I.getIntrinsicID();
927  switch (IntrinsicID) {
928  case Intrinsic::amdgcn_if_break: {
929  MachineBasicBlock *BB = I.getParent();
930 
931  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
932  // SelectionDAG uses for wave32 vs wave64.
933  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
934  .add(I.getOperand(0))
935  .add(I.getOperand(2))
936  .add(I.getOperand(3));
937 
938  Register DstReg = I.getOperand(0).getReg();
939  Register Src0Reg = I.getOperand(2).getReg();
940  Register Src1Reg = I.getOperand(3).getReg();
941 
942  I.eraseFromParent();
943 
944  for (Register Reg : { DstReg, Src0Reg, Src1Reg })
945  MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
946 
947  return true;
948  }
949  case Intrinsic::amdgcn_interp_p1_f16:
950  return selectInterpP1F16(I);
951  case Intrinsic::amdgcn_wqm:
952  return constrainCopyLikeIntrin(I, AMDGPU::WQM);
953  case Intrinsic::amdgcn_softwqm:
954  return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
955  case Intrinsic::amdgcn_strict_wwm:
956  case Intrinsic::amdgcn_wwm:
957  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
958  case Intrinsic::amdgcn_strict_wqm:
959  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
960  case Intrinsic::amdgcn_writelane:
961  return selectWritelane(I);
962  case Intrinsic::amdgcn_div_scale:
963  return selectDivScale(I);
964  case Intrinsic::amdgcn_icmp:
965  return selectIntrinsicIcmp(I);
966  case Intrinsic::amdgcn_ballot:
967  return selectBallot(I);
968  case Intrinsic::amdgcn_reloc_constant:
969  return selectRelocConstant(I);
970  case Intrinsic::amdgcn_groupstaticsize:
971  return selectGroupStaticSize(I);
972  case Intrinsic::returnaddress:
973  return selectReturnAddress(I);
974  default:
975  return selectImpl(I, *CoverageInfo);
976  }
977 }
978 
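// Maps an integer predicate to the VALU compare that writes a lane mask;
// only 32- and 64-bit operand sizes are supported (returns -1 otherwise).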
979 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
980  if (Size != 32 && Size != 64)
981  return -1;
982  switch (P) {
983  default:
984  llvm_unreachable("Unknown condition code!");
985  case CmpInst::ICMP_NE:
986  return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
987  case CmpInst::ICMP_EQ:
988  return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
989  case CmpInst::ICMP_SGT:
990  return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
991  case CmpInst::ICMP_SGE:
992  return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
993  case CmpInst::ICMP_SLT:
994  return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
995  case CmpInst::ICMP_SLE:
996  return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
997  case CmpInst::ICMP_UGT:
998  return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
999  case CmpInst::ICMP_UGE:
1000  return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
1001  case CmpInst::ICMP_ULT:
1002  return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
1003  case CmpInst::ICMP_ULE:
1004  return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
1005  }
1006 }
1007 
1008 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1009  unsigned Size) const {
1010  if (Size == 64) {
1011  if (!STI.hasScalarCompareEq64())
1012  return -1;
1013 
1014  switch (P) {
1015  case CmpInst::ICMP_NE:
1016  return AMDGPU::S_CMP_LG_U64;
1017  case CmpInst::ICMP_EQ:
1018  return AMDGPU::S_CMP_EQ_U64;
1019  default:
1020  return -1;
1021  }
1022  }
1023 
1024  if (Size != 32)
1025  return -1;
1026 
1027  switch (P) {
1028  case CmpInst::ICMP_NE:
1029  return AMDGPU::S_CMP_LG_U32;
1030  case CmpInst::ICMP_EQ:
1031  return AMDGPU::S_CMP_EQ_U32;
1032  case CmpInst::ICMP_SGT:
1033  return AMDGPU::S_CMP_GT_I32;
1034  case CmpInst::ICMP_SGE:
1035  return AMDGPU::S_CMP_GE_I32;
1036  case CmpInst::ICMP_SLT:
1037  return AMDGPU::S_CMP_LT_I32;
1038  case CmpInst::ICMP_SLE:
1039  return AMDGPU::S_CMP_LE_I32;
1040  case CmpInst::ICMP_UGT:
1041  return AMDGPU::S_CMP_GT_U32;
1042  case CmpInst::ICMP_UGE:
1043  return AMDGPU::S_CMP_GE_U32;
1044  case CmpInst::ICMP_ULT:
1045  return AMDGPU::S_CMP_LT_U32;
1046  case CmpInst::ICMP_ULE:
1047  return AMDGPU::S_CMP_LE_U32;
1048  default:
1049  llvm_unreachable("Unknown condition code!");
1050  }
1051 }
1052 
1053 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1054  MachineBasicBlock *BB = I.getParent();
1055  const DebugLoc &DL = I.getDebugLoc();
1056 
1057  Register SrcReg = I.getOperand(2).getReg();
1058  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1059 
1060  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1061 
1062  Register CCReg = I.getOperand(0).getReg();
1063  if (!isVCC(CCReg, *MRI)) {
1064  int Opcode = getS_CMPOpcode(Pred, Size);
1065  if (Opcode == -1)
1066  return false;
1067  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1068  .add(I.getOperand(2))
1069  .add(I.getOperand(3));
1070  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1071  .addReg(AMDGPU::SCC);
1072  bool Ret =
1073  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1074  RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1075  I.eraseFromParent();
1076  return Ret;
1077  }
1078 
1079  int Opcode = getV_CMPOpcode(Pred, Size);
1080  if (Opcode == -1)
1081  return false;
1082 
1083  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1084  I.getOperand(0).getReg())
1085  .add(I.getOperand(2))
1086  .add(I.getOperand(3));
1087  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1088  *TRI.getBoolRC(), *MRI);
1089  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1090  I.eraseFromParent();
1091  return Ret;
1092 }
1093 
1094 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1095  Register Dst = I.getOperand(0).getReg();
1096  if (isVCC(Dst, *MRI))
1097  return false;
1098 
1099  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1100  return false;
1101 
1102  MachineBasicBlock *BB = I.getParent();
1103  const DebugLoc &DL = I.getDebugLoc();
1104  Register SrcReg = I.getOperand(2).getReg();
1105  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1106  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1107 
1108  int Opcode = getV_CMPOpcode(Pred, Size);
1109  if (Opcode == -1)
1110  return false;
1111 
1112  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1113  .add(I.getOperand(2))
1114  .add(I.getOperand(3));
1115  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1116  *MRI);
1117  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1118  I.eraseFromParent();
1119  return Ret;
1120 }
1121 
1122 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1123  MachineBasicBlock *BB = I.getParent();
1124  const DebugLoc &DL = I.getDebugLoc();
1125  Register DstReg = I.getOperand(0).getReg();
1126  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1127  const bool Is64 = Size == 64;
1128 
1129  if (Size != STI.getWavefrontSize())
1130  return false;
1131 
1132  Optional<ValueAndVReg> Arg =
1133  getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1134 
1135  if (Arg.hasValue()) {
1136  const int64_t Value = Arg.getValue().Value.getSExtValue();
1137  if (Value == 0) {
1138  unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1139  BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1140  } else if (Value == -1) { // all ones
1141  Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1142  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1143  } else
1144  return false;
1145  } else {
1146  Register SrcReg = I.getOperand(2).getReg();
1147  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1148  }
1149 
1150  I.eraseFromParent();
1151  return true;
1152 }
1153 
1154 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1155  Register DstReg = I.getOperand(0).getReg();
1156  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1157  const TargetRegisterClass *DstRC =
1158  TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1159  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1160  return false;
1161 
1162  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1163 
1164  Module *M = MF->getFunction().getParent();
1165  const MDNode *Metadata = I.getOperand(2).getMetadata();
1166  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1167  auto RelocSymbol = cast<GlobalVariable>(
1168  M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1169 
1170  MachineBasicBlock *BB = I.getParent();
1171  BuildMI(*BB, &I, I.getDebugLoc(),
1172  TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1173  .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1174 
1175  I.eraseFromParent();
1176  return true;
1177 }
1178 
1179 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1180  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1181 
1182  Register DstReg = I.getOperand(0).getReg();
1183  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1184  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1185  AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1186 
1187  MachineBasicBlock *MBB = I.getParent();
1188  const DebugLoc &DL = I.getDebugLoc();
1189 
1190  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1191 
1192  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1193  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1194  MIB.addImm(MFI->getLDSSize());
1195  } else {
1196  Module *M = MF->getFunction().getParent();
1197  const GlobalValue *GV
1198  = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1199  MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1200  }
1201 
1202  I.eraseFromParent();
1203  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1204 }
1205 
1206 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1207  MachineBasicBlock *MBB = I.getParent();
1208  MachineFunction &MF = *MBB->getParent();
1209  const DebugLoc &DL = I.getDebugLoc();
1210 
1211  MachineOperand &Dst = I.getOperand(0);
1212  Register DstReg = Dst.getReg();
1213  unsigned Depth = I.getOperand(2).getImm();
1214 
1215  const TargetRegisterClass *RC
1216  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1217  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1218  !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1219  return false;
1220 
1221  // Check for kernel and shader functions
1222  if (Depth != 0 ||
1223  MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1224  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1225  .addImm(0);
1226  I.eraseFromParent();
1227  return true;
1228  }
1229 
1230  MachineFrameInfo &MFI = MF.getFrameInfo();
1231  // There is a call to @llvm.returnaddress in this function
1232  MFI.setReturnAddressIsTaken(true);
1233 
1234  // Get the return address reg and mark it as an implicit live-in
1235  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1236  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1237  AMDGPU::SReg_64RegClass);
1238  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1239  .addReg(LiveIn);
1240  I.eraseFromParent();
1241  return true;
1242 }
1243 
1244 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1245  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1246  // SelectionDAG uses for wave32 vs wave64.
1247  MachineBasicBlock *BB = MI.getParent();
1248  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1249  .add(MI.getOperand(1));
1250 
1251  Register Reg = MI.getOperand(1).getReg();
1252  MI.eraseFromParent();
1253 
1254  if (!MRI->getRegClassOrNull(Reg))
1255  MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1256  return true;
1257 }
1258 
1259 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1260  MachineInstr &MI, Intrinsic::ID IntrID) const {
1261  MachineBasicBlock *MBB = MI.getParent();
1262  MachineFunction *MF = MBB->getParent();
1263  const DebugLoc &DL = MI.getDebugLoc();
1264 
1265  unsigned IndexOperand = MI.getOperand(7).getImm();
1266  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1267  bool WaveDone = MI.getOperand(9).getImm() != 0;
1268 
1269  if (WaveDone && !WaveRelease)
1270  report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1271 
1272  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1273  IndexOperand &= ~0x3f;
1274  unsigned CountDw = 0;
1275 
1276  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1277  CountDw = (IndexOperand >> 24) & 0xf;
1278  IndexOperand &= ~(0xf << 24);
1279 
1280  if (CountDw < 1 || CountDw > 4) {
1281  report_fatal_error(
1282  "ds_ordered_count: dword count must be between 1 and 4");
1283  }
1284  }
1285 
1286  if (IndexOperand)
1287  report_fatal_error("ds_ordered_count: bad index operand");
1288 
1289  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1290  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1291 
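 // Pack the DS_ORDERED_COUNT controls into the 16-bit offset field:
 // offset[7:0] = index * 4, offset[8] = wave_release, offset[9] = wave_done,
 // offset[11:10] = shader type, offset[12] = add (0) / swap (1), and on
 // gfx10+ offset[15:14] = dword count - 1.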
1292  unsigned Offset0 = OrderedCountIndex << 2;
1293  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1294  (Instruction << 4);
1295 
1296  if (CountDw)
1297  Offset1 |= (CountDw - 1) << 6;
1298 
1299  unsigned Offset = Offset0 | (Offset1 << 8);
1300 
1301  Register M0Val = MI.getOperand(2).getReg();
1302  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1303  .addReg(M0Val);
1304 
1305  Register DstReg = MI.getOperand(0).getReg();
1306  Register ValReg = MI.getOperand(3).getReg();
1307  auto DS =
1308  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1309  .addReg(ValReg)
1310  .addImm(Offset)
1311  .cloneMemRefs(MI);
1312 
1313  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1314  return false;
1315 
1316  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1317  MI.eraseFromParent();
1318  return Ret;
1319 }
1320 
1321 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1322  switch (IntrID) {
1323  case Intrinsic::amdgcn_ds_gws_init:
1324  return AMDGPU::DS_GWS_INIT;
1325  case Intrinsic::amdgcn_ds_gws_barrier:
1326  return AMDGPU::DS_GWS_BARRIER;
1327  case Intrinsic::amdgcn_ds_gws_sema_v:
1328  return AMDGPU::DS_GWS_SEMA_V;
1329  case Intrinsic::amdgcn_ds_gws_sema_br:
1330  return AMDGPU::DS_GWS_SEMA_BR;
1331  case Intrinsic::amdgcn_ds_gws_sema_p:
1332  return AMDGPU::DS_GWS_SEMA_P;
1333  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1334  return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1335  default:
1336  llvm_unreachable("not a gws intrinsic");
1337  }
1338 }
1339 
1340 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1341  Intrinsic::ID IID) const {
1342  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1343  !STI.hasGWSSemaReleaseAll())
1344  return false;
1345 
1346  // intrinsic ID, vsrc, offset
1347  const bool HasVSrc = MI.getNumOperands() == 3;
1348  assert(HasVSrc || MI.getNumOperands() == 2);
1349 
1350  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1351  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1352  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1353  return false;
1354 
1355  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1356  assert(OffsetDef);
1357 
1358  unsigned ImmOffset;
1359 
1360  MachineBasicBlock *MBB = MI.getParent();
1361  const DebugLoc &DL = MI.getDebugLoc();
1362 
1363  MachineInstr *Readfirstlane = nullptr;
1364 
1365  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1366  // incoming offset, in case there's an add of a constant. We'll have to put it
1367  // back later.
1368  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1369  Readfirstlane = OffsetDef;
1370  BaseOffset = OffsetDef->getOperand(1).getReg();
1371  OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1372  }
1373 
1374  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1375  // If we have a constant offset, try to use the 0 in m0 as the base.
1376  // TODO: Look into changing the default m0 initialization value. If the
1377  // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1378  // the immediate offset.
1379 
1380  ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1381  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1382  .addImm(0);
1383  } else {
1384  std::tie(BaseOffset, ImmOffset) =
1385  AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1386 
1387  if (Readfirstlane) {
1388  // We have the constant offset now, so put the readfirstlane back on the
1389  // variable component.
1390  if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1391  return false;
1392 
1393  Readfirstlane->getOperand(1).setReg(BaseOffset);
1394  BaseOffset = Readfirstlane->getOperand(0).getReg();
1395  } else {
1396  if (!RBI.constrainGenericRegister(BaseOffset,
1397  AMDGPU::SReg_32RegClass, *MRI))
1398  return false;
1399  }
1400 
1401  Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1402  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1403  .addReg(BaseOffset)
1404  .addImm(16);
1405 
1406  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1407  .addReg(M0Base);
1408  }
1409 
1410  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1411  // offset field) % 64. Some versions of the programming guide omit the m0
1412  // part, or claim it's from offset 0.
1413  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1414 
1415  if (HasVSrc) {
1416  Register VSrc = MI.getOperand(1).getReg();
1417 
1418  if (STI.needsAlignedVGPRs()) {
1419  // Add implicit aligned super-reg to force alignment on the data operand.
1420  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1421  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1422  Register NewVR =
1423  MRI->createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
1424  BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), NewVR)
1425  .addReg(VSrc, 0, MI.getOperand(1).getSubReg())
1426  .addImm(AMDGPU::sub0)
1427  .addReg(Undef)
1428  .addImm(AMDGPU::sub1);
1429  MIB.addReg(NewVR, 0, AMDGPU::sub0);
1430  MIB.addReg(NewVR, RegState::Implicit);
1431  } else {
1432  MIB.addReg(VSrc);
1433  }
1434 
1435  if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1436  return false;
1437  }
1438 
1439  MIB.addImm(ImmOffset)
1440  .cloneMemRefs(MI);
1441 
1442  MI.eraseFromParent();
1443  return true;
1444 }
1445 
1446 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1447  bool IsAppend) const {
1448  Register PtrBase = MI.getOperand(2).getReg();
1449  LLT PtrTy = MRI->getType(PtrBase);
1450  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1451 
1452  unsigned Offset;
1453  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1454 
1455  // TODO: Should this try to look through readfirstlane like GWS?
1456  if (!isDSOffsetLegal(PtrBase, Offset)) {
1457  PtrBase = MI.getOperand(2).getReg();
1458  Offset = 0;
1459  }
1460 
1461  MachineBasicBlock *MBB = MI.getParent();
1462  const DebugLoc &DL = MI.getDebugLoc();
1463  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1464 
1465  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1466  .addReg(PtrBase);
1467  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1468  return false;
1469 
1470  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1471  .addImm(Offset)
1472  .addImm(IsGDS ? -1 : 0)
1473  .cloneMemRefs(MI);
1474  MI.eraseFromParent();
1475  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1476 }
1477 
1478 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
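 // When optimizing and the whole workgroup fits in a single wave, the barrier
 // is unnecessary at run time; lower it to WAVE_BARRIER, which only constrains
 // the scheduler.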
1479  if (TM.getOptLevel() > CodeGenOpt::None) {
1480  unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1481  if (WGSize <= STI.getWavefrontSize()) {
1482  MachineBasicBlock *MBB = MI.getParent();
1483  const DebugLoc &DL = MI.getDebugLoc();
1484  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1485  MI.eraseFromParent();
1486  return true;
1487  }
1488  }
1489  return selectImpl(MI, *CoverageInfo);
1490 }
1491 
1492 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1493  bool &IsTexFail) {
1494  if (TexFailCtrl)
1495  IsTexFail = true;
1496 
1497  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1498  TexFailCtrl &= ~(uint64_t)0x1;
1499  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1500  TexFailCtrl &= ~(uint64_t)0x2;
1501 
1502  return TexFailCtrl == 0;
1503 }
1504 
1505 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1506  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1507  MachineBasicBlock *MBB = MI.getParent();
1508  const DebugLoc &DL = MI.getDebugLoc();
1509 
1510  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1511  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1512 
1513  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1514  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1515  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1516  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1517  AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1518  unsigned IntrOpcode = Intr->BaseOpcode;
1519  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1520 
1521  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1522 
1523  Register VDataIn, VDataOut;
1524  LLT VDataTy;
1525  int NumVDataDwords = -1;
1526  bool IsD16 = false;
1527 
1528  bool Unorm;
1529  if (!BaseOpcode->Sampler)
1530  Unorm = true;
1531  else
1532  Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1533 
1534  bool TFE;
1535  bool LWE;
1536  bool IsTexFail = false;
1537  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1538  TFE, LWE, IsTexFail))
1539  return false;
1540 
1541  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1542  const bool IsA16 = (Flags & 1) != 0;
1543  const bool IsG16 = (Flags & 2) != 0;
1544 
1545  // A16 implies 16 bit gradients if subtarget doesn't support G16
1546  if (IsA16 && !STI.hasG16() && !IsG16)
1547  return false;
1548 
1549  unsigned DMask = 0;
1550  unsigned DMaskLanes = 0;
1551 
1552  if (BaseOpcode->Atomic) {
1553  VDataOut = MI.getOperand(0).getReg();
1554  VDataIn = MI.getOperand(2).getReg();
1555  LLT Ty = MRI->getType(VDataIn);
1556 
1557  // Be careful to allow atomic swap on 16-bit element vectors.
1558  const bool Is64Bit = BaseOpcode->AtomicX2 ?
1559  Ty.getSizeInBits() == 128 :
1560  Ty.getSizeInBits() == 64;
1561 
1562  if (BaseOpcode->AtomicX2) {
1563  assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1564 
1565  DMask = Is64Bit ? 0xf : 0x3;
1566  NumVDataDwords = Is64Bit ? 4 : 2;
1567  } else {
1568  DMask = Is64Bit ? 0x3 : 0x1;
1569  NumVDataDwords = Is64Bit ? 2 : 1;
1570  }
1571  } else {
1572  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1573  DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1574 
1575  // One memoperand is mandatory, except for getresinfo.
1576  // FIXME: Check this in verifier.
1577  if (!MI.memoperands_empty()) {
1578  const MachineMemOperand *MMO = *MI.memoperands_begin();
1579 
1580  // Infer d16 from the memory size, as the register type will be mangled by
1581  // unpacked subtargets, or by TFE.
1582  IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1583  }
1584 
1585  if (BaseOpcode->Store) {
1586  VDataIn = MI.getOperand(1).getReg();
1587  VDataTy = MRI->getType(VDataIn);
1588  NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1589  } else {
1590  VDataOut = MI.getOperand(0).getReg();
1591  VDataTy = MRI->getType(VDataOut);
1592  NumVDataDwords = DMaskLanes;
1593 
1594  if (IsD16 && !STI.hasUnpackedD16VMem())
1595  NumVDataDwords = (DMaskLanes + 1) / 2;
1596  }
1597  }
1598 
1599  // Optimize _L to _LZ when _L is zero
1600  if (LZMappingInfo) {
1601  // The legalizer replaced the register with an immediate 0 if we need to
1602  // change the opcode.
1603  const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1604  if (Lod.isImm()) {
1605  assert(Lod.getImm() == 0);
1606  IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1607  }
1608  }
1609 
1610  // Optimize _mip away, when 'lod' is zero
1611  if (MIPMappingInfo) {
1612  const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1613  if (Lod.isImm()) {
1614  assert(Lod.getImm() == 0);
1615  IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1616  }
1617  }
1618 
1619  // Set G16 opcode
1620  if (IsG16 && !IsA16) {
1621  const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1622  AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1623  assert(G16MappingInfo);
1624  IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1625  }
1626 
1627  // TODO: Check this in verifier.
1628  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1629 
1630  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1631  if (BaseOpcode->Atomic)
1632  CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1633  if (CPol & ~AMDGPU::CPol::ALL)
1634  return false;
1635 
1636  int NumVAddrRegs = 0;
1637  int NumVAddrDwords = 0;
1638  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1639  // Skip the $noregs and 0s inserted during legalization.
1640  MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1641  if (!AddrOp.isReg())
1642  continue; // XXX - Break?
1643 
1644  Register Addr = AddrOp.getReg();
1645  if (!Addr)
1646  break;
1647 
1648  ++NumVAddrRegs;
1649  NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1650  }
1651 
1652  // The legalizer preprocessed the intrinsic arguments. If we aren't using
1653  // NSA, these should have been packed into a single value in the first
1654  // address register.
1655  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
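  // E.g. three separate 32-bit address registers give NumVAddrRegs ==
  // NumVAddrDwords == 3 and select the NSA form, while a single packed 96-bit
  // register gives 1 register / 3 dwords and the non-NSA form.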
1656  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1657  LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1658  return false;
1659  }
1660 
1661  if (IsTexFail)
1662  ++NumVDataDwords;
1663 
1664  int Opcode = -1;
1665  if (IsGFX10Plus) {
1666  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1667  UseNSA ? AMDGPU::MIMGEncGfx10NSA
1668  : AMDGPU::MIMGEncGfx10Default,
1669  NumVDataDwords, NumVAddrDwords);
1670  } else {
1672  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1673  NumVDataDwords, NumVAddrDwords);
1674  if (Opcode == -1)
1675  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1676  NumVDataDwords, NumVAddrDwords);
1677  }
1678  assert(Opcode != -1);
1679 
1680  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1681  .cloneMemRefs(MI);
1682 
1683  if (VDataOut) {
1684  if (BaseOpcode->AtomicX2) {
1685  const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1686 
1687  Register TmpReg = MRI->createVirtualRegister(
1688  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1689  unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1690 
1691  MIB.addDef(TmpReg);
1692  if (!MRI->use_empty(VDataOut)) {
1693  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1694  .addReg(TmpReg, RegState::Kill, SubReg);
1695  }
1696 
1697  } else {
1698  MIB.addDef(VDataOut); // vdata output
1699  }
1700  }
1701 
1702  if (VDataIn)
1703  MIB.addReg(VDataIn); // vdata input
1704 
1705  for (int I = 0; I != NumVAddrRegs; ++I) {
1706  MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1707  if (SrcOp.isReg()) {
1708  assert(SrcOp.getReg() != 0);
1709  MIB.addReg(SrcOp.getReg());
1710  }
1711  }
1712 
1713  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1714  if (BaseOpcode->Sampler)
1715  MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1716 
1717  MIB.addImm(DMask); // dmask
1718 
1719  if (IsGFX10Plus)
1720  MIB.addImm(DimInfo->Encoding);
1721  MIB.addImm(Unorm);
1722 
1723  MIB.addImm(CPol);
1724  MIB.addImm(IsA16 && // a16 or r128
1725  STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1726  if (IsGFX10Plus)
1727  MIB.addImm(IsA16 ? -1 : 0);
1728 
1729  MIB.addImm(TFE); // tfe
1730  MIB.addImm(LWE); // lwe
1731  if (!IsGFX10Plus)
1732  MIB.addImm(DimInfo->DA ? -1 : 0);
1733  if (BaseOpcode->HasD16)
1734  MIB.addImm(IsD16 ? -1 : 0);
1735 
1736  if (IsTexFail) {
1737  // An image load instruction with TFE/LWE only conditionally writes to its
1738  // result registers. Initialize them to zero so that we always get well
1739  // defined result values.
1740  assert(VDataOut && !VDataIn);
1741  Register Tied = MRI->cloneVirtualRegister(VDataOut);
1742  Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1743  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1744  .addImm(0);
1745  auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1746  if (STI.usePRTStrictNull()) {
1747  // With enable-prt-strict-null enabled, initialize all result registers to
1748  // zero.
1749  auto RegSeq =
1750  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1751  for (auto Sub : Parts)
1752  RegSeq.addReg(Zero).addImm(Sub);
1753  } else {
1754  // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1755  // result register.
1756  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1757  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1758  auto RegSeq =
1759  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1760  for (auto Sub : Parts.drop_back(1))
1761  RegSeq.addReg(Undef).addImm(Sub);
1762  RegSeq.addReg(Zero).addImm(Parts.back());
1763  }
1764  MIB.addReg(Tied, RegState::Implicit);
1765  MIB->tieOperands(0, MIB->getNumOperands() - 1);
1766  }
1767 
1768  MI.eraseFromParent();
1769  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1770 }
1771 
1772 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1773  MachineInstr &I) const {
1774  unsigned IntrinsicID = I.getIntrinsicID();
1775  switch (IntrinsicID) {
1776  case Intrinsic::amdgcn_end_cf:
1777  return selectEndCfIntrinsic(I);
1778  case Intrinsic::amdgcn_ds_ordered_add:
1779  case Intrinsic::amdgcn_ds_ordered_swap:
1780  return selectDSOrderedIntrinsic(I, IntrinsicID);
1781  case Intrinsic::amdgcn_ds_gws_init:
1782  case Intrinsic::amdgcn_ds_gws_barrier:
1783  case Intrinsic::amdgcn_ds_gws_sema_v:
1784  case Intrinsic::amdgcn_ds_gws_sema_br:
1785  case Intrinsic::amdgcn_ds_gws_sema_p:
1786  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1787  return selectDSGWSIntrinsic(I, IntrinsicID);
1788  case Intrinsic::amdgcn_ds_append:
1789  return selectDSAppendConsume(I, true);
1790  case Intrinsic::amdgcn_ds_consume:
1791  return selectDSAppendConsume(I, false);
1792  case Intrinsic::amdgcn_s_barrier:
1793  return selectSBarrier(I);
1794  case Intrinsic::amdgcn_global_atomic_fadd:
1795  return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
1796  default: {
1797  return selectImpl(I, *CoverageInfo);
1798  }
1799  }
1800 }
1801 
1802 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1803  if (selectImpl(I, *CoverageInfo))
1804  return true;
1805 
1806  MachineBasicBlock *BB = I.getParent();
1807  const DebugLoc &DL = I.getDebugLoc();
1808 
1809  Register DstReg = I.getOperand(0).getReg();
1810  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1811  assert(Size <= 32 || Size == 64);
1812  const MachineOperand &CCOp = I.getOperand(1);
1813  Register CCReg = CCOp.getReg();
1814  if (!isVCC(CCReg, *MRI)) {
1815  unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1816  AMDGPU::S_CSELECT_B32;
1817  MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1818  .addReg(CCReg);
1819 
1820  // The generic constrainSelectedInstRegOperands doesn't work for the scc
1821  // register bank, because it does not cover the register class we use to
1822  // represent it. So we need to set the register class manually here.
1823  if (!MRI->getRegClassOrNull(CCReg))
1824  MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1825  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1826  .add(I.getOperand(2))
1827  .add(I.getOperand(3));
1828 
1829  bool Ret = false;
1830  Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1831  Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1832  I.eraseFromParent();
1833  return Ret;
1834  }
1835 
1836  // Wide VGPR select should have been split in RegBankSelect.
1837  if (Size > 32)
1838  return false;
1839 
1840  MachineInstr *Select =
1841  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1842  .addImm(0)
1843  .add(I.getOperand(3))
1844  .addImm(0)
1845  .add(I.getOperand(2))
1846  .add(I.getOperand(1));
1847 
1848  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1849  I.eraseFromParent();
1850  return Ret;
1851 }
1852 
1853 static int sizeToSubRegIndex(unsigned Size) {
1854  switch (Size) {
1855  case 32:
1856  return AMDGPU::sub0;
1857  case 64:
1858  return AMDGPU::sub0_sub1;
1859  case 96:
1860  return AMDGPU::sub0_sub1_sub2;
1861  case 128:
1862  return AMDGPU::sub0_sub1_sub2_sub3;
1863  case 256:
1864  return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1865  default:
1866  if (Size < 32)
1867  return AMDGPU::sub0;
1868  if (Size > 256)
1869  return -1;
1870  return sizeToSubRegIndex(PowerOf2Ceil(Size));
1871  }
1872 }
1873 
1874 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1875  Register DstReg = I.getOperand(0).getReg();
1876  Register SrcReg = I.getOperand(1).getReg();
1877  const LLT DstTy = MRI->getType(DstReg);
1878  const LLT SrcTy = MRI->getType(SrcReg);
1879  const LLT S1 = LLT::scalar(1);
1880 
1881  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1882  const RegisterBank *DstRB;
1883  if (DstTy == S1) {
1884  // This is a special case. We don't treat s1 for legalization artifacts as
1885  // vcc booleans.
1886  DstRB = SrcRB;
1887  } else {
1888  DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1889  if (SrcRB != DstRB)
1890  return false;
1891  }
1892 
1893  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1894 
1895  unsigned DstSize = DstTy.getSizeInBits();
1896  unsigned SrcSize = SrcTy.getSizeInBits();
1897 
1898  const TargetRegisterClass *SrcRC
1899  = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1900  const TargetRegisterClass *DstRC
1901  = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1902  if (!SrcRC || !DstRC)
1903  return false;
1904 
1905  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1906  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1907  LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1908  return false;
1909  }
1910 
1911  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
1912  MachineBasicBlock *MBB = I.getParent();
1913  const DebugLoc &DL = I.getDebugLoc();
1914 
1915  Register LoReg = MRI->createVirtualRegister(DstRC);
1916  Register HiReg = MRI->createVirtualRegister(DstRC);
1917  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1918  .addReg(SrcReg, 0, AMDGPU::sub0);
1919  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1920  .addReg(SrcReg, 0, AMDGPU::sub1);
1921 
1922  if (IsVALU && STI.hasSDWA()) {
1923  // Write the low 16-bits of the high element into the high 16-bits of the
1924  // low element.
1925  MachineInstr *MovSDWA =
1926  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1927  .addImm(0) // $src0_modifiers
1928  .addReg(HiReg) // $src0
1929  .addImm(0) // $clamp
1930  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1931  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1932  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1933  .addReg(LoReg, RegState::Implicit);
1934  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
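  // Tying the destination to the implicit LoReg use together with
  // UNUSED_PRESERVE keeps the low 16 bits from LoReg; only WORD_1 is written
  // by the SDWA move.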
1935  } else {
1936  Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1937  Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1938  Register ImmReg = MRI->createVirtualRegister(DstRC);
1939  if (IsVALU) {
1940  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1941  .addImm(16)
1942  .addReg(HiReg);
1943  } else {
1944  BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1945  .addReg(HiReg)
1946  .addImm(16);
1947  }
1948 
1949  unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1950  unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1951  unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1952 
1953  BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1954  .addImm(0xffff);
1955  BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1956  .addReg(LoReg)
1957  .addReg(ImmReg);
1958  BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1959  .addReg(TmpReg0)
1960  .addReg(TmpReg1);
1961  }
1962 
1963  I.eraseFromParent();
1964  return true;
1965  }
1966 
1967  if (!DstTy.isScalar())
1968  return false;
1969 
1970  if (SrcSize > 32) {
1971  int SubRegIdx = sizeToSubRegIndex(DstSize);
1972  if (SubRegIdx == -1)
1973  return false;
1974 
1975  // Deal with weird cases where the class only partially supports the subreg
1976  // index.
1977  const TargetRegisterClass *SrcWithSubRC
1978  = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1979  if (!SrcWithSubRC)
1980  return false;
1981 
1982  if (SrcWithSubRC != SrcRC) {
1983  if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1984  return false;
1985  }
1986 
1987  I.getOperand(1).setSubReg(SubRegIdx);
1988  }
1989 
1990  I.setDesc(TII.get(TargetOpcode::COPY));
1991  return true;
1992 }
1993 
1994 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
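/// E.g. Size == 4 gives Mask == 0xf (inline immediate 15), while Size == 16
/// gives Mask == 0xffff, which is not an inline immediate, so a BFE is used.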
1995 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1996  Mask = maskTrailingOnes<unsigned>(Size);
1997  int SignedMask = static_cast<int>(Mask);
1998  return SignedMask >= -16 && SignedMask <= 64;
1999 }
2000 
2001 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2002 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2003  Register Reg, const MachineRegisterInfo &MRI,
2004  const TargetRegisterInfo &TRI) const {
2005  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2006  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2007  return RB;
2008 
2009  // Ignore the type, since we don't use vcc in artifacts.
2010  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2011  return &RBI.getRegBankFromRegClass(*RC, LLT());
2012  return nullptr;
2013 }
2014 
2015 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2016  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2017  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2018  const DebugLoc &DL = I.getDebugLoc();
2019  MachineBasicBlock &MBB = *I.getParent();
2020  const Register DstReg = I.getOperand(0).getReg();
2021  const Register SrcReg = I.getOperand(1).getReg();
2022 
2023  const LLT DstTy = MRI->getType(DstReg);
2024  const LLT SrcTy = MRI->getType(SrcReg);
2025  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2026  I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2027  const unsigned DstSize = DstTy.getSizeInBits();
2028  if (!DstTy.isScalar())
2029  return false;
2030 
2031  // Artifact casts should never use vcc.
2032  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2033 
2034  // FIXME: This should probably be illegal and split earlier.
2035  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2036  if (DstSize <= 32)
2037  return selectCOPY(I);
2038 
2039  const TargetRegisterClass *SrcRC =
2040  TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
2041  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2042  const TargetRegisterClass *DstRC =
2043  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
2044 
2045  Register UndefReg = MRI->createVirtualRegister(SrcRC);
2046  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2047  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2048  .addReg(SrcReg)
2049  .addImm(AMDGPU::sub0)
2050  .addReg(UndefReg)
2051  .addImm(AMDGPU::sub1);
2052  I.eraseFromParent();
2053 
2054  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2055  RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2056  }
2057 
2058  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2059  // 64-bit should have been split up in RegBankSelect
2060 
2061  // Try to use an and with a mask if it will save code size.
2062  unsigned Mask;
2063  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2064  MachineInstr *ExtI =
2065  BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2066  .addImm(Mask)
2067  .addReg(SrcReg);
2068  I.eraseFromParent();
2069  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2070  }
2071 
2072  const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2073  MachineInstr *ExtI =
2074  BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2075  .addReg(SrcReg)
2076  .addImm(0) // Offset
2077  .addImm(SrcSize); // Width
2078  I.eraseFromParent();
2079  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2080  }
2081 
2082  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2083  const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2084  AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2085  if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2086  return false;
2087 
2088  if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2089  const unsigned SextOpc = SrcSize == 8 ?
2090  AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2091  BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2092  .addReg(SrcReg);
2093  I.eraseFromParent();
2094  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2095  }
2096 
2097  const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2098  const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2099 
2100  // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
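  // e.g. a 16-bit extend uses offset 0, width 16, i.e. the immediate 16 << 16.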
2101  if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2102  // We need a 64-bit register source, but the high bits don't matter.
2103  Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2104  Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2105  unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2106 
2107  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2108  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2109  .addReg(SrcReg, 0, SubReg)
2110  .addImm(AMDGPU::sub0)
2111  .addReg(UndefReg)
2112  .addImm(AMDGPU::sub1);
2113 
2114  BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2115  .addReg(ExtReg)
2116  .addImm(SrcSize << 16);
2117 
2118  I.eraseFromParent();
2119  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2120  }
2121 
2122  unsigned Mask;
2123  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2124  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2125  .addReg(SrcReg)
2126  .addImm(Mask);
2127  } else {
2128  BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2129  .addReg(SrcReg)
2130  .addImm(SrcSize << 16);
2131  }
2132 
2133  I.eraseFromParent();
2134  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2135  }
2136 
2137  return false;
2138 }
2139 
2140 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2141  MachineBasicBlock *BB = I.getParent();
2142  MachineOperand &ImmOp = I.getOperand(1);
2143  Register DstReg = I.getOperand(0).getReg();
2144  unsigned Size = MRI->getType(DstReg).getSizeInBits();
2145 
2146  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2147  if (ImmOp.isFPImm()) {
2148  const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2149  ImmOp.ChangeToImmediate(Imm.getZExtValue());
2150  } else if (ImmOp.isCImm()) {
2151  ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2152  } else {
2153  llvm_unreachable("Not supported by g_constants");
2154  }
2155 
2156  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2157  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2158 
2159  unsigned Opcode;
2160  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2161  Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2162  } else {
2163  Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2164 
2165  // We should never produce s1 values on banks other than VCC. If the user of
2166  // this already constrained the register, we may incorrectly think it's VCC
2167  // if it wasn't originally.
2168  if (Size == 1)
2169  return false;
2170  }
2171 
2172  if (Size != 64) {
2173  I.setDesc(TII.get(Opcode));
2174  I.addImplicitDefUseOperands(*MF);
2175  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2176  }
2177 
2178  const DebugLoc &DL = I.getDebugLoc();
2179 
2180  APInt Imm(Size, I.getOperand(1).getImm());
2181 
2182  MachineInstr *ResInst;
2183  if (IsSgpr && TII.isInlineConstant(Imm)) {
2184  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2185  .addImm(I.getOperand(1).getImm());
2186  } else {
2187  const TargetRegisterClass *RC = IsSgpr ?
2188  &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2189  Register LoReg = MRI->createVirtualRegister(RC);
2190  Register HiReg = MRI->createVirtualRegister(RC);
2191 
2192  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2193  .addImm(Imm.trunc(32).getZExtValue());
2194 
2195  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2196  .addImm(Imm.ashr(32).getZExtValue());
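  // E.g. the 64-bit constant 0x1'8000'0000 splits into a low move of
  // 0x80000000 and a high move of 0x1, recombined with the REG_SEQUENCE below.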
2197 
2198  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2199  .addReg(LoReg)
2200  .addImm(AMDGPU::sub0)
2201  .addReg(HiReg)
2202  .addImm(AMDGPU::sub1);
2203  }
2204 
2205  // We can't call constrainSelectedInstRegOperands here, because it doesn't
2206  // work for target independent opcodes
2207  I.eraseFromParent();
2208  const TargetRegisterClass *DstRC =
2209  TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2210  if (!DstRC)
2211  return true;
2212  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2213 }
2214 
2215 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2216  // Only manually handle the f64 SGPR case.
2217  //
2218  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2219  // the bit ops theoretically have a second result due to the implicit def of
2220  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2221  // that is easy by disabling the check. The result works, but uses a
2222  // nonsensical sreg32orlds_and_sreg_1 regclass.
2223  //
2224  // The DAG emitter is more problematic, and incorrectly adds both results of
2225  // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2226 
2227  Register Dst = MI.getOperand(0).getReg();
2228  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2229  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2230  MRI->getType(Dst) != LLT::scalar(64))
2231  return false;
2232 
2233  Register Src = MI.getOperand(1).getReg();
2234  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2235  if (Fabs)
2236  Src = Fabs->getOperand(1).getReg();
2237 
2238  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2239  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2240  return false;
2241 
2242  MachineBasicBlock *BB = MI.getParent();
2243  const DebugLoc &DL = MI.getDebugLoc();
2244  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2245  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2246  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2247  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2248 
2249  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2250  .addReg(Src, 0, AMDGPU::sub0);
2251  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2252  .addReg(Src, 0, AMDGPU::sub1);
2253  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2254  .addImm(0x80000000);
2255 
2256  // Set or toggle sign bit.
2257  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2258  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2259  .addReg(HiReg)
2260  .addReg(ConstReg);
2261  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2262  .addReg(LoReg)
2263  .addImm(AMDGPU::sub0)
2264  .addReg(OpReg)
2265  .addImm(AMDGPU::sub1);
2266  MI.eraseFromParent();
2267  return true;
2268 }
2269 
2270 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2271 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2272  Register Dst = MI.getOperand(0).getReg();
2273  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2274  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2275  MRI->getType(Dst) != LLT::scalar(64))
2276  return false;
2277 
2278  Register Src = MI.getOperand(1).getReg();
2279  MachineBasicBlock *BB = MI.getParent();
2280  const DebugLoc &DL = MI.getDebugLoc();
2281  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2282  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2283  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2284  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2285 
2286  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2287  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2288  return false;
2289 
2290  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2291  .addReg(Src, 0, AMDGPU::sub0);
2292  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2293  .addReg(Src, 0, AMDGPU::sub1);
2294  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2295  .addImm(0x7fffffff);
2296 
2297  // Clear sign bit.
2298  // TODO: Should this use S_BITSET0_*?
2299  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2300  .addReg(HiReg)
2301  .addReg(ConstReg);
2302  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2303  .addReg(LoReg)
2304  .addImm(AMDGPU::sub0)
2305  .addReg(OpReg)
2306  .addImm(AMDGPU::sub1);
2307 
2308  MI.eraseFromParent();
2309  return true;
2310 }
2311 
2312 static bool isConstant(const MachineInstr &MI) {
2313  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2314 }
2315 
2316 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2317  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2318 
2319  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2320 
2321  assert(PtrMI);
2322 
2323  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2324  return;
2325 
2326  GEPInfo GEPInfo(*PtrMI);
2327 
2328  for (unsigned i = 1; i != 3; ++i) {
2329  const MachineOperand &GEPOp = PtrMI->getOperand(i);
2330  const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2331  assert(OpDef);
2332  if (i == 2 && isConstant(*OpDef)) {
2333  // TODO: Could handle constant base + variable offset, but a combine
2334  // probably should have commuted it.
2335  assert(GEPInfo.Imm == 0);
2336  GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2337  continue;
2338  }
2339  const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2340  if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2341  GEPInfo.SgprParts.push_back(GEPOp.getReg());
2342  else
2343  GEPInfo.VgprParts.push_back(GEPOp.getReg());
2344  }
2345 
2346  AddrInfo.push_back(GEPInfo);
2347  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2348 }
2349 
2350 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2351  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2352 }
2353 
2354 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2355  if (!MI.hasOneMemOperand())
2356  return false;
2357 
2358  const MachineMemOperand *MMO = *MI.memoperands_begin();
2359  const Value *Ptr = MMO->getValue();
2360 
2361  // UndefValue means this is a load of a kernel input. These are uniform.
2362  // Sometimes LDS instructions have constant pointers.
2363  // If Ptr is null, then that means this mem operand contains a
2364  // PseudoSourceValue like GOT.
2365  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2366  isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2367  return true;
2368 
2369  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2370  return true;
2371 
2372  const Instruction *I = dyn_cast<Instruction>(Ptr);
2373  return I && I->getMetadata("amdgpu.uniform");
2374 }
2375 
2376 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2377  for (const GEPInfo &GEPInfo : AddrInfo) {
2378  if (!GEPInfo.VgprParts.empty())
2379  return true;
2380  }
2381  return false;
2382 }
2383 
2384 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2385  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2386  unsigned AS = PtrTy.getAddressSpace();
2387  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2388  STI.ldsRequiresM0Init()) {
2389  MachineBasicBlock *BB = I.getParent();
2390 
2391  // If DS instructions require M0 initialization, insert it before selecting.
2392  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2393  .addImm(-1);
2394  }
2395 }
2396 
2397 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2398  MachineInstr &I) const {
2399  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
2400  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2401  unsigned AS = PtrTy.getAddressSpace();
2402  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2403  return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
2404  }
2405 
2406  initM0(I);
2407  return selectImpl(I, *CoverageInfo);
2408 }
2409 
2410 // TODO: No rtn optimization.
2411 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2412  MachineInstr &MI) const {
2413  Register PtrReg = MI.getOperand(1).getReg();
2414  const LLT PtrTy = MRI->getType(PtrReg);
2415  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2416  STI.useFlatForGlobal())
2417  return selectImpl(MI, *CoverageInfo);
2418 
2419  Register DstReg = MI.getOperand(0).getReg();
2420  const LLT Ty = MRI->getType(DstReg);
2421  const bool Is64 = Ty.getSizeInBits() == 64;
2422  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2423  Register TmpReg = MRI->createVirtualRegister(
2424  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2425 
2426  const DebugLoc &DL = MI.getDebugLoc();
2427  MachineBasicBlock *BB = MI.getParent();
2428 
2429  Register VAddr, RSrcReg, SOffset;
2430  int64_t Offset = 0;
2431 
2432  unsigned Opcode;
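  // Try the rsrc + immediate-offset MUBUF form first; fall back to the ADDR64
  // form, and finally to the generic selector for anything else.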
2433  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2434  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2435  AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2436  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2437  RSrcReg, SOffset, Offset)) {
2438  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2439  AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2440  } else
2441  return selectImpl(MI, *CoverageInfo);
2442 
2443  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2444  .addReg(MI.getOperand(2).getReg());
2445 
2446  if (VAddr)
2447  MIB.addReg(VAddr);
2448 
2449  MIB.addReg(RSrcReg);
2450  if (SOffset)
2451  MIB.addReg(SOffset);
2452  else
2453  MIB.addImm(0);
2454 
2455  MIB.addImm(Offset);
2456  MIB.addImm(AMDGPU::CPol::GLC);
2457  MIB.cloneMemRefs(MI);
2458 
2459  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2460  .addReg(TmpReg, RegState::Kill, SubReg);
2461 
2462  MI.eraseFromParent();
2463 
2464  MRI->setRegClass(
2465  DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2466  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2467 }
2468 
2469 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2470  if (Reg.isPhysical())
2471  return false;
2472 
2473  MachineInstr &MI = *MRI.getVRegDef(Reg);
2474  const unsigned Opcode = MI.getOpcode();
2475 
2476  if (Opcode == AMDGPU::COPY)
2477  return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2478 
2479  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2480  Opcode == AMDGPU::G_XOR)
2481  return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2482  isVCmpResult(MI.getOperand(2).getReg(), MRI);
2483 
2484  if (Opcode == TargetOpcode::G_INTRINSIC)
2485  return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
2486 
2487  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2488 }
2489 
2490 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2491  MachineBasicBlock *BB = I.getParent();
2492  MachineOperand &CondOp = I.getOperand(0);
2493  Register CondReg = CondOp.getReg();
2494  const DebugLoc &DL = I.getDebugLoc();
2495 
2496  unsigned BrOpcode;
2497  Register CondPhysReg;
2498  const TargetRegisterClass *ConstrainRC;
2499 
2500  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2501  // whether the branch is uniform when selecting the instruction. In
2502  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2503  // RegBankSelect knows what it's doing if the branch condition is scc, even
2504  // though it currently does not.
2505  if (!isVCC(CondReg, *MRI)) {
2506  if (MRI->getType(CondReg) != LLT::scalar(32))
2507  return false;
2508 
2509  CondPhysReg = AMDGPU::SCC;
2510  BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2511  ConstrainRC = &AMDGPU::SReg_32RegClass;
2512  } else {
2513  // FIXME: Should scc->vcc copies be ANDed with exec?
2514 
2515  // Unless the value of CondReg is a result of a V_CMP* instruction then we
2516  // need to insert an and with exec.
2517  if (!isVCmpResult(CondReg, *MRI)) {
2518  const bool Is64 = STI.isWave64();
2519  const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2520  const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2521 
2522  Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2523  BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2524  .addReg(CondReg)
2525  .addReg(Exec);
2526  CondReg = TmpReg;
2527  }
2528 
2529  CondPhysReg = TRI.getVCC();
2530  BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2531  ConstrainRC = TRI.getBoolRC();
2532  }
2533 
2534  if (!MRI->getRegClassOrNull(CondReg))
2535  MRI->setRegClass(CondReg, ConstrainRC);
2536 
2537  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2538  .addReg(CondReg);
2539  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2540  .addMBB(I.getOperand(1).getMBB());
2541 
2542  I.eraseFromParent();
2543  return true;
2544 }
2545 
2546 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2547  MachineInstr &I) const {
2548  Register DstReg = I.getOperand(0).getReg();
2549  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2550  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2551  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2552  if (IsVGPR)
2553  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2554 
2555  return RBI.constrainGenericRegister(
2556  DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2557 }
2558 
2559 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2560  Register DstReg = I.getOperand(0).getReg();
2561  Register SrcReg = I.getOperand(1).getReg();
2562  Register MaskReg = I.getOperand(2).getReg();
2563  LLT Ty = MRI->getType(DstReg);
2564  LLT MaskTy = MRI->getType(MaskReg);
2565 
2566  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2567  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2568  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2569  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2570  if (DstRB != SrcRB) // Should only happen for hand written MIR.
2571  return false;
2572 
2573  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2574  const TargetRegisterClass &RegRC
2575  = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2576 
2577  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2578  *MRI);
2579  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2580  *MRI);
2581  const TargetRegisterClass *MaskRC =
2582  TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2583 
2584  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2585  !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2586  !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2587  return false;
2588 
2589  MachineBasicBlock *BB = I.getParent();
2590  const DebugLoc &DL = I.getDebugLoc();
2591  if (Ty.getSizeInBits() == 32) {
2592  assert(MaskTy.getSizeInBits() == 32 &&
2593  "ptrmask should have been narrowed during legalize");
2594 
2595  BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2596  .addReg(SrcReg)
2597  .addReg(MaskReg);
2598  I.eraseFromParent();
2599  return true;
2600  }
2601 
2602  Register HiReg = MRI->createVirtualRegister(&RegRC);
2603  Register LoReg = MRI->createVirtualRegister(&RegRC);
2604 
2605  // Extract the subregisters from the source pointer.
2606  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2607  .addReg(SrcReg, 0, AMDGPU::sub0);
2608  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2609  .addReg(SrcReg, 0, AMDGPU::sub1);
2610 
2611  Register MaskedLo, MaskedHi;
2612 
2613  // Try to avoid emitting a bit operation when we only need to touch half of
2614  // the 64-bit pointer.
2615  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2616 
2617  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2618  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
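  // E.g. aligning a pointer with the mask 0xffff'ffff'ffff'f000 only needs an
  // AND on the low 32 bits; the high half is known all-ones and just copied.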
2619  if ((MaskOnes & MaskLo32) == MaskLo32) {
2620  // If all the bits in the low half are 1, we only need a copy for it.
2621  MaskedLo = LoReg;
2622  } else {
2623  // Extract the mask subregister and apply the and.
2624  Register MaskLo = MRI->createVirtualRegister(&RegRC);
2625  MaskedLo = MRI->createVirtualRegister(&RegRC);
2626 
2627  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2628  .addReg(MaskReg, 0, AMDGPU::sub0);
2629  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2630  .addReg(LoReg)
2631  .addReg(MaskLo);
2632  }
2633 
2634  if ((MaskOnes & MaskHi32) == MaskHi32) {
2635  // If all the bits in the high half are 1, we only need a copy for it.
2636  MaskedHi = HiReg;
2637  } else {
2638  Register MaskHi = MRI->createVirtualRegister(&RegRC);
2639  MaskedHi = MRI->createVirtualRegister(&RegRC);
2640 
2641  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2642  .addReg(MaskReg, 0, AMDGPU::sub1);
2643  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2644  .addReg(HiReg)
2645  .addReg(MaskHi);
2646  }
2647 
2648  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2649  .addReg(MaskedLo)
2650  .addImm(AMDGPU::sub0)
2651  .addReg(MaskedHi)
2652  .addImm(AMDGPU::sub1);
2653  I.eraseFromParent();
2654  return true;
2655 }
2656 
2657 /// Return the register to use for the index value, and the subregister to use
2658 /// for the indirectly accessed register.
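/// E.g. with 32-bit elements (EltSize == 4) and an index of base + 2, this
/// returns {base, AMDGPU::sub2}, folding the constant part of the index into
/// the subregister.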
2659 static std::pair<Register, unsigned>
2660 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2661  const SIRegisterInfo &TRI,
2662  const TargetRegisterClass *SuperRC,
2663  Register IdxReg,
2664  unsigned EltSize) {
2665  Register IdxBaseReg;
2666  int Offset;
2667 
2668  std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2669  if (IdxBaseReg == AMDGPU::NoRegister) {
2670  // This will happen if the index is a known constant. This should ordinarily
2671  // be legalized out, but handle it as a register just in case.
2672  assert(Offset == 0);
2673  IdxBaseReg = IdxReg;
2674  }
2675 
2676  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2677 
2678  // Skip out of bounds offsets, or else we would end up using an undefined
2679  // register.
2680  if (static_cast<unsigned>(Offset) >= SubRegs.size())
2681  return std::make_pair(IdxReg, SubRegs[0]);
2682  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2683 }
2684 
2685 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2686  MachineInstr &MI) const {
2687  Register DstReg = MI.getOperand(0).getReg();
2688  Register SrcReg = MI.getOperand(1).getReg();
2689  Register IdxReg = MI.getOperand(2).getReg();
2690 
2691  LLT DstTy = MRI->getType(DstReg);
2692  LLT SrcTy = MRI->getType(SrcReg);
2693 
2694  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2695  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2696  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2697 
2698  // The index must be scalar. If it wasn't, RegBankSelect should have moved
2699  // this into a waterfall loop.
2700  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2701  return false;
2702 
2703  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2704  *MRI);
2705  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2706  *MRI);
2707  if (!SrcRC || !DstRC)
2708  return false;
2709  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2710  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2711  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2712  return false;
2713 
2714  MachineBasicBlock *BB = MI.getParent();
2715  const DebugLoc &DL = MI.getDebugLoc();
2716  const bool Is64 = DstTy.getSizeInBits() == 64;
2717 
2718  unsigned SubReg;
2719  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2720  DstTy.getSizeInBits() / 8);
2721 
2722  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2723  if (DstTy.getSizeInBits() != 32 && !Is64)
2724  return false;
2725 
2726  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2727  .addReg(IdxReg);
2728 
2729  unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2730  BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2731  .addReg(SrcReg, 0, SubReg)
2732  .addReg(SrcReg, RegState::Implicit);
2733  MI.eraseFromParent();
2734  return true;
2735  }
2736 
2737  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2738  return false;
2739 
2740  if (!STI.useVGPRIndexMode()) {
2741  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2742  .addReg(IdxReg);
2743  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2744  .addReg(SrcReg, 0, SubReg)
2745  .addReg(SrcReg, RegState::Implicit);
2746  MI.eraseFromParent();
2747  return true;
2748  }
2749 
2750  const MCInstrDesc &GPRIDXDesc =
2751  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2752  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2753  .addReg(SrcReg)
2754  .addReg(IdxReg)
2755  .addImm(SubReg);
2756 
2757  MI.eraseFromParent();
2758  return true;
2759 }
2760 
2761 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2762 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2763  MachineInstr &MI) const {
2764  Register DstReg = MI.getOperand(0).getReg();
2765  Register VecReg = MI.getOperand(1).getReg();
2766  Register ValReg = MI.getOperand(2).getReg();
2767  Register IdxReg = MI.getOperand(3).getReg();
2768 
2769  LLT VecTy = MRI->getType(DstReg);
2770  LLT ValTy = MRI->getType(ValReg);
2771  unsigned VecSize = VecTy.getSizeInBits();
2772  unsigned ValSize = ValTy.getSizeInBits();
2773 
2774  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2775  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2776  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2777 
2778  assert(VecTy.getElementType() == ValTy);
2779 
2780  // The index must be scalar. If it wasn't, RegBankSelect should have moved
2781  // this into a waterfall loop.
2782  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2783  return false;
2784 
2785  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2786  *MRI);
2787  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2788  *MRI);
2789 
2790  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2791  !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2792  !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2793  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2794  return false;
2795 
2796  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2797  return false;
2798 
2799  unsigned SubReg;
2800  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2801  ValSize / 8);
2802 
2803  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2804  STI.useVGPRIndexMode();
2805 
2806  MachineBasicBlock *BB = MI.getParent();
2807  const DebugLoc &DL = MI.getDebugLoc();
2808 
2809  if (!IndexMode) {
2810  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2811  .addReg(IdxReg);
2812 
2813  const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2814  VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2815  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2816  .addReg(VecReg)
2817  .addReg(ValReg)
2818  .addImm(SubReg);
2819  MI.eraseFromParent();
2820  return true;
2821  }
2822 
2823  const MCInstrDesc &GPRIDXDesc =
2824  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2825  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2826  .addReg(VecReg)
2827  .addReg(ValReg)
2828  .addReg(IdxReg)
2829  .addImm(SubReg);
2830 
2831  MI.eraseFromParent();
2832  return true;
2833 }
2834 
2835 static bool isZeroOrUndef(int X) {
2836  return X == 0 || X == -1;
2837 }
2838 
2839 static bool isOneOrUndef(int X) {
2840  return X == 1 || X == -1;
2841 }
2842 
2843 static bool isZeroOrOneOrUndef(int X) {
2844  return X == 0 || X == 1 || X == -1;
2845 }
2846 
2847 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2848 // 32-bit register.
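// E.g. Mask == {3, 2} reads only Src1 and is normalized to NewMask == {1, 0}.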
2849 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2850  ArrayRef<int> Mask) {
2851  NewMask[0] = Mask[0];
2852  NewMask[1] = Mask[1];
2853  if (Mask[0] < 2 && Mask[1] < 2)
2854  return Src0;
2855 
2856  assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2857  assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2858 
2859  // Shift the mask inputs to be 0/1.
2860  NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2861  NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2862  return Src1;
2863 }
2864 
2865 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2866 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2867  MachineInstr &MI) const {
2868  Register DstReg = MI.getOperand(0).getReg();
2869  Register Src0Reg = MI.getOperand(1).getReg();
2870  Register Src1Reg = MI.getOperand(2).getReg();
2871  ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2872 
2873  const LLT V2S16 = LLT::fixed_vector(2, 16);
2874  if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2875  return false;
2876 
2877  if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2878  return false;
2879 
2880  assert(ShufMask.size() == 2);
2881  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2882 
2883  MachineBasicBlock *MBB = MI.getParent();
2884  const DebugLoc &DL = MI.getDebugLoc();
2885 
2886  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2887  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2888  const TargetRegisterClass &RC = IsVALU ?
2889  AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2890 
2891  // Handle the degenerate case which should have folded out.
2892  if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2893  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2894 
2895  MI.eraseFromParent();
2896  return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2897  }
2898 
2899  // A legal VOP3P mask only reads one of the sources.
2900  int Mask[2];
2901  Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2902 
2903  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2904  !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2905  return false;
2906 
2907  // TODO: This also should have been folded out
2908  if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2909  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2910  .addReg(SrcVec);
2911 
2912  MI.eraseFromParent();
2913  return true;
2914  }
2915 
2916  if (Mask[0] == 1 && Mask[1] == -1) {
2917  if (IsVALU) {
2918  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2919  .addImm(16)
2920  .addReg(SrcVec);
2921  } else {
2922  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2923  .addReg(SrcVec)
2924  .addImm(16);
2925  }
2926  } else if (Mask[0] == -1 && Mask[1] == 0) {
2927  if (IsVALU) {
2928  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2929  .addImm(16)
2930  .addReg(SrcVec);
2931  } else {
2932  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2933  .addReg(SrcVec)
2934  .addImm(16);
2935  }
2936  } else if (Mask[0] == 0 && Mask[1] == 0) {
2937  if (IsVALU) {
2938  // Write low half of the register into the high half.
2939  MachineInstr *MovSDWA =
2940  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2941  .addImm(0) // $src0_modifiers
2942  .addReg(SrcVec) // $src0
2943  .addImm(0) // $clamp
2944  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2945  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2946  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2947  .addReg(SrcVec, RegState::Implicit);
2948  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2949  } else {
2950  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2951  .addReg(SrcVec)
2952  .addReg(SrcVec);
2953  }
2954  } else if (Mask[0] == 1 && Mask[1] == 1) {
2955  if (IsVALU) {
2956  // Write high half of the register into the low half.
2957  MachineInstr *MovSDWA =
2958  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2959  .addImm(0) // $src0_modifiers
2960  .addReg(SrcVec) // $src0
2961  .addImm(0) // $clamp
2962  .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2963  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2964  .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2965  .addReg(SrcVec, RegState::Implicit);
2966  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2967  } else {
2968  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2969  .addReg(SrcVec)
2970  .addReg(SrcVec);
2971  }
2972  } else if (Mask[0] == 1 && Mask[1] == 0) {
2973  if (IsVALU) {
2974  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2975  .addReg(SrcVec)
2976  .addReg(SrcVec)
2977  .addImm(16);
2978  } else {
2979  Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2980  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2981  .addReg(SrcVec)
2982  .addImm(16);
2983  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2984  .addReg(TmpReg)
2985  .addReg(SrcVec);
2986  }
2987  } else
2988  llvm_unreachable("all shuffle masks should be handled");
2989 
2990  MI.eraseFromParent();
2991  return true;
2992 }
2993 
2994 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2995  MachineInstr &MI) const {
2996  if (STI.hasGFX90AInsts())
2997  return selectImpl(MI, *CoverageInfo);
2998 
2999  MachineBasicBlock *MBB = MI.getParent();
3000  const DebugLoc &DL = MI.getDebugLoc();
3001 
3002  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3003  Function &F = MBB->getParent()->getFunction();
3004  DiagnosticInfoUnsupported
3005  NoFpRet(F, "return versions of fp atomics not supported",
3006  MI.getDebugLoc(), DS_Error);
3007  F.getContext().diagnose(NoFpRet);
3008  return false;
3009  }
3010 
3011  // FIXME: This is only needed because tablegen requires the number of dst
3012  // operands in the match and replace patterns to be the same. Otherwise the
3013  // patterns could be exported from the SDag path.
3014  MachineOperand &VDataIn = MI.getOperand(1);
3015  MachineOperand &VIndex = MI.getOperand(3);
3016  MachineOperand &VOffset = MI.getOperand(4);
3017  MachineOperand &SOffset = MI.getOperand(5);
3018  int16_t Offset = MI.getOperand(6).getImm();
3019 
3020  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
3021  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
3022 
3023  unsigned Opcode;
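  // E.g. a struct buffer atomic with both a vindex and a voffset selects the
  // BOTHEN form, which packs the two VGPRs into a 64-bit pair below.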
3024  if (HasVOffset) {
3025  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
3026  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
3027  } else {
3028  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
3029  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
3030  }
3031 
3032  if (MRI->getType(VDataIn.getReg()).isVector()) {
3033  switch (Opcode) {
3034  case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
3035  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
3036  break;
3037  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
3038  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
3039  break;
3040  case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
3041  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
3042  break;
3043  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
3044  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
3045  break;
3046  }
3047  }
3048 
3049  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
3050  I.add(VDataIn);
3051 
3052  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
3053  Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
3054  Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3055  BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3056  .addReg(VIndex.getReg())
3057  .addImm(AMDGPU::sub0)
3058  .addReg(VOffset.getReg())
3059  .addImm(AMDGPU::sub1);
3060 
3061  I.addReg(IdxReg);
3062  } else if (HasVIndex) {
3063  I.add(VIndex);
3064  } else if (HasVOffset) {
3065  I.add(VOffset);
3066  }
3067 
3068  I.add(MI.getOperand(2)); // rsrc
3069  I.add(SOffset);
3070  I.addImm(Offset);
3071  I.addImm(MI.getOperand(7).getImm()); // cpol
3072  I.cloneMemRefs(MI);
3073 
3074  MI.eraseFromParent();
3075 
3076  return true;
3077 }
3078 
3079 bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
3080  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
3081 
3082  if (STI.hasGFX90AInsts()) {
3083  // gfx90a adds return versions of the global atomic fadd instructions so no
3084  // special handling is required.
3085  return selectImpl(MI, *CoverageInfo);
3086  }
3087 
3088  MachineBasicBlock *MBB = MI.getParent();
3089  const DebugLoc &DL = MI.getDebugLoc();
3090 
3091  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3092  Function &F = MBB->getParent()->getFunction();
3093  DiagnosticInfoUnsupported
3094  NoFpRet(F, "return versions of fp atomics not supported",
3095  MI.getDebugLoc(), DS_Error);
3096  F.getContext().diagnose(NoFpRet);
3097  return false;
3098  }
3099 
3100  // FIXME: This is only needed because tablegen requires the number of dst
3101  // operands in the match and replace patterns to be the same. Otherwise the
3102  // patterns could be exported from the SDag path.
3103  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
3104 
3105  Register Data = DataOp.getReg();
3106  const unsigned Opc = MRI->getType(Data).isVector() ?
3107  AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3108  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3109  .addReg(Addr.first)
3110  .addReg(Data)
3111  .addImm(Addr.second)
3112  .addImm(0) // cpol
3113  .cloneMemRefs(MI);
3114 
3115  MI.eraseFromParent();
3116  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3117 }
3118 
3119 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3120  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3121  MI.RemoveOperand(1);
3122  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3123  return true;
3124 }
3125 
3126 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3127  if (I.isPHI())
3128  return selectPHI(I);
3129 
3130  if (!I.isPreISelOpcode()) {
3131  if (I.isCopy())
3132  return selectCOPY(I);
3133  return true;
3134  }
3135 
3136  switch (I.getOpcode()) {
3137  case TargetOpcode::G_AND:
3138  case TargetOpcode::G_OR:
3139  case TargetOpcode::G_XOR:
3140  if (selectImpl(I, *CoverageInfo))
3141  return true;
3142  return selectG_AND_OR_XOR(I);
3143  case TargetOpcode::G_ADD:
3144  case TargetOpcode::G_SUB:
3145  if (selectImpl(I, *CoverageInfo))
3146  return true;
3147  return selectG_ADD_SUB(I);
3148  case TargetOpcode::G_UADDO:
3149  case TargetOpcode::G_USUBO:
3150  case TargetOpcode::G_UADDE:
3151  case TargetOpcode::G_USUBE:
3152  return selectG_UADDO_USUBO_UADDE_USUBE(I);
3153  case TargetOpcode::G_INTTOPTR:
3154  case TargetOpcode::G_BITCAST:
3155  case TargetOpcode::G_PTRTOINT:
3156  return selectCOPY(I);
3157  case TargetOpcode::G_CONSTANT:
3158  case TargetOpcode::G_FCONSTANT:
3159  return selectG_CONSTANT(I);
3160  case TargetOpcode::G_FNEG:
3161  if (selectImpl(I, *CoverageInfo))
3162  return true;
3163  return selectG_FNEG(I);
3164  case TargetOpcode::G_FABS:
3165  if (selectImpl(I, *CoverageInfo))
3166  return true;
3167  return selectG_FABS(I);
3168  case TargetOpcode::G_EXTRACT:
3169  return selectG_EXTRACT(I);
3170  case TargetOpcode::G_MERGE_VALUES:
3171  case TargetOpcode::G_BUILD_VECTOR:
3172  case TargetOpcode::G_CONCAT_VECTORS:
3173  return selectG_MERGE_VALUES(I);
3174  case TargetOpcode::G_UNMERGE_VALUES:
3175  return selectG_UNMERGE_VALUES(I);
3176  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3177  return selectG_BUILD_VECTOR_TRUNC(I);
3178  case TargetOpcode::G_PTR_ADD:
3179  return selectG_PTR_ADD(I);
3180  case TargetOpcode::G_IMPLICIT_DEF:
3181  return selectG_IMPLICIT_DEF(I);
3182  case TargetOpcode::G_FREEZE:
3183  return selectCOPY(I);
3184  case TargetOpcode::G_INSERT:
3185  return selectG_INSERT(I);
3186  case TargetOpcode::G_INTRINSIC:
3187  return selectG_INTRINSIC(I);
3188  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3189  return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3190  case TargetOpcode::G_ICMP:
3191  if (selectG_ICMP(I))
3192  return true;
3193  return selectImpl(I, *CoverageInfo);
3194  case TargetOpcode::G_LOAD:
3195  case TargetOpcode::G_STORE:
3196  case TargetOpcode::G_ATOMIC_CMPXCHG:
3197  case TargetOpcode::G_ATOMICRMW_XCHG:
3198  case TargetOpcode::G_ATOMICRMW_ADD:
3199  case TargetOpcode::G_ATOMICRMW_SUB:
3200  case TargetOpcode::G_ATOMICRMW_AND:
3201  case TargetOpcode::G_ATOMICRMW_OR:
3202  case TargetOpcode::G_ATOMICRMW_XOR:
3203  case TargetOpcode::G_ATOMICRMW_MIN:
3204  case TargetOpcode::G_ATOMICRMW_MAX:
3205  case TargetOpcode::G_ATOMICRMW_UMIN:
3206  case TargetOpcode::G_ATOMICRMW_UMAX:
3207  case TargetOpcode::G_ATOMICRMW_FADD:
3208  case AMDGPU::G_AMDGPU_ATOMIC_INC:
3209  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3210  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3211  case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3212  return selectG_LOAD_STORE_ATOMICRMW(I);
3213  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3214  return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3215  case TargetOpcode::G_SELECT:
3216  return selectG_SELECT(I);
3217  case TargetOpcode::G_TRUNC:
3218  return selectG_TRUNC(I);
3219  case TargetOpcode::G_SEXT:
3220  case TargetOpcode::G_ZEXT:
3221  case TargetOpcode::G_ANYEXT:
3222  case TargetOpcode::G_SEXT_INREG:
3223  if (selectImpl(I, *CoverageInfo))
3224  return true;
3225  return selectG_SZA_EXT(I);
3226  case TargetOpcode::G_BRCOND:
3227  return selectG_BRCOND(I);
3228  case TargetOpcode::G_GLOBAL_VALUE:
3229  return selectG_GLOBAL_VALUE(I);
3230  case TargetOpcode::G_PTRMASK:
3231  return selectG_PTRMASK(I);
3232  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3233  return selectG_EXTRACT_VECTOR_ELT(I);
3234  case TargetOpcode::G_INSERT_VECTOR_ELT:
3235  return selectG_INSERT_VECTOR_ELT(I);
3236  case TargetOpcode::G_SHUFFLE_VECTOR:
3237  return selectG_SHUFFLE_VECTOR(I);
3238  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3239  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3240     const AMDGPU::ImageDimIntrinsicInfo *Intr
3241       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3242  assert(Intr && "not an image intrinsic with image pseudo");
3243  return selectImageIntrinsic(I, Intr);
3244  }
3245  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3246  return selectBVHIntrinsic(I);
3247  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3248  return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3249  case AMDGPU::G_SBFX:
3250  case AMDGPU::G_UBFX:
3251  return selectG_SBFX_UBFX(I);
3252  default:
3253  return selectImpl(I, *CoverageInfo);
3254  }
3255  return false;
3256 }
3257 
3258 InstructionSelector::ComplexRendererFns
3259 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3260  return {{
3261  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3262  }};
3263 
3264 }
3265 
3266 std::pair<Register, unsigned>
3267 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3268  bool AllowAbs) const {
3269  Register Src = Root.getReg();
3270  Register OrigSrc = Src;
3271  unsigned Mods = 0;
3272  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3273 
3274  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3275  Src = MI->getOperand(1).getReg();
3276  Mods |= SISrcMods::NEG;
3277  MI = getDefIgnoringCopies(Src, *MRI);
3278  }
3279 
3280  if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3281  Src = MI->getOperand(1).getReg();
3282  Mods |= SISrcMods::ABS;
3283  }
3284 
3285  if (Mods != 0 &&
3286  RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3287  MachineInstr *UseMI = Root.getParent();
3288 
3289  // If we looked through copies to find source modifiers on an SGPR operand,
3290  // we now have an SGPR register source. To avoid potentially violating the
3291  // constant bus restriction, we need to insert a copy to a VGPR.
3292  Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3293     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3294             TII.get(AMDGPU::COPY), VGPRSrc)
3295  .addReg(Src);
3296  Src = VGPRSrc;
3297  }
3298 
3299  return std::make_pair(Src, Mods);
3300 }
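// Worked example (hypothetical MIR, for illustration only): for a source that
// was built as fneg(fabs(%x)), i.e.
//   %a:vgpr(s32) = G_FABS %x
//   %b:vgpr(s32) = G_FNEG %a
// selectVOP3ModsImpl(%b) peels both nodes and returns
// {%x, SISrcMods::NEG | SISrcMods::ABS}; the callers below then render that
// pair as a source register plus a src_modifiers immediate.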
3301 
3302 ///
3303 /// This will select either an SGPR or VGPR operand and will save us from
3304 /// having to write an extra tablegen pattern.
3305 InstructionSelector::ComplexRendererFns
3306 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3307  return {{
3308  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3309  }};
3310 }
3311 
3312 InstructionSelector::ComplexRendererFns
3313 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3314  Register Src;
3315  unsigned Mods;
3316  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3317 
3318  return {{
3319  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3320  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3321  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3322  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3323  }};
3324 }
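// Minimal sketch of how a generated selector consumes one of these complex
// renderer results (the names RootOperand and MIB are illustrative, not from
// this file):
//
//   if (auto Renderers = selectVOP3Mods0(RootOperand)) {
//     for (auto &RenderFn : *Renderers)
//       RenderFn(MIB); // each callback appends one operand: the source
//                      // register, the src0_modifiers immediate, clamp, omod
//   }
//
// i.e. ComplexRendererFns is an Optional list of callbacks, each adding one
// operand to the instruction being built.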
3325 
3326 InstructionSelector::ComplexRendererFns
3327 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3328  Register Src;
3329  unsigned Mods;
3330  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3331 
3332  return {{
3333  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3334  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3335  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3336  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3337  }};
3338 }
3339 
3340 InstructionSelector::ComplexRendererFns
3341 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3342  return {{
3343  [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3344  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3345  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3346  }};
3347 }
3348 
3349 InstructionSelector::ComplexRendererFns
3350 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3351  Register Src;
3352  unsigned Mods;
3353  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3354 
3355  return {{
3356  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3357  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3358  }};
3359 }
3360 
3361 InstructionSelector::ComplexRendererFns
3362 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3363  Register Src;
3364  unsigned Mods;
3365  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3366 
3367  return {{
3368  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3369  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3370  }};
3371 }
3372 
3373 InstructionSelector::ComplexRendererFns
3374 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3375  Register Reg = Root.getReg();
3376  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3377  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3378  Def->getOpcode() == AMDGPU::G_FABS))
3379  return {};
3380  return {{
3381  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3382  }};
3383 }
3384 
3385 std::pair<Register, unsigned>
3386 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3387  Register Src, const MachineRegisterInfo &MRI) const {
3388  unsigned Mods = 0;
3389  MachineInstr *MI = MRI.getVRegDef(Src);
3390 
3391  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3392  // It's possible to see an f32 fneg here, but unlikely.
3393  // TODO: Treat f32 fneg as only high bit.
3394  MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3395  Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3396  Src = MI->getOperand(1).getReg();
3397  MI = MRI.getVRegDef(Src);
3398  }
3399 
3400  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3401 
3402  // Packed instructions do not have abs modifiers.
3403  Mods |= SISrcMods::OP_SEL_1;
3404 
3405  return std::make_pair(Src, Mods);
3406 }
3407 
3408 InstructionSelector::ComplexRendererFns
3409 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3410   MachineRegisterInfo &MRI
3411     = Root.getParent()->getParent()->getParent()->getRegInfo();
3412 
3413  Register Src;
3414  unsigned Mods;
3415  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3416 
3417  return {{
3418  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3419  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3420  }};
3421 }
3422 
3423 InstructionSelector::ComplexRendererFns
3424 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3425  Register Src;
3426  unsigned Mods;
3427  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3428  if (!isKnownNeverNaN(Src, *MRI))
3429  return None;
3430 
3431  return {{
3432  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3433  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3434  }};
3435 }
3436 
3437 InstructionSelector::ComplexRendererFns
3438 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3439  // FIXME: Handle op_sel
3440  return {{
3441  [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3442  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3443  }};
3444 }
3445 
3446 InstructionSelector::ComplexRendererFns
3447 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3448  SmallVector<GEPInfo, 4> AddrInfo;
3449  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3450 
3451  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3452  return None;
3453 
3454  const GEPInfo &GEPInfo = AddrInfo[0];
3455  Optional<int64_t> EncodedImm =
3456  AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3457  if (!EncodedImm)
3458  return None;
3459 
3460  unsigned PtrReg = GEPInfo.SgprParts[0];
3461  return {{
3462  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3463  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3464  }};
3465 }
3466 
3467 InstructionSelector::ComplexRendererFns
3468 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3469  SmallVector<GEPInfo, 4> AddrInfo;
3470  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3471 
3472  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3473  return None;
3474 
3475  const GEPInfo &GEPInfo = AddrInfo[0];
3476  Register PtrReg = GEPInfo.SgprParts[0];
3477  Optional<int64_t> EncodedImm =
3478  AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3479  if (!EncodedImm)
3480  return None;
3481 
3482  return {{
3483  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3484  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3485  }};
3486 }
3487 
3488 InstructionSelector::ComplexRendererFns
3489 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3490  MachineInstr *MI = Root.getParent();
3491  MachineBasicBlock *MBB = MI->getParent();
3492 
3493  SmallVector<GEPInfo, 4> AddrInfo;
3494  getAddrModeInfo(*MI, *MRI, AddrInfo);
3495 
3496  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3497  // then we can select all ptr + 32-bit offsets not just immediate offsets.
3498  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3499  return None;
3500 
3501  const GEPInfo &GEPInfo = AddrInfo[0];
3502  // SGPR offset is unsigned.
3503  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3504  return None;
3505 
3506   // If we make it this far we have a load with a 32-bit immediate offset.
3507   // It is OK to select this using an SGPR offset, because we have already
3508   // failed trying to select this load into one of the _IMM variants since
3509   // the _IMM patterns are considered before the _SGPR patterns.
3510  Register PtrReg = GEPInfo.SgprParts[0];
3511  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3512  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3513  .addImm(GEPInfo.Imm);
3514  return {{
3515  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3516  [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3517  }};
3518 }
3519 
3520 std::pair<Register, int>
3521 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3522  uint64_t FlatVariant) const {
3523  MachineInstr *MI = Root.getParent();
3524 
3525  auto Default = std::make_pair(Root.getReg(), 0);
3526 
3527  if (!STI.hasFlatInstOffsets())
3528  return Default;
3529 
3530  Register PtrBase;
3531  int64_t ConstOffset;
3532  std::tie(PtrBase, ConstOffset) =
3533  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3534  if (ConstOffset == 0)
3535  return Default;
3536 
3537  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3538  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3539  return Default;
3540 
3541  return std::make_pair(PtrBase, ConstOffset);
3542 }
3543 
3544 InstructionSelector::ComplexRendererFns
3545 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3546  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3547 
3548  return {{
3549  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3550  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3551  }};
3552 }
3553 
3554 InstructionSelector::ComplexRendererFns
3555 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3556  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3557 
3558  return {{
3559  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3560  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3561  }};
3562 }
3563 
3564 InstructionSelector::ComplexRendererFns
3565 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3566  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3567 
3568  return {{
3569  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3570  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3571  }};
3572 }
3573 
3574 /// Match a zero extend from a 32-bit value to 64-bits.
3575 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3576   Register ZExtSrc;
3577  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3578  return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3579 
3580  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3581   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3582   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3583  return false;
3584 
3585  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3586  return Def->getOperand(1).getReg();
3587  }
3588 
3589  return Register();
3590 }
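// Two shapes this helper accepts (hypothetical MIR, for illustration):
//   %off64:_(s64) = G_ZEXT %off32:_(s32)           ; returns %off32
//   %off64:_(s64) = G_MERGE_VALUES %off32, %zero   ; %zero = G_CONSTANT i32 0
// Anything else yields an invalid Register, and the callers below fall back
// to other addressing forms.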
3591 
3592 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3593 InstructionSelector::ComplexRendererFns
3594 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3595  Register Addr = Root.getReg();
3596  Register PtrBase;
3597  int64_t ConstOffset;
3598  int64_t ImmOffset = 0;
3599 
3600  // Match the immediate offset first, which canonically is moved as low as
3601  // possible.
3602  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3603 
3604  if (ConstOffset != 0) {
3605  if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3606                             SIInstrFlags::FlatGlobal)) {
3607       Addr = PtrBase;
3608  ImmOffset = ConstOffset;
3609  } else {
3610  auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3611  if (!PtrBaseDef)
3612  return None;
3613 
3614  if (isSGPR(PtrBaseDef->Reg)) {
3615  if (ConstOffset > 0) {
3616  // Offset is too large.
3617  //
3618  // saddr + large_offset -> saddr +
3619  // (voffset = large_offset & ~MaxOffset) +
3620  // (large_offset & MaxOffset);
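// Worked example (assuming, for illustration, a split at 4095, i.e.
// MaxOffset == 0xfff; the real bound comes from splitFlatOffset and depends
// on the subtarget):
//   large_offset = 0x12345
//   voffset      = 0x12345 & ~0xfff = 0x12000   (materialized in a VGPR)
//   imm          = 0x12345 &  0xfff = 0x345     (folded into the instruction)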
3621  int64_t SplitImmOffset, RemainderOffset;
3622           std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3623               ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3624 
3625  if (isUInt<32>(RemainderOffset)) {
3626  MachineInstr *MI = Root.getParent();
3627  MachineBasicBlock *MBB = MI->getParent();
3628  Register HighBits =
3629  MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3630 
3631  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3632  HighBits)
3633  .addImm(RemainderOffset);
3634 
3635  return {{
3636  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3637  [=](MachineInstrBuilder &MIB) {
3638  MIB.addReg(HighBits);
3639  }, // voffset
3640  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3641  }};
3642  }
3643  }
3644 
3645       // We are adding a 64-bit SGPR and a constant. If the constant bus limit
3646       // is 1 we would need to perform 1 or 2 extra moves for each half of
3647       // the constant, so it is better to do a scalar add and then issue a
3648       // single VALU instruction to materialize zero. Otherwise it takes fewer
3649       // instructions to perform VALU adds with immediates or inline literals.
3650  unsigned NumLiterals =
3651  !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
3652  !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
3653  if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
3654  return None;
3655  }
3656  }
3657  }
3658 
3659  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3660  if (!AddrDef)
3661  return None;
3662 
3663  // Match the variable offset.
3664  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3665  // Look through the SGPR->VGPR copy.
3666  Register SAddr =
3667  getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3668 
3669  if (SAddr && isSGPR(SAddr)) {
3670  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3671 
3672  // It's possible voffset is an SGPR here, but the copy to VGPR will be
3673  // inserted later.
3674  if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3675  return {{[=](MachineInstrBuilder &MIB) { // saddr
3676  MIB.addReg(SAddr);
3677  },
3678  [=](MachineInstrBuilder &MIB) { // voffset
3679  MIB.addReg(VOffset);
3680  },
3681  [=](MachineInstrBuilder &MIB) { // offset
3682  MIB.addImm(ImmOffset);
3683  }}};
3684  }
3685  }
3686  }
3687 
3688  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3689  // drop this.
3690  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3691  AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
3692  return None;
3693 
3694  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3695  // moves required to copy a 64-bit SGPR to VGPR.
3696  MachineInstr *MI = Root.getParent();
3697  MachineBasicBlock *MBB = MI->getParent();
3698  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3699 
3700  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3701  .addImm(0);
3702 
3703  return {{
3704  [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
3705  [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3706  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3707  }};
3708 }
3709 
3710 InstructionSelector::ComplexRendererFns
3711 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3712  Register Addr = Root.getReg();
3713  Register PtrBase;
3714  int64_t ConstOffset;
3715  int64_t ImmOffset = 0;
3716 
3717  // Match the immediate offset first, which canonically is moved as low as
3718  // possible.
3719  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3720 
3721   if (ConstOffset != 0 &&
3722       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
3723                             SIInstrFlags::FlatScratch)) {
3724     Addr = PtrBase;
3725  ImmOffset = ConstOffset;
3726  }
3727 
3728  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3729  if (!AddrDef)
3730  return None;
3731 
3732  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3733  int FI = AddrDef->MI->getOperand(1).getIndex();
3734  return {{
3735  [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3736  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3737  }};
3738  }
3739 
3740  Register SAddr = AddrDef->Reg;
3741 
3742  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3743  Register LHS = AddrDef->MI->getOperand(1).getReg();
3744  Register RHS = AddrDef->MI->getOperand(2).getReg();
3745  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3746  auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3747 
3748  if (LHSDef && RHSDef &&
3749  LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3750  isSGPR(RHSDef->Reg)) {
3751  int FI = LHSDef->MI->getOperand(1).getIndex();
3752  MachineInstr &I = *Root.getParent();
3753  MachineBasicBlock *BB = I.getParent();
3754  const DebugLoc &DL = I.getDebugLoc();
3755  SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3756 
3757  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
3758  .addFrameIndex(FI)
3759  .addReg(RHSDef->Reg);
3760  }
3761  }
3762 
3763  if (!isSGPR(SAddr))
3764  return None;
3765 
3766  return {{
3767  [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3768  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3769  }};
3770 }
3771 
3772 InstructionSelector::ComplexRendererFns
3773 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3774  MachineInstr *MI = Root.getParent();
3775   MachineBasicBlock *MBB = MI->getParent();
3776   MachineFunction *MF = MBB->getParent();
3777   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3778 
3779  int64_t Offset = 0;
3780   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3781       !SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
3782  Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3783 
3784  // TODO: Should this be inside the render function? The iterator seems to
3785  // move.
3786  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3787  HighBits)
3788  .addImm(Offset & ~4095);
3789 
3790  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3791  MIB.addReg(Info->getScratchRSrcReg());
3792  },
3793  [=](MachineInstrBuilder &MIB) { // vaddr
3794  MIB.addReg(HighBits);
3795  },
3796  [=](MachineInstrBuilder &MIB) { // soffset
3797  // Use constant zero for soffset and rely on eliminateFrameIndex
3798  // to choose the appropriate frame register if need be.
3799  MIB.addImm(0);
3800  },
3801  [=](MachineInstrBuilder &MIB) { // offset
3802  MIB.addImm(Offset & 4095);
3803  }}};
3804  }
3805 
3806  assert(Offset == 0 || Offset == -1);
3807 
3808  // Try to fold a frame index directly into the MUBUF vaddr field, and any
3809  // offsets.
3810  Optional<int> FI;
3811  Register VAddr = Root.getReg();
3812  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3813  Register PtrBase;
3814  int64_t ConstOffset;
3815  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
3816  if (ConstOffset != 0) {
3817  if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
3818           (!STI.privateMemoryResourceIsRangeChecked() ||
3819            KnownBits->signBitIsZero(PtrBase))) {
3820  const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
3821  if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3822  FI = PtrBaseDef->getOperand(1).getIndex();
3823  else
3824  VAddr = PtrBase;
3825  Offset = ConstOffset;
3826  }
3827  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3828  FI = RootDef->getOperand(1).getIndex();
3829  }
3830  }
3831 
3832  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3833  MIB.addReg(Info->getScratchRSrcReg());
3834  },
3835  [=](MachineInstrBuilder &MIB) { // vaddr
3836  if (FI.hasValue())
3837  MIB.addFrameIndex(FI.getValue());
3838  else
3839  MIB.addReg(VAddr);
3840  },
3841  [=](MachineInstrBuilder &MIB) { // soffset
3842  // Use constant zero for soffset and rely on eliminateFrameIndex
3843  // to choose the appropriate frame register if need be.
3844  MIB.addImm(0);
3845  },
3846  [=](MachineInstrBuilder &MIB) { // offset
3847  MIB.addImm(Offset);
3848  }}};
3849 }
3850 
3851 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3852  int64_t Offset) const {
3853  if (!isUInt<16>(Offset))
3854  return false;
3855 
3856   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3857     return true;
3858 
3859   // On Southern Islands, instructions with a negative base value and an
3860   // offset don't seem to work.
3861  return KnownBits->signBitIsZero(Base);
3862 }
3863 
3864 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3865  int64_t Offset1,
3866  unsigned Size) const {
3867  if (Offset0 % Size != 0 || Offset1 % Size != 0)
3868  return false;
3869  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3870  return false;
3871 
3872   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3873     return true;
3874 
3875   // On Southern Islands, instructions with a negative base value and an
3876   // offset don't seem to work.
3877  return KnownBits->signBitIsZero(Base);
3878 }
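// Worked example (for illustration): a pair of 4-byte elements at byte
// offsets 40 and 44 from Base encodes as offset0 = 40/4 = 10 and
// offset1 = 44/4 = 11, both of which fit the 8-bit, element-size-scaled DS
// offset fields checked above.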
3879 
3880 InstructionSelector::ComplexRendererFns
3881 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3882  MachineOperand &Root) const {
3883  MachineInstr *MI = Root.getParent();
3884  MachineBasicBlock *MBB = MI->getParent();
3885 
3886  int64_t Offset = 0;
3887   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3888       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3889     return {};
3890 
3891   const MachineFunction *MF = MBB->getParent();
3892   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3893 
3894  return {{
3895  [=](MachineInstrBuilder &MIB) { // rsrc
3896  MIB.addReg(Info->getScratchRSrcReg());
3897  },
3898  [=](MachineInstrBuilder &MIB) { // soffset
3899  MIB.addImm(0);
3900  },
3901  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3902  }};
3903 }
3904 
3905 std::pair<Register, unsigned>
3906 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3907  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3908  if (!RootDef)
3909  return std::make_pair(Root.getReg(), 0);
3910 
3911  int64_t ConstAddr = 0;
3912 
3913  Register PtrBase;
3914  int64_t Offset;
3915  std::tie(PtrBase, Offset) =
3916  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3917 
3918  if (Offset) {
3919  if (isDSOffsetLegal(PtrBase, Offset)) {
3920  // (add n0, c0)
3921  return std::make_pair(PtrBase, Offset);
3922  }
3923  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3924  // TODO
3925 
3926 
3927  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3928  // TODO
3929 
3930  }
3931 
3932  return std::make_pair(Root.getReg(), 0);
3933 }
3934 
3935 InstructionSelector::ComplexRendererFns
3936 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3937  Register Reg;
3938  unsigned Offset;
3939  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3940  return {{
3941  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3942  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3943  }};
3944 }
3945 
3946 InstructionSelector::ComplexRendererFns
3947 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3948  return selectDSReadWrite2(Root, 4);
3949 }
3950 
3951 InstructionSelector::ComplexRendererFns
3952 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3953  return selectDSReadWrite2(Root, 8);
3954 }
3955 
3956 InstructionSelector::ComplexRendererFns
3957 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3958  unsigned Size) const {
3959  Register Reg;
3960  unsigned Offset;
3961  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3962  return {{
3963  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3964  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3965  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3966  }};
3967 }
3968 
3969 std::pair<Register, unsigned>
3970 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3971  unsigned Size) const {
3972  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3973  if (!RootDef)
3974  return std::make_pair(Root.getReg(), 0);
3975 
3976  int64_t ConstAddr = 0;
3977 
3978  Register PtrBase;
3979  int64_t Offset;
3980  std::tie(PtrBase, Offset) =
3981  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3982 
3983  if (Offset) {
3984  int64_t OffsetValue0 = Offset;
3985  int64_t OffsetValue1 = Offset + Size;
3986  if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3987  // (add n0, c0)
3988  return std::make_pair(PtrBase, OffsetValue0 / Size);
3989  }
3990  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3991  // TODO
3992 
3993  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3994  // TODO
3995 
3996  }
3997 
3998  return std::make_pair(Root.getReg(), 0);
3999 }
4000 
4001 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4002 /// the base value with the constant offset. There may be intervening copies
4003 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
4004 /// not match the pattern.
4005 std::pair<Register, int64_t>
4006 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4007  Register Root, const MachineRegisterInfo &MRI) const {
4008  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4009  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4010  return {Root, 0};
4011 
4012  MachineOperand &RHS = RootI->getOperand(2);
4013   Optional<ValueAndVReg> MaybeOffset =
4014       getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4015  if (!MaybeOffset)
4016  return {Root, 0};
4017  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
4018 }
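// Example of the matched pattern (hypothetical MIR, for illustration):
//   %c:_(s64) = G_CONSTANT i64 16
//   %p:_(p1)  = G_PTR_ADD %base, %c
// getPtrBaseWithConstantOffset(%p) returns {%base, 16}; for any other shape
// it returns {%p, 0}.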
4019 
4020 static void addZeroImm(MachineInstrBuilder &MIB) {
4021  MIB.addImm(0);
4022 }
4023 
4024 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
4025 /// BasePtr is not valid, a null base pointer will be used.
4026 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4027                           uint32_t FormatLo, uint32_t FormatHi,
4028  Register BasePtr) {
4029  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4030  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4031  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4032  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
4033 
4034  B.buildInstr(AMDGPU::S_MOV_B32)
4035  .addDef(RSrc2)
4036  .addImm(FormatLo);
4037  B.buildInstr(AMDGPU::S_MOV_B32)
4038  .addDef(RSrc3)
4039  .addImm(FormatHi);
4040 
4041  // Build the half of the subregister with the constants before building the
4042  // full 128-bit register. If we are building multiple resource descriptors,
4043  // this will allow CSEing of the 2-component register.
4044  B.buildInstr(AMDGPU::REG_SEQUENCE)
4045  .addDef(RSrcHi)
4046  .addReg(RSrc2)
4047  .addImm(AMDGPU::sub0)
4048  .addReg(RSrc3)
4049  .addImm(AMDGPU::sub1);
4050 
4051  Register RSrcLo = BasePtr;
4052  if (!BasePtr) {
4053  RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4054  B.buildInstr(AMDGPU::S_MOV_B64)
4055  .addDef(RSrcLo)
4056  .addImm(0);
4057  }
4058 
4059  B.buildInstr(AMDGPU::REG_SEQUENCE)
4060  .addDef(RSrc)
4061  .addReg(RSrcLo)
4062  .addImm(AMDGPU::sub0_sub1)
4063  .addReg(RSrcHi)
4064  .addImm(AMDGPU::sub2_sub3);
4065 
4066  return RSrc;
4067 }
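// Resulting 128-bit SRD layout (as assembled above): sub0_sub1 holds the
// 64-bit base pointer (or zero when BasePtr is invalid), while sub2 and sub3
// hold FormatLo and FormatHi, the constant descriptor words supplied by the
// callers below.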
4068 
4069 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4070                                 const SIInstrInfo &TII, Register BasePtr) {
4071  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4072 
4073  // FIXME: Why are half the "default" bits ignored based on the addressing
4074  // mode?
4075  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
4076 }
4077 
4078 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4079                                const SIInstrInfo &TII, Register BasePtr) {
4080  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4081 
4082  // FIXME: Why are half the "default" bits ignored based on the addressing
4083  // mode?
4084  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4085 }
4086 
4087 AMDGPUInstructionSelector::MUBUFAddressData
4088 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4089  MUBUFAddressData Data;
4090  Data.N0 = Src;
4091 
4092  Register PtrBase;
4093  int64_t Offset;
4094 
4095  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4096  if (isUInt<32>(Offset)) {
4097  Data.N0 = PtrBase;
4098  Data.Offset = Offset;
4099  }
4100 
4101  if (MachineInstr *InputAdd
4102  = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4103  Data.N2 = InputAdd->getOperand(1).getReg();
4104  Data.N3 = InputAdd->getOperand(2).getReg();
4105 
4106   // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
4107   // FIXME: Don't know that this was defined by operand 0.
4108  //
4109  // TODO: Remove this when we have copy folding optimizations after
4110  // RegBankSelect.
4111  Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4112  Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4113  }
4114 
4115  return Data;
4116 }
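// Example decomposition (for illustration): for an address built as
//   %sum:_(p1)  = G_PTR_ADD %vbase, %soffset
//   %addr:_(p1) = G_PTR_ADD %sum, %c           ; %c = G_CONSTANT i64 1024
// this returns N0 = %sum, N2 = %vbase, N3 = %soffset, Offset = 1024, which
// shouldUseAddr64 below then classifies as the addr64 form.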
4117 
4118 /// Return whether the addr64 MUBUF mode should be used for the given address.
4119 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4120  // (ptr_add N2, N3) -> addr64, or
4121  // (ptr_add (ptr_add N2, N3), C1) -> addr64
4122  if (Addr.N2)
4123  return true;
4124 
4125  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4126  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4127 }
4128 
4129 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4130 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4131 /// component.
4132 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4133  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4134  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4135  return;
4136 
4137  // Illegal offset, store it in soffset.
4138  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4139  B.buildInstr(AMDGPU::S_MOV_B32)
4140  .addDef(SOffset)
4141  .addImm(ImmOffset);
4142  ImmOffset = 0;
4143 }
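// Worked example (for illustration): MUBUF's 12-bit immediate field holds
// offsets up to 4095, so an offset of 8192 fails isLegalMUBUFImmOffset; the
// whole value is then moved into an SGPR with S_MOV_B32 and used as soffset,
// and the immediate field is left at 0.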
4144 
4145 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4146  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4147  Register &SOffset, int64_t &Offset) const {
4148  // FIXME: Predicates should stop this from reaching here.
4149  // addr64 bit was removed for volcanic islands.
4150  if (!STI.hasAddr64() || STI.useFlatForGlobal())
4151  return false;
4152 
4153  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4154  if (!shouldUseAddr64(AddrData))
4155  return false;
4156 
4157  Register N0 = AddrData.N0;
4158  Register N2 = AddrData.N2;
4159  Register N3 = AddrData.N3;
4160  Offset = AddrData.Offset;
4161 
4162  // Base pointer for the SRD.
4163  Register SRDPtr;
4164 
4165  if (N2) {
4166  if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4167  assert(N3);
4168  if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4169  // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4170  // addr64, and construct the default resource from a 0 address.
4171  VAddr = N0;
4172  } else {
4173  SRDPtr = N3;
4174  VAddr = N2;
4175  }
4176  } else {
4177  // N2 is not divergent.
4178  SRDPtr = N2;
4179  VAddr = N3;
4180  }
4181  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4182  // Use the default null pointer in the resource
4183  VAddr = N0;
4184  } else {
4185  // N0 -> offset, or
4186  // (N0 + C1) -> offset
4187  SRDPtr = N0;
4188  }
4189 
4190  MachineIRBuilder B(*Root.getParent());
4191  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4192  splitIllegalMUBUFOffset(B, SOffset, Offset);
4193  return true;
4194 }
4195 
4196 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4197  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4198  int64_t &Offset) const {
4199 
4200  // FIXME: Pattern should not reach here.
4201  if (STI.useFlatForGlobal())
4202  return false;
4203 
4204  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4205  if (shouldUseAddr64(AddrData))
4206  return false;
4207 
4208  // N0 -> offset, or
4209  // (N0 + C1) -> offset
4210  Register SRDPtr = AddrData.N0;
4211  Offset = AddrData.Offset;
4212 
4213  // TODO: Look through extensions for 32-bit soffset.
4214  MachineIRBuilder B(*Root.getParent());
4215 
4216  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4217  splitIllegalMUBUFOffset(B, SOffset, Offset);
4218  return true;
4219 }
4220 
4221 InstructionSelector::ComplexRendererFns
4222 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4223  Register VAddr;
4224  Register RSrcReg;
4225  Register SOffset;
4226  int64_t Offset = 0;
4227 
4228  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4229  return {};
4230 
4231  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4232  // pattern.
4233  return {{
4234  [=](MachineInstrBuilder &MIB) { // rsrc
4235  MIB.addReg(RSrcReg);
4236  },
4237  [=](MachineInstrBuilder &MIB) { // vaddr
4238  MIB.addReg(VAddr);
4239  },
4240  [=](MachineInstrBuilder &MIB) { // soffset
4241  if (SOffset)
4242  MIB.addReg(SOffset);
4243  else
4244  MIB.addImm(0);
4245  },
4246  [=](MachineInstrBuilder &MIB) { // offset
4247  MIB.addImm(Offset);
4248  },
4249  addZeroImm, // cpol
4250  addZeroImm, // tfe
4251  addZeroImm // swz
4252  }};
4253 }
4254 
4255 InstructionSelector::ComplexRendererFns
4256 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4257  Register RSrcReg;
4258  Register SOffset;
4259  int64_t Offset = 0;
4260 
4261  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4262  return {};
4263 
4264  return {{
4265  [=](MachineInstrBuilder &MIB) { // rsrc
4266  MIB.addReg(RSrcReg);
4267  },
4268  [=](MachineInstrBuilder &MIB) { // soffset
4269  if (SOffset)
4270  MIB.addReg(SOffset);
4271  else
4272  MIB.addImm(0);
4273  },
4274  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4275  addZeroImm, // cpol
4276  addZeroImm, // tfe
4277  addZeroImm, // swz
4278  }};
4279 }
4280 
4281 InstructionSelector::ComplexRendererFns
4282 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4283  Register VAddr;
4284  Register RSrcReg;
4285  Register SOffset;
4286  int64_t Offset = 0;
4287 
4288  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4289  return {};
4290 
4291  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4292  // pattern.
4293  return {{
4294  [=](MachineInstrBuilder &MIB) { // rsrc
4295  MIB.addReg(RSrcReg);
4296  },
4297  [=](MachineInstrBuilder &MIB) { // vaddr
4298  MIB.addReg(VAddr);
4299  },
4300  [=](MachineInstrBuilder &MIB) { // soffset
4301  if (SOffset)
4302  MIB.addReg(SOffset);
4303  else
4304  MIB.addImm(0);
4305  },
4306  [=](MachineInstrBuilder &MIB) { // offset
4307  MIB.addImm(Offset);
4308  },
4309  [=](MachineInstrBuilder &MIB) {
4310  MIB.addImm(AMDGPU::CPol::GLC); // cpol
4311  }
4312  }};
4313 }
4314 
4315 InstructionSelector::ComplexRendererFns
4316 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4317  Register RSrcReg;
4318  Register SOffset;
4319  int64_t Offset = 0;
4320 
4321  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4322  return {};
4323 
4324  return {{
4325  [=](MachineInstrBuilder &MIB) { // rsrc
4326  MIB.addReg(RSrcReg);
4327  },
4328  [=](MachineInstrBuilder &MIB) { // soffset
4329  if (SOffset)
4330  MIB.addReg(SOffset);
4331  else
4332  MIB.addImm(0);
4333  },
4334  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4335  [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4336  }};
4337 }
4338 
4339 /// Get an immediate that must be 32-bits, and treated as zero extended.
4340 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4341                                                const MachineRegisterInfo &MRI) {
4342  // getIConstantVRegVal sexts any values, so see if that matters.
4343   Optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
4344   if (!OffsetVal || !isInt<32>(*OffsetVal))
4345  return None;
4346  return Lo_32(*OffsetVal);
4347 }
4348 
4349 InstructionSelector::ComplexRendererFns
4350 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4351  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4352  if (!OffsetVal)
4353  return {};
4354 
4355  Optional<int64_t> EncodedImm =
4356  AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4357  if (!EncodedImm)
4358  return {};
4359 
4360  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4361 }
4362 
4363 InstructionSelector::ComplexRendererFns
4364 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4365   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4366 
4367  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4368  if (!OffsetVal)
4369  return {};
4370 
4371  Optional<int64_t> EncodedImm
4372  = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4373  if (!EncodedImm)
4374  return {};
4375 
4376  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4377 }
4378 
4379 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4380  const MachineInstr &MI,
4381  int OpIdx) const {
4382  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4383  "Expected G_CONSTANT");
4384  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4385 }
4386 
4387 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4388  const MachineInstr &MI,
4389  int OpIdx) const {
4390  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4391  "Expected G_CONSTANT");
4392  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4393 }
4394 
4395 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4396  const MachineInstr &MI,
4397  int OpIdx) const {
4398  assert(OpIdx == -1);
4399 
4400  const MachineOperand &Op = MI.getOperand(1);
4401  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4402  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4403  else {
4404  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4405  MIB.addImm(Op.getCImm()->getSExtValue());
4406  }
4407 }
4408 
4409 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4410  const MachineInstr &MI,
4411  int OpIdx) const {
4412  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4413  "Expected G_CONSTANT");
4414  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4415 }
4416 
4417 /// This only really exists to satisfy DAG type checking machinery, so is a
4418 /// no-op here.
4419 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4420  const MachineInstr &MI,
4421  int OpIdx) const {
4422  MIB.addImm(MI.getOperand(OpIdx).getImm());
4423 }
4424 
4425 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
4426  const MachineInstr &MI,
4427  int OpIdx) const {
4428  assert(OpIdx >= 0 && "expected to match an immediate operand");
4429  MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
4430 }
4431 
4432 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4433  const MachineInstr &MI,
4434  int OpIdx) const {
4435  assert(OpIdx >= 0 && "expected to match an immediate operand");
4436  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4437 }
4438 
4439 void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
4440  const MachineInstr &MI,
4441  int OpIdx) const {
4442  assert(OpIdx >= 0 && "expected to match an immediate operand");
4443  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
4444 }
4445 
4446 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4447  const MachineInstr &MI,
4448  int OpIdx) const {
4449  MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4450 }
4451 
4452 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4453   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4454 }
4455 
4456 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4457   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4458 }
4459 
4460 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4461   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4462 }
4463 
4464 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4465  return TII.isInlineConstant(Imm);
4466 }
Definition: SIRegisterInfo.cpp:2388
llvm::InstructionSelector::CoverageInfo
CodeGenCoverage * CoverageInfo
Definition: InstructionSelector.h:439
llvm::SISrcMods::NEG_HI
@ NEG_HI
Definition: SIDefines.h:212
llvm::SISrcMods::OP_SEL_1
@ OP_SEL_1
Definition: SIDefines.h:214
gwsIntrinToOpcode
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Definition: AMDGPUInstructionSelector.cpp:1321
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::MachineOperand::ChangeToImmediate
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
Definition: MachineOperand.cpp:156
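A small hedged sketch (helper name invented): an operand known to hold a constant can be rewritten in place to an immediate, provided the caller knows the instruction still encodes legally afterwards.

#include "llvm/CodeGen/MachineOperand.h"

static void foldKnownConstant(llvm::MachineOperand &MO, int64_t Val) {
  // After this call MO.isImm() is true and MO.getImm() == Val.
  if (MO.isReg())
    MO.ChangeToImmediate(Val);
}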
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:359
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:195
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:49
llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
Definition: MIPatternMatch.h:134
llvm::M0
unsigned M0(unsigned Val)
Definition: VE.h:371
llvm::AMDGPU::MIMGLZMappingInfo
Definition: AMDGPUBaseInfo.h:323
llvm::Instruction
Definition: Instruction.h:45
llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1130
llvm::AMDGPUTargetMachine::getNullPointerValue
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Definition: AMDGPUTargetMachine.cpp:750
llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition: TargetRegisterInfo.h:138
llvm::report_fatal_error
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1460
llvm::SISrcMods::NEG
@ NEG
Definition: SIDefines.h:209
llvm::SISrcMods::ABS
@ ABS
Definition: SIDefines.h:210
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:279
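Concretely (illustrative values, wrapper function invented): the top 8 of 32 bits gives the mask 0xFF000000.

#include "llvm/ADT/APInt.h"
#include <cassert>

void highBitsSetExample() {
  llvm::APInt Mask = llvm::APInt::getHighBitsSet(32, 8);
  // getZExtValue() (documented above) reads the value back as a uint64_t.
  assert(Mask.getZExtValue() == 0xFF000000u);
}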
llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:47
llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition: SIRegisterInfo.cpp:427
llvm::AMDGPU::getMIMGLZMappingInfo
const LLVM_READONLY MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
matchZeroExtendFromS32
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64 bits.
Definition: AMDGPUInstructionSelector.cpp:3575
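A hedged sketch of this style of helper (the real function in this file may also handle other legalized forms): mi_match with m_GZExt binds the extended source, and the s32 check rejects extensions from other widths.

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/LowLevelTypeImpl.h"

static llvm::Register matchZExtOfS32Sketch(llvm::MachineRegisterInfo &MRI,
                                           llvm::Register Reg) {
  using namespace llvm::MIPatternMatch;
  llvm::Register Src;
  if (mi_match(Reg, MRI, m_GZExt(m_Reg(Src))) &&
      MRI.getType(Src) == llvm::LLT::scalar(32))
    return Src;
  return llvm::Register(); // no match: return an invalid register
}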
llvm::MIPatternMatch::m_SpecificICst
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
Definition: MIPatternMatch.h:128
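For example (a standalone sketch with an invented helper name), combining this with the m_ZeroInt matcher listed above:

#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// True if Reg is defined by a G_CONSTANT equal to zero or to Requested.
static bool isZeroOrSpecific(llvm::MachineRegisterInfo &MRI, llvm::Register Reg,
                             int64_t Requested) {
  using namespace llvm::MIPatternMatch;
  return mi_match(Reg, MRI, m_ZeroInt()) ||
         mi_match(Reg, MRI, m_SpecificICst(Requested));
}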
llvm::SIRegisterInfo::isSGPRClass
static bool isSGPRClass(const TargetRegisterClass *RC)
Definition: SIRegisterInfo.h:162
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::APFloat::bitcastToAPInt
APInt bitcastToAPInt() const
Definition: APFloat.h:1130
llvm::AMDGPU::getMIMGOpcode
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
Definition: AMDGPUBaseInfo.cpp:138
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
GET_GLOBALISEL_TEMPORARIES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
llvm::MachineRegisterInfo::getVRegDef
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
Definition: MachineRegisterInfo.cpp:400
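A hedged usage sketch (the helper name is invented): virtual registers in SSA form have a single definition, so the result can be inspected directly.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

static bool isDefinedByOpcode(const llvm::MachineRegisterInfo &MRI,
                              llvm::Register Reg, unsigned Opcode) {
  // Null when Reg has no defining instruction (e.g. a physical register).
  const llvm::MachineInstr *Def = MRI.getVRegDef(Reg);
  return Def && Def->getOpcode() == Opcode;
}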
llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition: MachineOperand.h:238
llvm::None
const NoneType None
Definition: None.h:23
llvm::RegisterBankInfo::getSizeInBits
unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition: RegisterBankInfo.cpp:493
llvm::MachineRegisterInfo::use_empty
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
Definition: MachineRegisterInfo.h:506
llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition: RegisterBankInfo.cpp:132
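A minimal hedged sketch (wrapper name invented): pin a generic vreg to a concrete class, treating a null return as a selection failure.

#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

static bool pinToClass(llvm::Register Reg, const llvm::TargetRegisterClass &RC,
                       llvm::MachineRegisterInfo &MRI) {
  // Returns the resulting class on success, null if Reg cannot be placed in RC.
  return llvm::RegisterBankInfo::constrainGenericRegister(Reg, RC, MRI) != nullptr;
}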
llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition: AMDGPUBaseInfo.cpp:1457