1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
24 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 
27 #define DEBUG_TYPE "amdgpu-isel"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 static cl::opt<bool> AllowRiskySelect(
33  "amdgpu-global-isel-risky-select",
34  cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
35  cl::init(false),
36  cl::ReallyHidden);
37 
38 #define GET_GLOBALISEL_IMPL
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenGlobalISel.inc"
41 #undef GET_GLOBALISEL_IMPL
42 #undef AMDGPUSubtarget
43 
44 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
45  const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
46  const AMDGPUTargetMachine &TM)
47  : InstructionSelector(), TII(*STI.getInstrInfo()),
48  TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
49  STI(STI),
50  EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
51 #define GET_GLOBALISEL_PREDICATES_INIT
52 #include "AMDGPUGenGlobalISel.inc"
53 #undef GET_GLOBALISEL_PREDICATES_INIT
54 #define GET_GLOBALISEL_TEMPORARIES_INIT
55 #include "AMDGPUGenGlobalISel.inc"
56 #undef GET_GLOBALISEL_TEMPORARIES_INIT
57 {
58 }
59 
60 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
61 
62 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
63  CodeGenCoverage &CoverageInfo,
64  ProfileSummaryInfo *PSI,
65  BlockFrequencyInfo *BFI) {
66  MRI = &MF.getRegInfo();
67  Subtarget = &MF.getSubtarget<GCNSubtarget>();
68  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
69 }
70 
71 bool AMDGPUInstructionSelector::isVCC(Register Reg,
72  const MachineRegisterInfo &MRI) const {
73  // The verifier is oblivious to s1 being a valid value for wavesize registers.
74  if (Reg.isPhysical())
75  return false;
76 
77  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
78  const TargetRegisterClass *RC =
79  RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
80  if (RC) {
81  const LLT Ty = MRI.getType(Reg);
82  return RC->hasSuperClassEq(TRI.getBoolRC()) &&
83  Ty.isValid() && Ty.getSizeInBits() == 1;
84  }
85 
86  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
87  return RB->getID() == AMDGPU::VCCRegBankID;
88 }
89 
90 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
91  unsigned NewOpc) const {
92  MI.setDesc(TII.get(NewOpc));
93  MI.RemoveOperand(1); // Remove intrinsic ID.
94  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
95 
96  MachineOperand &Dst = MI.getOperand(0);
97  MachineOperand &Src = MI.getOperand(1);
98 
99  // TODO: This should be legalized to s32 if needed
100  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
101  return false;
102 
103  const TargetRegisterClass *DstRC
104  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
105  const TargetRegisterClass *SrcRC
106  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
107  if (!DstRC || DstRC != SrcRC)
108  return false;
109 
110  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
111  RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
112 }
113 
114 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
115  const DebugLoc &DL = I.getDebugLoc();
116  MachineBasicBlock *BB = I.getParent();
117  I.setDesc(TII.get(TargetOpcode::COPY));
118 
119  const MachineOperand &Src = I.getOperand(1);
120  MachineOperand &Dst = I.getOperand(0);
121  Register DstReg = Dst.getReg();
122  Register SrcReg = Src.getReg();
123 
124  if (isVCC(DstReg, *MRI)) {
125  if (SrcReg == AMDGPU::SCC) {
126  const TargetRegisterClass *RC
127  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
128  if (!RC)
129  return true;
130  return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
131  }
132 
133  if (!isVCC(SrcReg, *MRI)) {
134  // TODO: Should probably leave the copy and let copyPhysReg expand it.
135  if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
136  return false;
137 
138  const TargetRegisterClass *SrcRC
139  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
140 
141  Optional<ValueAndVReg> ConstVal =
142  getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
143  if (ConstVal) {
144  unsigned MovOpc =
145  STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
146  BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
147  .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
148  } else {
149  Register MaskedReg = MRI->createVirtualRegister(SrcRC);
150 
151  // We can't trust the high bits at this point, so clear them.
152 
153  // TODO: Skip masking high bits if def is known boolean.
154 
155  unsigned AndOpc =
156  TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
157  BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
158  .addImm(1)
159  .addReg(SrcReg);
160  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
161  .addImm(0)
162  .addReg(MaskedReg);
163  }
164 
165  if (!MRI->getRegClassOrNull(SrcReg))
166  MRI->setRegClass(SrcReg, SrcRC);
167  I.eraseFromParent();
168  return true;
169  }
170 
171  const TargetRegisterClass *RC =
172  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
173  if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
174  return false;
175 
176  return true;
177  }
178 
179  for (const MachineOperand &MO : I.operands()) {
180  if (MO.getReg().isPhysical())
181  continue;
182 
183  const TargetRegisterClass *RC =
184  TRI.getConstrainedRegClassForOperand(MO, *MRI);
185  if (!RC)
186  continue;
187  RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
188  }
189  return true;
190 }
191 
192 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
193  const Register DefReg = I.getOperand(0).getReg();
194  const LLT DefTy = MRI->getType(DefReg);
195  if (DefTy == LLT::scalar(1)) {
196  if (!AllowRiskySelect) {
197  LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
198  return false;
199  }
200 
201  LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
202  }
203 
204  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
205 
206  const RegClassOrRegBank &RegClassOrBank =
207  MRI->getRegClassOrRegBank(DefReg);
208 
209  const TargetRegisterClass *DefRC
210  = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
211  if (!DefRC) {
212  if (!DefTy.isValid()) {
213  LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
214  return false;
215  }
216 
217  const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
218  DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
219  if (!DefRC) {
220  LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
221  return false;
222  }
223  }
224 
225  // TODO: Verify that all registers have the same bank
226  I.setDesc(TII.get(TargetOpcode::PHI));
227  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
228 }
229 
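// Returns the requested 32-bit half (sub0 or sub1) of the 64-bit operand MO:
// register operands are copied through a subregister COPY into a fresh SubRC
// vreg, immediate operands are split into their low/high 32 bits.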
230 MachineOperand
231 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
232  const TargetRegisterClass &SubRC,
233  unsigned SubIdx) const {
234 
235  MachineInstr *MI = MO.getParent();
236  MachineBasicBlock *BB = MO.getParent()->getParent();
237  Register DstReg = MRI->createVirtualRegister(&SubRC);
238 
239  if (MO.isReg()) {
240  unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
241  Register Reg = MO.getReg();
242  BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
243  .addReg(Reg, 0, ComposedSubIdx);
244 
245  return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
246  MO.isKill(), MO.isDead(), MO.isUndef(),
247  MO.isEarlyClobber(), 0, MO.isDebug(),
248  MO.isInternalRead());
249  }
250 
251  assert(MO.isImm());
252 
253  APInt Imm(64, MO.getImm());
254 
255  switch (SubIdx) {
256  default:
257  llvm_unreachable("do not know how to split immediate with this sub index.");
258  case AMDGPU::sub0:
259  return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
260  case AMDGPU::sub1:
261  return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
262  }
263 }
264 
265 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
266  switch (Opc) {
267  case AMDGPU::G_AND:
268  return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
269  case AMDGPU::G_OR:
270  return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
271  case AMDGPU::G_XOR:
272  return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
273  default:
274  llvm_unreachable("not a bit op");
275  }
276 }
277 
278 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
279  Register DstReg = I.getOperand(0).getReg();
280  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
281 
282  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
283  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
284  DstRB->getID() != AMDGPU::VCCRegBankID)
285  return false;
286 
287  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
288  STI.isWave64());
289  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
290 
291  // Dead implicit-def of scc
292  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
293  true, // isImp
294  false, // isKill
295  true)); // isDead
296  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
297 }
298 
299 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
300  MachineBasicBlock *BB = I.getParent();
301  MachineFunction *MF = BB->getParent();
302  Register DstReg = I.getOperand(0).getReg();
303  const DebugLoc &DL = I.getDebugLoc();
304  LLT Ty = MRI->getType(DstReg);
305  if (Ty.isVector())
306  return false;
307 
308  unsigned Size = Ty.getSizeInBits();
309  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
310  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
311  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
312 
313  if (Size == 32) {
314  if (IsSALU) {
315  const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
316  MachineInstr *Add =
317  BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
318  .add(I.getOperand(1))
319  .add(I.getOperand(2));
320  I.eraseFromParent();
321  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
322  }
323 
324  if (STI.hasAddNoCarry()) {
325  const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
326  I.setDesc(TII.get(Opc));
327  I.addOperand(*MF, MachineOperand::CreateImm(0));
328  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
329  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
330  }
331 
332  const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
333 
334  Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
335  MachineInstr *Add
336  = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
337  .addDef(UnusedCarry, RegState::Dead)
338  .add(I.getOperand(1))
339  .add(I.getOperand(2))
340  .addImm(0);
341  I.eraseFromParent();
342  return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
343  }
344 
345  assert(!Sub && "illegal sub should not reach here");
346 
347  const TargetRegisterClass &RC
348  = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
349  const TargetRegisterClass &HalfRC
350  = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
351 
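  // 64-bit case: split each operand into 32-bit halves, add the halves with an
  // explicit carry chain (S_ADD_U32/S_ADDC_U32 for SALU, V_ADD_CO/V_ADDC for
  // VALU), and recombine the results with a REG_SEQUENCE.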
352  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
353  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
354  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
355  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
356 
357  Register DstLo = MRI->createVirtualRegister(&HalfRC);
358  Register DstHi = MRI->createVirtualRegister(&HalfRC);
359 
360  if (IsSALU) {
361  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
362  .add(Lo1)
363  .add(Lo2);
364  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
365  .add(Hi1)
366  .add(Hi2);
367  } else {
368  const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
369  Register CarryReg = MRI->createVirtualRegister(CarryRC);
370  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
371  .addDef(CarryReg)
372  .add(Lo1)
373  .add(Lo2)
374  .addImm(0);
375  MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
376  .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
377  .add(Hi1)
378  .add(Hi2)
379  .addReg(CarryReg, RegState::Kill)
380  .addImm(0);
381 
382  if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
383  return false;
384  }
385 
386  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
387  .addReg(DstLo)
388  .addImm(AMDGPU::sub0)
389  .addReg(DstHi)
390  .addImm(AMDGPU::sub1);
391 
392 
393  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
394  return false;
395 
396  I.eraseFromParent();
397  return true;
398 }
399 
400 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
401  MachineInstr &I) const {
402  MachineBasicBlock *BB = I.getParent();
403  MachineFunction *MF = BB->getParent();
404  const DebugLoc &DL = I.getDebugLoc();
405  Register Dst0Reg = I.getOperand(0).getReg();
406  Register Dst1Reg = I.getOperand(1).getReg();
407  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
408  I.getOpcode() == AMDGPU::G_UADDE;
409  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
410  I.getOpcode() == AMDGPU::G_USUBE;
411 
412  if (isVCC(Dst1Reg, *MRI)) {
413  unsigned NoCarryOpc =
414  IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
415  unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
416  I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
417  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
418  I.addOperand(*MF, MachineOperand::CreateImm(0));
419  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
420  }
421 
422  Register Src0Reg = I.getOperand(2).getReg();
423  Register Src1Reg = I.getOperand(3).getReg();
424 
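  // SALU path: the carry is modeled in SCC, so copy any carry-in into SCC
  // before the add/sub and copy SCC back out afterwards as the carry-out.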
425  if (HasCarryIn) {
426  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
427  .addReg(I.getOperand(4).getReg());
428  }
429 
430  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
431  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
432 
433  BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
434  .add(I.getOperand(2))
435  .add(I.getOperand(3));
436  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
437  .addReg(AMDGPU::SCC);
438 
439  if (!MRI->getRegClassOrNull(Dst1Reg))
440  MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
441 
442  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
443  !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
444  !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
445  return false;
446 
447  if (HasCarryIn &&
448  !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
449  AMDGPU::SReg_32RegClass, *MRI))
450  return false;
451 
452  I.eraseFromParent();
453  return true;
454 }
455 
456 // TODO: We should probably legalize these to only using 32-bit results.
457 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
458  MachineBasicBlock *BB = I.getParent();
459  Register DstReg = I.getOperand(0).getReg();
460  Register SrcReg = I.getOperand(1).getReg();
461  LLT DstTy = MRI->getType(DstReg);
462  LLT SrcTy = MRI->getType(SrcReg);
463  const unsigned SrcSize = SrcTy.getSizeInBits();
464  unsigned DstSize = DstTy.getSizeInBits();
465 
466  // TODO: Should handle any multiple of 32 offset.
467  unsigned Offset = I.getOperand(2).getImm();
468  if (Offset % 32 != 0 || DstSize > 128)
469  return false;
470 
471  // 16-bit operations really use 32-bit registers.
472  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
473  if (DstSize == 16)
474  DstSize = 32;
475 
476  const TargetRegisterClass *DstRC =
477  TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
478  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
479  return false;
480 
481  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
482  const TargetRegisterClass *SrcRC =
483  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
484  if (!SrcRC)
485  return false;
486  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
487  DstSize / 32);
488  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
489  if (!SrcRC)
490  return false;
491 
492  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
493  *SrcRC, I.getOperand(1));
494  const DebugLoc &DL = I.getDebugLoc();
495  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
496  .addReg(SrcReg, 0, SubReg);
497 
498  I.eraseFromParent();
499  return true;
500 }
501 
502 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
503  MachineBasicBlock *BB = MI.getParent();
504  Register DstReg = MI.getOperand(0).getReg();
505  LLT DstTy = MRI->getType(DstReg);
506  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
507 
508  const unsigned SrcSize = SrcTy.getSizeInBits();
509  if (SrcSize < 32)
510  return selectImpl(MI, *CoverageInfo);
511 
512  const DebugLoc &DL = MI.getDebugLoc();
513  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
514  const unsigned DstSize = DstTy.getSizeInBits();
515  const TargetRegisterClass *DstRC =
516  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
517  if (!DstRC)
518  return false;
519 
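  // Assemble the destination with a REG_SEQUENCE, placing each source at the
  // subregister index that covers its SrcSize-bit slice of the result.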
520  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
521  MachineInstrBuilder MIB =
522  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
523  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
524  MachineOperand &Src = MI.getOperand(I + 1);
525  MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
526  MIB.addImm(SubRegs[I]);
527 
528  const TargetRegisterClass *SrcRC
529  = TRI.getConstrainedRegClassForOperand(Src, *MRI);
530  if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
531  return false;
532  }
533 
534  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
535  return false;
536 
537  MI.eraseFromParent();
538  return true;
539 }
540 
541 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
542  MachineBasicBlock *BB = MI.getParent();
543  const int NumDst = MI.getNumOperands() - 1;
544 
545  MachineOperand &Src = MI.getOperand(NumDst);
546 
547  Register SrcReg = Src.getReg();
548  Register DstReg0 = MI.getOperand(0).getReg();
549  LLT DstTy = MRI->getType(DstReg0);
550  LLT SrcTy = MRI->getType(SrcReg);
551 
552  const unsigned DstSize = DstTy.getSizeInBits();
553  const unsigned SrcSize = SrcTy.getSizeInBits();
554  const DebugLoc &DL = MI.getDebugLoc();
555  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
556 
557  const TargetRegisterClass *SrcRC =
558  TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
559  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
560  return false;
561 
562  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
563  // source, and this relies on the fact that the same subregister indices are
564  // used for both.
565  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
566  for (int I = 0, E = NumDst; I != E; ++I) {
567  MachineOperand &Dst = MI.getOperand(I);
568  BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
569  .addReg(SrcReg, 0, SubRegs[I]);
570 
571  // Make sure the subregister index is valid for the source register.
572  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
573  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
574  return false;
575 
576  const TargetRegisterClass *DstRC =
577  TRI.getConstrainedRegClassForOperand(Dst, *MRI);
578  if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
579  return false;
580  }
581 
582  MI.eraseFromParent();
583  return true;
584 }
585 
586 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
587  MachineInstr &MI) const {
588  if (selectImpl(MI, *CoverageInfo))
589  return true;
590 
591  const LLT S32 = LLT::scalar(32);
592  const LLT V2S16 = LLT::vector(2, 16);
593 
594  Register Dst = MI.getOperand(0).getReg();
595  if (MRI->getType(Dst) != V2S16)
596  return false;
597 
598  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
599  if (DstBank->getID() != AMDGPU::SGPRRegBankID)
600  return false;
601 
602  Register Src0 = MI.getOperand(1).getReg();
603  Register Src1 = MI.getOperand(2).getReg();
604  if (MRI->getType(Src0) != S32)
605  return false;
606 
607  const DebugLoc &DL = MI.getDebugLoc();
608  MachineBasicBlock *BB = MI.getParent();
609 
610  auto ConstSrc1 =
611  getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
612  if (ConstSrc1) {
613  auto ConstSrc0 =
614  getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
615  if (ConstSrc0) {
616  const int64_t K0 = ConstSrc0->Value.getSExtValue();
617  const int64_t K1 = ConstSrc1->Value.getSExtValue();
618  uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
619  uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
620 
621  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
622  .addImm(Lo16 | (Hi16 << 16));
623  MI.eraseFromParent();
624  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
625  }
626  }
627 
628  // TODO: This should probably be a combine somewhere
629  // (build_vector_trunc $src0, undef) -> copy $src0
630  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
631  if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
632  MI.setDesc(TII.get(AMDGPU::COPY));
633  MI.RemoveOperand(2);
634  return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
635  RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
636  }
637 
638  Register ShiftSrc0;
639  Register ShiftSrc1;
640 
641  // With multiple uses of the shift, this will duplicate the shift and
642  // increase register pressure.
643  //
644  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
645  // => (S_PACK_HH_B32_B16 $src0, $src1)
646  // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
647  // => (S_PACK_LH_B32_B16 $src0, $src1)
648  // (build_vector_trunc $src0, $src1)
649  // => (S_PACK_LL_B32_B16 $src0, $src1)
650 
651  bool Shift0 = mi_match(
652  Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
653 
654  bool Shift1 = mi_match(
655  Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
656 
657  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
658  if (Shift0 && Shift1) {
659  Opc = AMDGPU::S_PACK_HH_B32_B16;
660  MI.getOperand(1).setReg(ShiftSrc0);
661  MI.getOperand(2).setReg(ShiftSrc1);
662  } else if (Shift1) {
663  Opc = AMDGPU::S_PACK_LH_B32_B16;
664  MI.getOperand(2).setReg(ShiftSrc1);
665  } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
666  // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
667  auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
668  .addReg(ShiftSrc0)
669  .addImm(16);
670 
671  MI.eraseFromParent();
672  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
673  }
674 
675  MI.setDesc(TII.get(Opc));
676  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
677 }
678 
679 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
680  return selectG_ADD_SUB(I);
681 }
682 
683 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
684  const MachineOperand &MO = I.getOperand(0);
685 
686  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
687  // regbank check here is to know why getConstrainedRegClassForOperand failed.
688  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
689  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
690  (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
691  I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
692  return true;
693  }
694 
695  return false;
696 }
697 
698 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
699  MachineBasicBlock *BB = I.getParent();
700 
701  Register DstReg = I.getOperand(0).getReg();
702  Register Src0Reg = I.getOperand(1).getReg();
703  Register Src1Reg = I.getOperand(2).getReg();
704  LLT Src1Ty = MRI->getType(Src1Reg);
705 
706  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
707  unsigned InsSize = Src1Ty.getSizeInBits();
708 
709  int64_t Offset = I.getOperand(3).getImm();
710 
711  // FIXME: These cases should have been illegal and unnecessary to check here.
712  if (Offset % 32 != 0 || InsSize % 32 != 0)
713  return false;
714 
715  // Currently not handled by getSubRegFromChannel.
716  if (InsSize > 128)
717  return false;
718 
719  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
720  if (SubReg == AMDGPU::NoSubRegister)
721  return false;
722 
723  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
724  const TargetRegisterClass *DstRC =
725  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
726  if (!DstRC)
727  return false;
728 
729  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
730  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
731  const TargetRegisterClass *Src0RC =
732  TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
733  const TargetRegisterClass *Src1RC =
734  TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
735 
736  // Deal with weird cases where the class only partially supports the subreg
737  // index.
738  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
739  if (!Src0RC || !Src1RC)
740  return false;
741 
742  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
743  !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
744  !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
745  return false;
746 
747  const DebugLoc &DL = I.getDebugLoc();
748  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
749  .addReg(Src0Reg)
750  .addReg(Src1Reg)
751  .addImm(SubReg);
752 
753  I.eraseFromParent();
754  return true;
755 }
756 
757 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
758  if (STI.getLDSBankCount() != 16)
759  return selectImpl(MI, *CoverageInfo);
760 
761  Register Dst = MI.getOperand(0).getReg();
762  Register Src0 = MI.getOperand(2).getReg();
763  Register M0Val = MI.getOperand(6).getReg();
764  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
765  !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
766  !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
767  return false;
768 
769  // This requires 2 instructions. It is possible to write a pattern to support
770  // this, but the generated isel emitter doesn't correctly deal with multiple
771  // output instructions using the same physical register input. The copy to m0
772  // is incorrectly placed before the second instruction.
773  //
774  // TODO: Match source modifiers.
775 
776  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
777  const DebugLoc &DL = MI.getDebugLoc();
778  MachineBasicBlock *MBB = MI.getParent();
779 
780  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
781  .addReg(M0Val);
782  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
783  .addImm(2)
784  .addImm(MI.getOperand(4).getImm()) // $attr
785  .addImm(MI.getOperand(3).getImm()); // $attrchan
786 
787  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
788  .addImm(0) // $src0_modifiers
789  .addReg(Src0) // $src0
790  .addImm(MI.getOperand(4).getImm()) // $attr
791  .addImm(MI.getOperand(3).getImm()) // $attrchan
792  .addImm(0) // $src2_modifiers
793  .addReg(InterpMov) // $src2 - 2 f16 values selected by high
794  .addImm(MI.getOperand(5).getImm()) // $high
795  .addImm(0) // $clamp
796  .addImm(0); // $omod
797 
798  MI.eraseFromParent();
799  return true;
800 }
801 
802 // Writelane is special in that it can use SGPR and M0 (which would normally
803 // count as using the constant bus twice - but in this case it is allowed since
804 // the lane selector doesn't count as a use of the constant bus). However, it is
805 // still required to abide by the 1 SGPR rule. Fix this up if we might have
806 // multiple SGPRs.
807 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
808  // With a constant bus limit of at least 2, there's no issue.
809  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
810  return selectImpl(MI, *CoverageInfo);
811 
812  MachineBasicBlock *MBB = MI.getParent();
813  const DebugLoc &DL = MI.getDebugLoc();
814  Register VDst = MI.getOperand(0).getReg();
815  Register Val = MI.getOperand(2).getReg();
816  Register LaneSelect = MI.getOperand(3).getReg();
817  Register VDstIn = MI.getOperand(4).getReg();
818 
819  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
820 
821  Optional<ValueAndVReg> ConstSelect =
822  getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
823  if (ConstSelect) {
824  // The selector has to be an inline immediate, so we can use whatever for
825  // the other operands.
826  MIB.addReg(Val);
827  MIB.addImm(ConstSelect->Value.getSExtValue() &
828  maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
829  } else {
830  Optional<ValueAndVReg> ConstVal =
831  getConstantVRegValWithLookThrough(Val, *MRI, true, true);
832 
833  // If the value written is an inline immediate, we can get away without a
834  // copy to m0.
835  if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
836  STI.hasInv2PiInlineImm())) {
837  MIB.addImm(ConstVal->Value.getSExtValue());
838  MIB.addReg(LaneSelect);
839  } else {
840  MIB.addReg(Val);
841 
842  // If the lane selector was originally in a VGPR and copied with
843  // readfirstlane, there's a hazard to read the same SGPR from the
844  // VALU. Constrain to a different SGPR to help avoid needing a nop later.
845  RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
846 
847  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
848  .addReg(LaneSelect);
849  MIB.addReg(AMDGPU::M0);
850  }
851  }
852 
853  MIB.addReg(VDstIn);
854 
855  MI.eraseFromParent();
856  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
857 }
858 
859 // We need to handle this here because tablegen doesn't support matching
860 // instructions with multiple outputs.
861 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
862  Register Dst0 = MI.getOperand(0).getReg();
863  Register Dst1 = MI.getOperand(1).getReg();
864 
865  LLT Ty = MRI->getType(Dst0);
866  unsigned Opc;
867  if (Ty == LLT::scalar(32))
868  Opc = AMDGPU::V_DIV_SCALE_F32_e64;
869  else if (Ty == LLT::scalar(64))
870  Opc = AMDGPU::V_DIV_SCALE_F64_e64;
871  else
872  return false;
873 
874  // TODO: Match source modifiers.
875 
876  const DebugLoc &DL = MI.getDebugLoc();
877  MachineBasicBlock *MBB = MI.getParent();
878 
879  Register Numer = MI.getOperand(3).getReg();
880  Register Denom = MI.getOperand(4).getReg();
881  unsigned ChooseDenom = MI.getOperand(5).getImm();
882 
883  Register Src0 = ChooseDenom != 0 ? Numer : Denom;
884 
885  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
886  .addDef(Dst1)
887  .addImm(0) // $src0_modifiers
888  .addUse(Src0) // $src0
889  .addImm(0) // $src1_modifiers
890  .addUse(Denom) // $src1
891  .addImm(0) // $src2_modifiers
892  .addUse(Numer) // $src2
893  .addImm(0) // $clamp
894  .addImm(0); // $omod
895 
896  MI.eraseFromParent();
897  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
898 }
899 
900 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
901  unsigned IntrinsicID = I.getIntrinsicID();
902  switch (IntrinsicID) {
903  case Intrinsic::amdgcn_if_break: {
904  MachineBasicBlock *BB = I.getParent();
905 
906  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
907  // SelectionDAG uses for wave32 vs wave64.
908  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
909  .add(I.getOperand(0))
910  .add(I.getOperand(2))
911  .add(I.getOperand(3));
912 
913  Register DstReg = I.getOperand(0).getReg();
914  Register Src0Reg = I.getOperand(2).getReg();
915  Register Src1Reg = I.getOperand(3).getReg();
916 
917  I.eraseFromParent();
918 
919  for (Register Reg : { DstReg, Src0Reg, Src1Reg })
920  MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
921 
922  return true;
923  }
924  case Intrinsic::amdgcn_interp_p1_f16:
925  return selectInterpP1F16(I);
926  case Intrinsic::amdgcn_wqm:
927  return constrainCopyLikeIntrin(I, AMDGPU::WQM);
928  case Intrinsic::amdgcn_softwqm:
929  return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
930  case Intrinsic::amdgcn_strict_wwm:
931  case Intrinsic::amdgcn_wwm:
932  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
933  case Intrinsic::amdgcn_strict_wqm:
934  return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
935  case Intrinsic::amdgcn_writelane:
936  return selectWritelane(I);
937  case Intrinsic::amdgcn_div_scale:
938  return selectDivScale(I);
939  case Intrinsic::amdgcn_icmp:
940  return selectIntrinsicIcmp(I);
941  case Intrinsic::amdgcn_ballot:
942  return selectBallot(I);
943  case Intrinsic::amdgcn_reloc_constant:
944  return selectRelocConstant(I);
945  case Intrinsic::amdgcn_groupstaticsize:
946  return selectGroupStaticSize(I);
947  case Intrinsic::returnaddress:
948  return selectReturnAddress(I);
949  default:
950  return selectImpl(I, *CoverageInfo);
951  }
952 }
953 
954 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
955  if (Size != 32 && Size != 64)
956  return -1;
957  switch (P) {
958  default:
959  llvm_unreachable("Unknown condition code!");
960  case CmpInst::ICMP_NE:
961  return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
962  case CmpInst::ICMP_EQ:
963  return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
964  case CmpInst::ICMP_SGT:
965  return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
966  case CmpInst::ICMP_SGE:
967  return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
968  case CmpInst::ICMP_SLT:
969  return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
970  case CmpInst::ICMP_SLE:
971  return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
972  case CmpInst::ICMP_UGT:
973  return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
974  case CmpInst::ICMP_UGE:
975  return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
976  case CmpInst::ICMP_ULT:
977  return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
978  case CmpInst::ICMP_ULE:
979  return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
980  }
981 }
982 
983 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
984  unsigned Size) const {
985  if (Size == 64) {
986  if (!STI.hasScalarCompareEq64())
987  return -1;
988 
989  switch (P) {
990  case CmpInst::ICMP_NE:
991  return AMDGPU::S_CMP_LG_U64;
992  case CmpInst::ICMP_EQ:
993  return AMDGPU::S_CMP_EQ_U64;
994  default:
995  return -1;
996  }
997  }
998 
999  if (Size != 32)
1000  return -1;
1001 
1002  switch (P) {
1003  case CmpInst::ICMP_NE:
1004  return AMDGPU::S_CMP_LG_U32;
1005  case CmpInst::ICMP_EQ:
1006  return AMDGPU::S_CMP_EQ_U32;
1007  case CmpInst::ICMP_SGT:
1008  return AMDGPU::S_CMP_GT_I32;
1009  case CmpInst::ICMP_SGE:
1010  return AMDGPU::S_CMP_GE_I32;
1011  case CmpInst::ICMP_SLT:
1012  return AMDGPU::S_CMP_LT_I32;
1013  case CmpInst::ICMP_SLE:
1014  return AMDGPU::S_CMP_LE_I32;
1015  case CmpInst::ICMP_UGT:
1016  return AMDGPU::S_CMP_GT_U32;
1017  case CmpInst::ICMP_UGE:
1018  return AMDGPU::S_CMP_GE_U32;
1019  case CmpInst::ICMP_ULT:
1020  return AMDGPU::S_CMP_LT_U32;
1021  case CmpInst::ICMP_ULE:
1022  return AMDGPU::S_CMP_LE_U32;
1023  default:
1024  llvm_unreachable("Unknown condition code!");
1025  }
1026 }
1027 
1028 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1029  MachineBasicBlock *BB = I.getParent();
1030  const DebugLoc &DL = I.getDebugLoc();
1031 
1032  Register SrcReg = I.getOperand(2).getReg();
1033  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1034 
1035  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1036 
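  // An SGPR condition result uses the scalar S_CMP form and copies SCC into the
  // result register; a VCC result uses the VALU V_CMP form, which produces a
  // wave mask directly.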
1037  Register CCReg = I.getOperand(0).getReg();
1038  if (!isVCC(CCReg, *MRI)) {
1039  int Opcode = getS_CMPOpcode(Pred, Size);
1040  if (Opcode == -1)
1041  return false;
1042  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1043  .add(I.getOperand(2))
1044  .add(I.getOperand(3));
1045  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1046  .addReg(AMDGPU::SCC);
1047  bool Ret =
1048  constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1049  RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1050  I.eraseFromParent();
1051  return Ret;
1052  }
1053 
1054  int Opcode = getV_CMPOpcode(Pred, Size);
1055  if (Opcode == -1)
1056  return false;
1057 
1058  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1059  I.getOperand(0).getReg())
1060  .add(I.getOperand(2))
1061  .add(I.getOperand(3));
1062  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1063  *TRI.getBoolRC(), *MRI);
1064  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1065  I.eraseFromParent();
1066  return Ret;
1067 }
1068 
1069 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1070  Register Dst = I.getOperand(0).getReg();
1071  if (isVCC(Dst, *MRI))
1072  return false;
1073 
1074  if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1075  return false;
1076 
1077  MachineBasicBlock *BB = I.getParent();
1078  const DebugLoc &DL = I.getDebugLoc();
1079  Register SrcReg = I.getOperand(2).getReg();
1080  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1081  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1082 
1083  int Opcode = getV_CMPOpcode(Pred, Size);
1084  if (Opcode == -1)
1085  return false;
1086 
1087  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1088  .add(I.getOperand(2))
1089  .add(I.getOperand(3));
1090  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1091  *MRI);
1092  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1093  I.eraseFromParent();
1094  return Ret;
1095 }
1096 
1097 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1098  MachineBasicBlock *BB = I.getParent();
1099  const DebugLoc &DL = I.getDebugLoc();
1100  Register DstReg = I.getOperand(0).getReg();
1101  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1102  const bool Is64 = Size == 64;
1103 
1104  if (Size != STI.getWavefrontSize())
1105  return false;
1106 
1107  Optional<ValueAndVReg> Arg =
1108  getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1109 
1110  if (Arg.hasValue()) {
1111  const int64_t Value = Arg.getValue().Value.getSExtValue();
1112  if (Value == 0) {
1113  unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1114  BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1115  } else if (Value == -1) { // all ones
1116  Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1117  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1118  } else
1119  return false;
1120  } else {
1121  Register SrcReg = I.getOperand(2).getReg();
1122  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1123  }
1124 
1125  I.eraseFromParent();
1126  return true;
1127 }
1128 
1129 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1130  Register DstReg = I.getOperand(0).getReg();
1131  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1132  const TargetRegisterClass *DstRC =
1133  TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1134  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1135  return false;
1136 
1137  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1138 
1139  Module *M = MF->getFunction().getParent();
1140  const MDNode *Metadata = I.getOperand(2).getMetadata();
1141  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1142  auto RelocSymbol = cast<GlobalVariable>(
1143  M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1144 
1145  MachineBasicBlock *BB = I.getParent();
1146  BuildMI(*BB, &I, I.getDebugLoc(),
1147  TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1148  .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1149 
1150  I.eraseFromParent();
1151  return true;
1152 }
1153 
1154 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1155  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1156 
1157  Register DstReg = I.getOperand(0).getReg();
1158  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1159  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1160  AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1161 
1162  MachineBasicBlock *MBB = I.getParent();
1163  const DebugLoc &DL = I.getDebugLoc();
1164 
1165  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1166 
1167  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1168  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1169  MIB.addImm(MFI->getLDSSize());
1170  } else {
1171  Module *M = MF->getFunction().getParent();
1172  const GlobalValue *GV
1173  = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1174  MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1175  }
1176 
1177  I.eraseFromParent();
1178  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1179 }
1180 
1181 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1182  MachineBasicBlock *MBB = I.getParent();
1183  MachineFunction &MF = *MBB->getParent();
1184  const DebugLoc &DL = I.getDebugLoc();
1185 
1186  MachineOperand &Dst = I.getOperand(0);
1187  Register DstReg = Dst.getReg();
1188  unsigned Depth = I.getOperand(2).getImm();
1189 
1190  const TargetRegisterClass *RC
1191  = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1192  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1193  !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1194  return false;
1195 
1196  // Check for kernel and shader functions
1197  if (Depth != 0 ||
1198  MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1199  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1200  .addImm(0);
1201  I.eraseFromParent();
1202  return true;
1203  }
1204 
1205  MachineFrameInfo &MFI = MF.getFrameInfo();
1206  // There is a call to @llvm.returnaddress in this function
1207  MFI.setReturnAddressIsTaken(true);
1208 
1209  // Get the return address reg and mark it as an implicit live-in
1210  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1211  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1212  AMDGPU::SReg_64RegClass);
1213  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1214  .addReg(LiveIn);
1215  I.eraseFromParent();
1216  return true;
1217 }
1218 
1219 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1220  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1221  // SelectionDAG uses for wave32 vs wave64.
1222  MachineBasicBlock *BB = MI.getParent();
1223  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1224  .add(MI.getOperand(1));
1225 
1226  Register Reg = MI.getOperand(1).getReg();
1227  MI.eraseFromParent();
1228 
1229  if (!MRI->getRegClassOrNull(Reg))
1230  MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1231  return true;
1232 }
1233 
1234 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1235  MachineInstr &MI, Intrinsic::ID IntrID) const {
1236  MachineBasicBlock *MBB = MI.getParent();
1237  MachineFunction *MF = MBB->getParent();
1238  const DebugLoc &DL = MI.getDebugLoc();
1239 
1240  unsigned IndexOperand = MI.getOperand(7).getImm();
1241  bool WaveRelease = MI.getOperand(8).getImm() != 0;
1242  bool WaveDone = MI.getOperand(9).getImm() != 0;
1243 
1244  if (WaveDone && !WaveRelease)
1245  report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1246 
1247  unsigned OrderedCountIndex = IndexOperand & 0x3f;
1248  IndexOperand &= ~0x3f;
1249  unsigned CountDw = 0;
1250 
1251  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1252  CountDw = (IndexOperand >> 24) & 0xf;
1253  IndexOperand &= ~(0xf << 24);
1254 
1255  if (CountDw < 1 || CountDw > 4) {
1256  report_fatal_error(
1257  "ds_ordered_count: dword count must be between 1 and 4");
1258  }
1259  }
1260 
1261  if (IndexOperand)
1262  report_fatal_error("ds_ordered_count: bad index operand");
1263 
1264  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1265  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1266 
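  // Pack the ds_ordered_count offset field: offset0 is the ordered-count index
  // times 4; offset1 holds wave_release (bit 0), wave_done (bit 1), the shader
  // type (starting at bit 2), add-vs-swap (bit 4) and, on GFX10+, dword
  // count - 1 (bits 6-7). The final immediate is offset0 | (offset1 << 8).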
1267  unsigned Offset0 = OrderedCountIndex << 2;
1268  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1269  (Instruction << 4);
1270 
1271  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1272  Offset1 |= (CountDw - 1) << 6;
1273 
1274  unsigned Offset = Offset0 | (Offset1 << 8);
1275 
1276  Register M0Val = MI.getOperand(2).getReg();
1277  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1278  .addReg(M0Val);
1279 
1280  Register DstReg = MI.getOperand(0).getReg();
1281  Register ValReg = MI.getOperand(3).getReg();
1282  MachineInstrBuilder DS =
1283  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1284  .addReg(ValReg)
1285  .addImm(Offset)
1286  .cloneMemRefs(MI);
1287 
1288  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1289  return false;
1290 
1291  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1292  MI.eraseFromParent();
1293  return Ret;
1294 }
1295 
1296 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1297  switch (IntrID) {
1298  case Intrinsic::amdgcn_ds_gws_init:
1299  return AMDGPU::DS_GWS_INIT;
1300  case Intrinsic::amdgcn_ds_gws_barrier:
1301  return AMDGPU::DS_GWS_BARRIER;
1302  case Intrinsic::amdgcn_ds_gws_sema_v:
1303  return AMDGPU::DS_GWS_SEMA_V;
1304  case Intrinsic::amdgcn_ds_gws_sema_br:
1305  return AMDGPU::DS_GWS_SEMA_BR;
1306  case Intrinsic::amdgcn_ds_gws_sema_p:
1307  return AMDGPU::DS_GWS_SEMA_P;
1308  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1309  return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1310  default:
1311  llvm_unreachable("not a gws intrinsic");
1312  }
1313 }
1314 
1315 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1316  Intrinsic::ID IID) const {
1317  if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1318  !STI.hasGWSSemaReleaseAll())
1319  return false;
1320 
1321  // intrinsic ID, vsrc, offset
1322  const bool HasVSrc = MI.getNumOperands() == 3;
1323  assert(HasVSrc || MI.getNumOperands() == 2);
1324 
1325  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1326  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1327  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1328  return false;
1329 
1330  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1331  assert(OffsetDef);
1332 
1333  unsigned ImmOffset;
1334 
1335  MachineBasicBlock *MBB = MI.getParent();
1336  const DebugLoc &DL = MI.getDebugLoc();
1337 
1338  MachineInstr *Readfirstlane = nullptr;
1339 
1340  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1341  // incoming offset, in case there's an add of a constant. We'll have to put it
1342  // back later.
1343  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1344  Readfirstlane = OffsetDef;
1345  BaseOffset = OffsetDef->getOperand(1).getReg();
1346  OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1347  }
1348 
1349  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1350  // If we have a constant offset, try to use the 0 in m0 as the base.
1351  // TODO: Look into changing the default m0 initialization value. If the
1352  // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1353  // the immediate offset.
1354 
1355  ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1356  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1357  .addImm(0);
1358  } else {
1359  std::tie(BaseOffset, ImmOffset) =
1360  AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1361 
1362  if (Readfirstlane) {
1363  // We have the constant offset now, so put the readfirstlane back on the
1364  // variable component.
1365  if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1366  return false;
1367 
1368  Readfirstlane->getOperand(1).setReg(BaseOffset);
1369  BaseOffset = Readfirstlane->getOperand(0).getReg();
1370  } else {
1371  if (!RBI.constrainGenericRegister(BaseOffset,
1372  AMDGPU::SReg_32RegClass, *MRI))
1373  return false;
1374  }
1375 
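  // As noted below, the hardware adds M0[21:16] into the resource id, so shift
  // the variable part of the offset into that bit range before copying it to m0.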
1376  Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1377  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1378  .addReg(BaseOffset)
1379  .addImm(16);
1380 
1381  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1382  .addReg(M0Base);
1383  }
1384 
1385  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1386  // offset field) % 64. Some versions of the programming guide omit the m0
1387  // part, or claim it's from offset 0.
1388  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1389 
1390  if (HasVSrc) {
1391  Register VSrc = MI.getOperand(1).getReg();
1392  MIB.addReg(VSrc);
1393  if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1394  return false;
1395  }
1396 
1397  MIB.addImm(ImmOffset)
1398  .cloneMemRefs(MI);
1399 
1400  MI.eraseFromParent();
1401  return true;
1402 }
1403 
1404 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1405  bool IsAppend) const {
1406  Register PtrBase = MI.getOperand(2).getReg();
1407  LLT PtrTy = MRI->getType(PtrBase);
1408  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1409 
1410  unsigned Offset;
1411  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1412 
1413  // TODO: Should this try to look through readfirstlane like GWS?
1414  if (!isDSOffsetLegal(PtrBase, Offset)) {
1415  PtrBase = MI.getOperand(2).getReg();
1416  Offset = 0;
1417  }
1418 
1419  MachineBasicBlock *MBB = MI.getParent();
1420  const DebugLoc &DL = MI.getDebugLoc();
1421  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1422 
1423  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1424  .addReg(PtrBase);
1425  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1426  return false;
1427 
1428  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1429  .addImm(Offset)
1430  .addImm(IsGDS ? -1 : 0)
1431  .cloneMemRefs(MI);
1432  MI.eraseFromParent();
1433  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1434 }
1435 
1436 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1437  if (TM.getOptLevel() > CodeGenOpt::None) {
1438  unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1439  if (WGSize <= STI.getWavefrontSize()) {
1440  MachineBasicBlock *MBB = MI.getParent();
1441  const DebugLoc &DL = MI.getDebugLoc();
1442  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1443  MI.eraseFromParent();
1444  return true;
1445  }
1446  }
1447  return selectImpl(MI, *CoverageInfo);
1448 }
1449 
1450 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1451  bool &IsTexFail) {
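  // texfailctrl is a two-bit field: bit 0 enables TFE and bit 1 enables LWE; any
  // nonzero value means the operation needs tex-fail handling, and any other set
  // bit is rejected by returning false.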
1452  if (TexFailCtrl)
1453  IsTexFail = true;
1454 
1455  TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1456  TexFailCtrl &= ~(uint64_t)0x1;
1457  LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1458  TexFailCtrl &= ~(uint64_t)0x2;
1459 
1460  return TexFailCtrl == 0;
1461 }
1462 
1463 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1464  MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1465  MachineBasicBlock *MBB = MI.getParent();
1466  const DebugLoc &DL = MI.getDebugLoc();
1467 
1468  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1469  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1470 
1471  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1472  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1473  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1474  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1475  AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1476  unsigned IntrOpcode = Intr->BaseOpcode;
1477  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1478 
1479  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1480 
1481  Register VDataIn, VDataOut;
1482  LLT VDataTy;
1483  int NumVDataDwords = -1;
1484  bool IsD16 = false;
1485 
1486  bool Unorm;
1487  if (!BaseOpcode->Sampler)
1488  Unorm = true;
1489  else
1490  Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1491 
1492  bool TFE;
1493  bool LWE;
1494  bool IsTexFail = false;
1495  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1496  TFE, LWE, IsTexFail))
1497  return false;
1498 
1499  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1500  const bool IsA16 = (Flags & 1) != 0;
1501  const bool IsG16 = (Flags & 2) != 0;
1502 
1503  // A16 implies 16 bit gradients
1504  if (IsA16 && !IsG16)
1505  return false;
1506 
1507  unsigned DMask = 0;
1508  unsigned DMaskLanes = 0;
1509 
1510  if (BaseOpcode->Atomic) {
1511  VDataOut = MI.getOperand(0).getReg();
1512  VDataIn = MI.getOperand(2).getReg();
1513  LLT Ty = MRI->getType(VDataIn);
1514 
1515  // Be careful to allow atomic swap on 16-bit element vectors.
1516  const bool Is64Bit = BaseOpcode->AtomicX2 ?
1517  Ty.getSizeInBits() == 128 :
1518  Ty.getSizeInBits() == 64;
1519 
1520  if (BaseOpcode->AtomicX2) {
1521  assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1522 
1523  DMask = Is64Bit ? 0xf : 0x3;
1524  NumVDataDwords = Is64Bit ? 4 : 2;
1525  } else {
1526  DMask = Is64Bit ? 0x3 : 0x1;
1527  NumVDataDwords = Is64Bit ? 2 : 1;
1528  }
1529  } else {
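  // Non-atomic case: dmask selects which data components are enabled; gather4
  // always returns four components regardless of dmask.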
1530  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1531  DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1532 
1533  // One memoperand is mandatory, except for getresinfo.
1534  // FIXME: Check this in verifier.
1535  if (!MI.memoperands_empty()) {
1536  const MachineMemOperand *MMO = *MI.memoperands_begin();
1537 
1538  // Infer d16 from the memory size, as the register type will be mangled by
1539  // unpacked subtargets, or by TFE.
1540  IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1541  }
1542 
1543  if (BaseOpcode->Store) {
1544  VDataIn = MI.getOperand(1).getReg();
1545  VDataTy = MRI->getType(VDataIn);
1546  NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1547  } else {
1548  VDataOut = MI.getOperand(0).getReg();
1549  VDataTy = MRI->getType(VDataOut);
1550  NumVDataDwords = DMaskLanes;
1551 
1552  if (IsD16 && !STI.hasUnpackedD16VMem())
1553  NumVDataDwords = (DMaskLanes + 1) / 2;
1554  }
1555  }
1556 
1557  // Optimize _L to _LZ when _L is zero
1558  if (LZMappingInfo) {
1559  // The legalizer replaced the register with an immediate 0 if we need to
1560  // change the opcode.
1561  const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1562  if (Lod.isImm()) {
1563  assert(Lod.getImm() == 0);
1564  IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
1565  }
1566  }
1567 
1568  // Optimize _mip away, when 'lod' is zero
1569  if (MIPMappingInfo) {
1570  const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1571  if (Lod.isImm()) {
1572  assert(Lod.getImm() == 0);
1573  IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
1574  }
1575  }
1576 
1577  // Set G16 opcode
1578  if (IsG16 && !IsA16) {
1579  const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1580  AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1581  assert(G16MappingInfo);
1582  IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1583  }
1584 
1585  // TODO: Check this in verifier.
1586  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1587 
1588  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1589  if (BaseOpcode->Atomic)
1590  CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1591  if (CPol & ~AMDGPU::CPol::ALL)
1592  return false;
1593 
1594  int NumVAddrRegs = 0;
1595  int NumVAddrDwords = 0;
1596  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1597  // Skip the $noregs and 0s inserted during legalization.
1598  MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1599  if (!AddrOp.isReg())
1600  continue; // XXX - Break?
1601 
1602  Register Addr = AddrOp.getReg();
1603  if (!Addr)
1604  break;
1605 
1606  ++NumVAddrRegs;
1607  NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1608  }
1609 
1610  // The legalizer preprocessed the intrinsic arguments. If we aren't using
1611  // NSA, these should have been packed into a single value in the first
1612  // address register
1613  const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1614  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1615  LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1616  return false;
1617  }
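// For example, three separate 32-bit address registers give NumVAddrRegs ==
// NumVAddrDwords == 3 and select the NSA encoding, while a single packed vector
// address register gives NumVAddrRegs == 1 and keeps the default encoding
// regardless of how many dwords it spans.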
1618 
1619  if (IsTexFail)
1620  ++NumVDataDwords;
1621 
1622  int Opcode = -1;
1623  if (IsGFX10Plus) {
1624  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1625  UseNSA ? AMDGPU::MIMGEncGfx10NSA
1626  : AMDGPU::MIMGEncGfx10Default,
1627  NumVDataDwords, NumVAddrDwords);
1628  } else {
1630  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1631  NumVDataDwords, NumVAddrDwords);
1632  if (Opcode == -1)
1633  Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1634  NumVDataDwords, NumVAddrDwords);
1635  }
1636  assert(Opcode != -1);
1637 
1638  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1639  .cloneMemRefs(MI);
1640 
1641  if (VDataOut) {
1642  if (BaseOpcode->AtomicX2) {
1643  const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1644 
1645  Register TmpReg = MRI->createVirtualRegister(
1646  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1647  unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1648 
1649  MIB.addDef(TmpReg);
1650  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1651  .addReg(TmpReg, RegState::Kill, SubReg);
1652 
1653  } else {
1654  MIB.addDef(VDataOut); // vdata output
1655  }
1656  }
1657 
1658  if (VDataIn)
1659  MIB.addReg(VDataIn); // vdata input
1660 
1661  for (int I = 0; I != NumVAddrRegs; ++I) {
1662  MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1663  if (SrcOp.isReg()) {
1664  assert(SrcOp.getReg() != 0);
1665  MIB.addReg(SrcOp.getReg());
1666  }
1667  }
1668 
1669  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1670  if (BaseOpcode->Sampler)
1671  MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1672 
1673  MIB.addImm(DMask); // dmask
1674 
1675  if (IsGFX10Plus)
1676  MIB.addImm(DimInfo->Encoding);
1677  MIB.addImm(Unorm);
1678 
1679  MIB.addImm(CPol);
1680  MIB.addImm(IsA16 && // a16 or r128
1681  STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1682  if (IsGFX10Plus)
1683  MIB.addImm(IsA16 ? -1 : 0);
1684 
1685  MIB.addImm(TFE); // tfe
1686  MIB.addImm(LWE); // lwe
1687  if (!IsGFX10Plus)
1688  MIB.addImm(DimInfo->DA ? -1 : 0);
1689  if (BaseOpcode->HasD16)
1690  MIB.addImm(IsD16 ? -1 : 0);
1691 
1692  if (IsTexFail) {
1693  // An image load instruction with TFE/LWE only conditionally writes to its
1694  // result registers. Initialize them to zero so that we always get well
1695  // defined result values.
1696  assert(VDataOut && !VDataIn);
1697  Register Tied = MRI->cloneVirtualRegister(VDataOut);
1698  Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1699  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero)
1700  .addImm(0);
1701  auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4);
1702  if (STI.usePRTStrictNull()) {
1703  // With enable-prt-strict-null enabled, initialize all result registers to
1704  // zero.
1705  auto RegSeq =
1706  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1707  for (auto Sub : Parts)
1708  RegSeq.addReg(Zero).addImm(Sub);
1709  } else {
1710  // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE
1711  // result register.
1712  Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1713  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
1714  auto RegSeq =
1715  BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied);
1716  for (auto Sub : Parts.drop_back(1))
1717  RegSeq.addReg(Undef).addImm(Sub);
1718  RegSeq.addReg(Zero).addImm(Parts.back());
1719  }
1720  MIB.addReg(Tied, RegState::Implicit);
1721  MIB->tieOperands(0, MIB->getNumOperands() - 1);
1722  }
1723 
1724  MI.eraseFromParent();
1725  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1726 }
1727 
1728 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1729  MachineInstr &I) const {
1730  unsigned IntrinsicID = I.getIntrinsicID();
1731  switch (IntrinsicID) {
1732  case Intrinsic::amdgcn_end_cf:
1733  return selectEndCfIntrinsic(I);
1734  case Intrinsic::amdgcn_ds_ordered_add:
1735  case Intrinsic::amdgcn_ds_ordered_swap:
1736  return selectDSOrderedIntrinsic(I, IntrinsicID);
1737  case Intrinsic::amdgcn_ds_gws_init:
1738  case Intrinsic::amdgcn_ds_gws_barrier:
1739  case Intrinsic::amdgcn_ds_gws_sema_v:
1740  case Intrinsic::amdgcn_ds_gws_sema_br:
1741  case Intrinsic::amdgcn_ds_gws_sema_p:
1742  case Intrinsic::amdgcn_ds_gws_sema_release_all:
1743  return selectDSGWSIntrinsic(I, IntrinsicID);
1744  case Intrinsic::amdgcn_ds_append:
1745  return selectDSAppendConsume(I, true);
1746  case Intrinsic::amdgcn_ds_consume:
1747  return selectDSAppendConsume(I, false);
1748  case Intrinsic::amdgcn_s_barrier:
1749  return selectSBarrier(I);
1750  case Intrinsic::amdgcn_global_atomic_fadd:
1751  return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3));
1752  default: {
1753  return selectImpl(I, *CoverageInfo);
1754  }
1755  }
1756 }
1757 
1758 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1759  if (selectImpl(I, *CoverageInfo))
1760  return true;
1761 
1762  MachineBasicBlock *BB = I.getParent();
1763  const DebugLoc &DL = I.getDebugLoc();
1764 
1765  Register DstReg = I.getOperand(0).getReg();
1766  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1767  assert(Size <= 32 || Size == 64);
1768  const MachineOperand &CCOp = I.getOperand(1);
1769  Register CCReg = CCOp.getReg();
1770  if (!isVCC(CCReg, *MRI)) {
1771  unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1772  AMDGPU::S_CSELECT_B32;
1773  MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1774  .addReg(CCReg);
1775 
1776  // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1777  // bank, because it does not cover the register class used to represent it, so we
1778  // need to set the register class manually here.
1779  if (!MRI->getRegClassOrNull(CCReg))
1780  MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1781  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1782  .add(I.getOperand(2))
1783  .add(I.getOperand(3));
1784 
1785  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1786  constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1787  I.eraseFromParent();
1788  return Ret;
1789  }
1790 
1791  // Wide VGPR select should have been split in RegBankSelect.
1792  if (Size > 32)
1793  return false;
1794 
1795  MachineInstr *Select =
1796  BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1797  .addImm(0)
1798  .add(I.getOperand(3))
1799  .addImm(0)
1800  .add(I.getOperand(2))
1801  .add(I.getOperand(1));
1802 
1803  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1804  I.eraseFromParent();
1805  return Ret;
1806 }
1807 
1808 static int sizeToSubRegIndex(unsigned Size) {
1809  switch (Size) {
1810  case 32:
1811  return AMDGPU::sub0;
1812  case 64:
1813  return AMDGPU::sub0_sub1;
1814  case 96:
1815  return AMDGPU::sub0_sub1_sub2;
1816  case 128:
1817  return AMDGPU::sub0_sub1_sub2_sub3;
1818  case 256:
1819  return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1820  default:
1821  if (Size < 32)
1822  return AMDGPU::sub0;
1823  if (Size > 256)
1824  return -1;
1825  return sizeToSubRegIndex(PowerOf2Ceil(Size));
1826  }
1827 }
1828 
1829 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1830  Register DstReg = I.getOperand(0).getReg();
1831  Register SrcReg = I.getOperand(1).getReg();
1832  const LLT DstTy = MRI->getType(DstReg);
1833  const LLT SrcTy = MRI->getType(SrcReg);
1834  const LLT S1 = LLT::scalar(1);
1835 
1836  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1837  const RegisterBank *DstRB;
1838  if (DstTy == S1) {
1839  // This is a special case. We don't treat s1 for legalization artifacts as
1840  // vcc booleans.
1841  DstRB = SrcRB;
1842  } else {
1843  DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1844  if (SrcRB != DstRB)
1845  return false;
1846  }
1847 
1848  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1849 
1850  unsigned DstSize = DstTy.getSizeInBits();
1851  unsigned SrcSize = SrcTy.getSizeInBits();
1852 
1853  const TargetRegisterClass *SrcRC
1854  = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1855  const TargetRegisterClass *DstRC
1856  = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1857  if (!SrcRC || !DstRC)
1858  return false;
1859 
1860  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1861  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1862  LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1863  return false;
1864  }
1865 
1866  if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1867  MachineBasicBlock *MBB = I.getParent();
1868  const DebugLoc &DL = I.getDebugLoc();
1869 
1870  Register LoReg = MRI->createVirtualRegister(DstRC);
1871  Register HiReg = MRI->createVirtualRegister(DstRC);
1872  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1873  .addReg(SrcReg, 0, AMDGPU::sub0);
1874  BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1875  .addReg(SrcReg, 0, AMDGPU::sub1);
1876 
1877  if (IsVALU && STI.hasSDWA()) {
1878  // Write the low 16-bits of the high element into the high 16-bits of the
1879  // low element.
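// In effect Dst = ((Hi & 0xffff) << 16) | (Lo & 0xffff), which is also what
// the shift/and/or fallback below produces.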
1880  MachineInstr *MovSDWA =
1881  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1882  .addImm(0) // $src0_modifiers
1883  .addReg(HiReg) // $src0
1884  .addImm(0) // $clamp
1885  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
1886  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1887  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
1888  .addReg(LoReg, RegState::Implicit);
1889  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1890  } else {
1891  Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1892  Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1893  Register ImmReg = MRI->createVirtualRegister(DstRC);
1894  if (IsVALU) {
1895  BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1896  .addImm(16)
1897  .addReg(HiReg);
1898  } else {
1899  BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1900  .addReg(HiReg)
1901  .addImm(16);
1902  }
1903 
1904  unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1905  unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1906  unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1907 
1908  BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1909  .addImm(0xffff);
1910  BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1911  .addReg(LoReg)
1912  .addReg(ImmReg);
1913  BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1914  .addReg(TmpReg0)
1915  .addReg(TmpReg1);
1916  }
1917 
1918  I.eraseFromParent();
1919  return true;
1920  }
1921 
1922  if (!DstTy.isScalar())
1923  return false;
1924 
1925  if (SrcSize > 32) {
1926  int SubRegIdx = sizeToSubRegIndex(DstSize);
1927  if (SubRegIdx == -1)
1928  return false;
1929 
1930  // Deal with weird cases where the class only partially supports the subreg
1931  // index.
1932  const TargetRegisterClass *SrcWithSubRC
1933  = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1934  if (!SrcWithSubRC)
1935  return false;
1936 
1937  if (SrcWithSubRC != SrcRC) {
1938  if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1939  return false;
1940  }
1941 
1942  I.getOperand(1).setSubReg(SubRegIdx);
1943  }
1944 
1945  I.setDesc(TII.get(TargetOpcode::COPY));
1946  return true;
1947 }
1948 
1949 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1950 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1951  Mask = maskTrailingOnes<unsigned>(Size);
1952  int SignedMask = static_cast<int>(Mask);
1953  return SignedMask >= -16 && SignedMask <= 64;
1954 }
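// A rough illustration: for Size = 4 the mask is 0xf (15), which fits the
// inline-immediate range [-16, 64] and so needs no extra literal; for Size = 8
// the mask is 0xff (255), which does not fit, so the BFE form is preferred.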
1955 
1956 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1957 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1958  Register Reg, const MachineRegisterInfo &MRI,
1959  const TargetRegisterInfo &TRI) const {
1960  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1961  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1962  return RB;
1963 
1964  // Ignore the type, since we don't use vcc in artifacts.
1965  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1966  return &RBI.getRegBankFromRegClass(*RC, LLT());
1967  return nullptr;
1968 }
1969 
1970 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1971  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1972  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1973  const DebugLoc &DL = I.getDebugLoc();
1974  MachineBasicBlock &MBB = *I.getParent();
1975  const Register DstReg = I.getOperand(0).getReg();
1976  const Register SrcReg = I.getOperand(1).getReg();
1977 
1978  const LLT DstTy = MRI->getType(DstReg);
1979  const LLT SrcTy = MRI->getType(SrcReg);
1980  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1981  I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1982  const unsigned DstSize = DstTy.getSizeInBits();
1983  if (!DstTy.isScalar())
1984  return false;
1985 
1986  // Artifact casts should never use vcc.
1987  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1988 
1989  // FIXME: This should probably be illegal and split earlier.
1990  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1991  if (DstSize <= 32)
1992  return selectCOPY(I);
1993 
1994  const TargetRegisterClass *SrcRC =
1995  TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1996  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1997  const TargetRegisterClass *DstRC =
1998  TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1999 
2000  Register UndefReg = MRI->createVirtualRegister(SrcRC);
2001  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2002  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2003  .addReg(SrcReg)
2004  .addImm(AMDGPU::sub0)
2005  .addReg(UndefReg)
2006  .addImm(AMDGPU::sub1);
2007  I.eraseFromParent();
2008 
2009  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2010  RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2011  }
2012 
2013  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2014  // 64-bit should have been split up in RegBankSelect
2015 
2016  // Try to use an and with a mask if it will save code size.
2017  unsigned Mask;
2018  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2019  MachineInstr *ExtI =
2020  BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2021  .addImm(Mask)
2022  .addReg(SrcReg);
2023  I.eraseFromParent();
2024  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2025  }
2026 
2027  const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2028  MachineInstr *ExtI =
2029  BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2030  .addReg(SrcReg)
2031  .addImm(0) // Offset
2032  .addImm(SrcSize); // Width
2033  I.eraseFromParent();
2034  return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2035  }
2036 
2037  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2038  const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2039  AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2040  if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2041  return false;
2042 
2043  if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2044  const unsigned SextOpc = SrcSize == 8 ?
2045  AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2046  BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2047  .addReg(SrcReg);
2048  I.eraseFromParent();
2049  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2050  }
2051 
2052  const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2053  const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2054 
2055  // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
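// e.g. an extension of SrcSize = 8 starting at bit 0 uses the immediate
// (8 << 16) | 0 = 0x80000.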
2056  if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2057  // We need a 64-bit register source, but the high bits don't matter.
2058  Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2059  Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060  unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2061 
2062  BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2063  BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2064  .addReg(SrcReg, 0, SubReg)
2065  .addImm(AMDGPU::sub0)
2066  .addReg(UndefReg)
2067  .addImm(AMDGPU::sub1);
2068 
2069  BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2070  .addReg(ExtReg)
2071  .addImm(SrcSize << 16);
2072 
2073  I.eraseFromParent();
2074  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2075  }
2076 
2077  unsigned Mask;
2078  if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2079  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2080  .addReg(SrcReg)
2081  .addImm(Mask);
2082  } else {
2083  BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2084  .addReg(SrcReg)
2085  .addImm(SrcSize << 16);
2086  }
2087 
2088  I.eraseFromParent();
2089  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2090  }
2091 
2092  return false;
2093 }
2094 
2095 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2096  MachineBasicBlock *BB = I.getParent();
2097  MachineOperand &ImmOp = I.getOperand(1);
2098  Register DstReg = I.getOperand(0).getReg();
2099  unsigned Size = MRI->getType(DstReg).getSizeInBits();
2100 
2101  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2102  if (ImmOp.isFPImm()) {
2103  const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2104  ImmOp.ChangeToImmediate(Imm.getZExtValue());
2105  } else if (ImmOp.isCImm()) {
2106  ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2107  } else {
2108  llvm_unreachable("Not supported by g_constants");
2109  }
2110 
2111  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2112  const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2113 
2114  unsigned Opcode;
2115  if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2116  Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2117  } else {
2118  Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2119 
2120  // We should never produce s1 values on banks other than VCC. If the user of
2121  // this already constrained the register, we may incorrectly think it's VCC
2122  // if it wasn't originally.
2123  if (Size == 1)
2124  return false;
2125  }
2126 
2127  if (Size != 64) {
2128  I.setDesc(TII.get(Opcode));
2129  I.addImplicitDefUseOperands(*MF);
2130  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2131  }
2132 
2133  const DebugLoc &DL = I.getDebugLoc();
2134 
2135  APInt Imm(Size, I.getOperand(1).getImm());
2136 
2137  MachineInstr *ResInst;
2138  if (IsSgpr && TII.isInlineConstant(Imm)) {
2139  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2140  .addImm(I.getOperand(1).getImm());
2141  } else {
2142  const TargetRegisterClass *RC = IsSgpr ?
2143  &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2144  Register LoReg = MRI->createVirtualRegister(RC);
2145  Register HiReg = MRI->createVirtualRegister(RC);
2146 
2147  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2148  .addImm(Imm.trunc(32).getZExtValue());
2149 
2150  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2151  .addImm(Imm.ashr(32).getZExtValue());
2152 
2153  ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2154  .addReg(LoReg)
2155  .addImm(AMDGPU::sub0)
2156  .addReg(HiReg)
2157  .addImm(AMDGPU::sub1);
2158  }
2159 
2160  // We can't call constrainSelectedInstRegOperands here, because it doesn't
2161  // work for target-independent opcodes.
2162  I.eraseFromParent();
2163  const TargetRegisterClass *DstRC =
2164  TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2165  if (!DstRC)
2166  return true;
2167  return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2168 }
2169 
2170 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2171  // Only manually handle the f64 SGPR case.
2172  //
2173  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2174  // the bit ops theoretically have a second result due to the implicit def of
2175  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2176  // that is easy by disabling the check. The result works, but uses a
2177  // nonsensical sreg32orlds_and_sreg_1 regclass.
2178  //
2179  // The DAG emitter is more problematic, and incorrectly adds both results of
2180  // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2181 
2182  Register Dst = MI.getOperand(0).getReg();
2183  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2184  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2185  MRI->getType(Dst) != LLT::scalar(64))
2186  return false;
2187 
2188  Register Src = MI.getOperand(1).getReg();
2189  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2190  if (Fabs)
2191  Src = Fabs->getOperand(1).getReg();
2192 
2193  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2194  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2195  return false;
2196 
2197  MachineBasicBlock *BB = MI.getParent();
2198  const DebugLoc &DL = MI.getDebugLoc();
2199  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2200  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2201  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2202  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2203 
2204  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2205  .addReg(Src, 0, AMDGPU::sub0);
2206  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2207  .addReg(Src, 0, AMDGPU::sub1);
2208  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2209  .addImm(0x80000000);
2210 
2211  // Set or toggle sign bit.
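// XOR with 0x80000000 toggles the sign bit of the high dword for a plain fneg;
// if the source was a G_FABS, OR forces the bit set instead, folding
// fneg(fabs(x)) into one operation on the high half.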
2212  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2213  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2214  .addReg(HiReg)
2215  .addReg(ConstReg);
2216  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2217  .addReg(LoReg)
2218  .addImm(AMDGPU::sub0)
2219  .addReg(OpReg)
2220  .addImm(AMDGPU::sub1);
2221  MI.eraseFromParent();
2222  return true;
2223 }
2224 
2225 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2226 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2227  Register Dst = MI.getOperand(0).getReg();
2228  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2229  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2230  MRI->getType(Dst) != LLT::scalar(64))
2231  return false;
2232 
2233  Register Src = MI.getOperand(1).getReg();
2234  MachineBasicBlock *BB = MI.getParent();
2235  const DebugLoc &DL = MI.getDebugLoc();
2236  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2237  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2238  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2239  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2240 
2241  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2242  !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2243  return false;
2244 
2245  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2246  .addReg(Src, 0, AMDGPU::sub0);
2247  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2248  .addReg(Src, 0, AMDGPU::sub1);
2249  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2250  .addImm(0x7fffffff);
2251 
2252  // Clear sign bit.
2253  // TODO: Should this use S_BITSET0_*?
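// 0x7fffffff keeps the low 31 bits of the high dword, i.e. it clears bit 63 of
// the original f64 value.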
2254  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2255  .addReg(HiReg)
2256  .addReg(ConstReg);
2257  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2258  .addReg(LoReg)
2259  .addImm(AMDGPU::sub0)
2260  .addReg(OpReg)
2261  .addImm(AMDGPU::sub1);
2262 
2263  MI.eraseFromParent();
2264  return true;
2265 }
2266 
2267 static bool isConstant(const MachineInstr &MI) {
2268  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2269 }
2270 
2271 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2272  const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2273 
2274  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2275 
2276  assert(PtrMI);
2277 
2278  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2279  return;
2280 
2281  GEPInfo GEPInfo(*PtrMI);
2282 
2283  for (unsigned i = 1; i != 3; ++i) {
2284  const MachineOperand &GEPOp = PtrMI->getOperand(i);
2285  const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2286  assert(OpDef);
2287  if (i == 2 && isConstant(*OpDef)) {
2288  // TODO: Could handle constant base + variable offset, but a combine
2289  // probably should have commuted it.
2290  assert(GEPInfo.Imm == 0);
2291  GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2292  continue;
2293  }
2294  const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2295  if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2296  GEPInfo.SgprParts.push_back(GEPOp.getReg());
2297  else
2298  GEPInfo.VgprParts.push_back(GEPOp.getReg());
2299  }
2300 
2301  AddrInfo.push_back(GEPInfo);
2302  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2303 }
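// As a rough example, a load whose pointer is %p = G_PTR_ADD %base, %c, with %c
// defined by a G_CONSTANT of 16, records one GEPInfo with Imm = 16 and %base in
// SgprParts or VgprParts according to its register bank; the walk then repeats
// on the instruction defining %base.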
2304 
2305 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2306  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2307 }
2308 
2309 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2310  if (!MI.hasOneMemOperand())
2311  return false;
2312 
2313  const MachineMemOperand *MMO = *MI.memoperands_begin();
2314  const Value *Ptr = MMO->getValue();
2315 
2316  // UndefValue means this is a load of a kernel input. These are uniform.
2317  // Sometimes LDS instructions have constant pointers.
2318  // If Ptr is null, then that means this mem operand contains a
2319  // PseudoSourceValue like GOT.
2320  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2321  isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2322  return true;
2323 
2324  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2325  return true;
2326 
2327  const Instruction *I = dyn_cast<Instruction>(Ptr);
2328  return I && I->getMetadata("amdgpu.uniform");
2329 }
2330 
2331 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2332  for (const GEPInfo &GEPInfo : AddrInfo) {
2333  if (!GEPInfo.VgprParts.empty())
2334  return true;
2335  }
2336  return false;
2337 }
2338 
2339 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2340  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2341  unsigned AS = PtrTy.getAddressSpace();
2342  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2343  STI.ldsRequiresM0Init()) {
2344  MachineBasicBlock *BB = I.getParent();
2345 
2346  // If DS instructions require M0 initialization, insert it before selecting.
2347  BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2348  .addImm(-1);
2349  }
2350 }
2351 
2352 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2353  MachineInstr &I) const {
2354  if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) {
2355  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2356  unsigned AS = PtrTy.getAddressSpace();
2357  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2358  return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2));
2359  }
2360 
2361  initM0(I);
2362  return selectImpl(I, *CoverageInfo);
2363 }
2364 
2365 // TODO: No rtn optimization.
2366 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2367  MachineInstr &MI) const {
2368  Register PtrReg = MI.getOperand(1).getReg();
2369  const LLT PtrTy = MRI->getType(PtrReg);
2370  if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2371  STI.useFlatForGlobal())
2372  return selectImpl(MI, *CoverageInfo);
2373 
2374  Register DstReg = MI.getOperand(0).getReg();
2375  const LLT Ty = MRI->getType(DstReg);
2376  const bool Is64 = Ty.getSizeInBits() == 64;
2377  const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2378  Register TmpReg = MRI->createVirtualRegister(
2379  Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2380 
2381  const DebugLoc &DL = MI.getDebugLoc();
2382  MachineBasicBlock *BB = MI.getParent();
2383 
2384  Register VAddr, RSrcReg, SOffset;
2385  int64_t Offset = 0;
2386 
2387  unsigned Opcode;
2388  if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2389  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2390  AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2391  } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2392  RSrcReg, SOffset, Offset)) {
2393  Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2394  AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2395  } else
2396  return selectImpl(MI, *CoverageInfo);
2397 
2398  auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2399  .addReg(MI.getOperand(2).getReg());
2400 
2401  if (VAddr)
2402  MIB.addReg(VAddr);
2403 
2404  MIB.addReg(RSrcReg);
2405  if (SOffset)
2406  MIB.addReg(SOffset);
2407  else
2408  MIB.addImm(0);
2409 
2410  MIB.addImm(Offset);
2411  MIB.addImm(AMDGPU::CPol::GLC);
2412  MIB.cloneMemRefs(MI);
2413 
2414  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2415  .addReg(TmpReg, RegState::Kill, SubReg);
2416 
2417  MI.eraseFromParent();
2418 
2419  MRI->setRegClass(
2420  DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2421  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2422 }
2423 
2424 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2425  MachineBasicBlock *BB = I.getParent();
2426  MachineOperand &CondOp = I.getOperand(0);
2427  Register CondReg = CondOp.getReg();
2428  const DebugLoc &DL = I.getDebugLoc();
2429 
2430  unsigned BrOpcode;
2431  Register CondPhysReg;
2432  const TargetRegisterClass *ConstrainRC;
2433 
2434  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2435  // whether the branch is uniform when selecting the instruction. In
2436  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2437  // RegBankSelect knows what it's doing if the branch condition is scc, even
2438  // though it currently does not.
2439  if (!isVCC(CondReg, *MRI)) {
2440  if (MRI->getType(CondReg) != LLT::scalar(32))
2441  return false;
2442 
2443  CondPhysReg = AMDGPU::SCC;
2444  BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2445  ConstrainRC = &AMDGPU::SReg_32RegClass;
2446  } else {
2447  // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2448  // We sort of know, based on the register bank, that a VCC producer ands
2449  // inactive lanes with 0. What if there was a logical operation with vcc
2450  // producers in different blocks/with different exec masks?
2451  // FIXME: Should scc->vcc copies and with exec?
2452  CondPhysReg = TRI.getVCC();
2453  BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2454  ConstrainRC = TRI.getBoolRC();
2455  }
2456 
2457  if (!MRI->getRegClassOrNull(CondReg))
2458  MRI->setRegClass(CondReg, ConstrainRC);
2459 
2460  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2461  .addReg(CondReg);
2462  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2463  .addMBB(I.getOperand(1).getMBB());
2464 
2465  I.eraseFromParent();
2466  return true;
2467 }
2468 
2469 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2470  MachineInstr &I) const {
2471  Register DstReg = I.getOperand(0).getReg();
2472  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2473  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2474  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2475  if (IsVGPR)
2476  I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2477 
2478  return RBI.constrainGenericRegister(
2479  DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2480 }
2481 
2482 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2483  Register DstReg = I.getOperand(0).getReg();
2484  Register SrcReg = I.getOperand(1).getReg();
2485  Register MaskReg = I.getOperand(2).getReg();
2486  LLT Ty = MRI->getType(DstReg);
2487  LLT MaskTy = MRI->getType(MaskReg);
2488 
2489  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2490  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2491  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2492  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2493  if (DstRB != SrcRB) // Should only happen for hand written MIR.
2494  return false;
2495 
2496  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2497  const TargetRegisterClass &RegRC
2498  = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2499 
2500  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2501  *MRI);
2502  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2503  *MRI);
2504  const TargetRegisterClass *MaskRC =
2505  TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2506 
2507  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2508  !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2509  !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2510  return false;
2511 
2512  MachineBasicBlock *BB = I.getParent();
2513  const DebugLoc &DL = I.getDebugLoc();
2514  if (Ty.getSizeInBits() == 32) {
2515  assert(MaskTy.getSizeInBits() == 32 &&
2516  "ptrmask should have been narrowed during legalize");
2517 
2518  BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2519  .addReg(SrcReg)
2520  .addReg(MaskReg);
2521  I.eraseFromParent();
2522  return true;
2523  }
2524 
2525  Register HiReg = MRI->createVirtualRegister(&RegRC);
2526  Register LoReg = MRI->createVirtualRegister(&RegRC);
2527 
2528  // Extract the subregisters from the source pointer.
2529  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2530  .addReg(SrcReg, 0, AMDGPU::sub0);
2531  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2532  .addReg(SrcReg, 0, AMDGPU::sub1);
2533 
2534  Register MaskedLo, MaskedHi;
2535 
2536  // Try to avoid emitting a bit operation when we only need to touch half of
2537  // the 64-bit pointer.
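// e.g. aligning a pointer with a constant mask of 0xfffffffffffff000 has every
// bit known one in the high 32 bits, so MaskedHi below becomes a plain copy of
// HiReg and only the low half needs the AND.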
2538  APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2539 
2540  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2541  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2542  if ((MaskOnes & MaskLo32) == MaskLo32) {
2543  // If all the bits in the low half are 1, we only need a copy for it.
2544  MaskedLo = LoReg;
2545  } else {
2546  // Extract the mask subregister and apply the and.
2547  Register MaskLo = MRI->createVirtualRegister(&RegRC);
2548  MaskedLo = MRI->createVirtualRegister(&RegRC);
2549 
2550  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2551  .addReg(MaskReg, 0, AMDGPU::sub0);
2552  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2553  .addReg(LoReg)
2554  .addReg(MaskLo);
2555  }
2556 
2557  if ((MaskOnes & MaskHi32) == MaskHi32) {
2558  // If all the bits in the high half are 1, we only need a copy for it.
2559  MaskedHi = HiReg;
2560  } else {
2561  Register MaskHi = MRI->createVirtualRegister(&RegRC);
2562  MaskedHi = MRI->createVirtualRegister(&RegRC);
2563 
2564  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2565  .addReg(MaskReg, 0, AMDGPU::sub1);
2566  BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2567  .addReg(HiReg)
2568  .addReg(MaskHi);
2569  }
2570 
2571  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2572  .addReg(MaskedLo)
2573  .addImm(AMDGPU::sub0)
2574  .addReg(MaskedHi)
2575  .addImm(AMDGPU::sub1);
2576  I.eraseFromParent();
2577  return true;
2578 }
2579 
2580 /// Return the register to use for the index value, and the subregister to use
2581 /// for the indirectly accessed register.
2582 static std::pair<Register, unsigned>
2583 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2584  const SIRegisterInfo &TRI,
2585  const TargetRegisterClass *SuperRC,
2586  Register IdxReg,
2587  unsigned EltSize) {
2588  Register IdxBaseReg;
2589  int Offset;
2590 
2591  std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2592  if (IdxBaseReg == AMDGPU::NoRegister) {
2593  // This will happen if the index is a known constant. This should ordinarily
2594  // be legalized out, but handle it as a register just in case.
2595  assert(Offset == 0);
2596  IdxBaseReg = IdxReg;
2597  }
2598 
2599  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2600 
2601  // Skip out of bounds offsets, or else we would end up using an undefined
2602  // register.
2603  if (static_cast<unsigned>(Offset) >= SubRegs.size())
2604  return std::make_pair(IdxReg, SubRegs[0]);
2605  return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2606 }
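// For instance, indexing into a 128-bit register with EltSize = 4 and an index
// defined as %base + 2 yields (%base, sub2): the constant part of the index is
// folded into the subregister and only the variable part remains in the index
// register.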
2607 
2608 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2609  MachineInstr &MI) const {
2610  Register DstReg = MI.getOperand(0).getReg();
2611  Register SrcReg = MI.getOperand(1).getReg();
2612  Register IdxReg = MI.getOperand(2).getReg();
2613 
2614  LLT DstTy = MRI->getType(DstReg);
2615  LLT SrcTy = MRI->getType(SrcReg);
2616 
2617  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2618  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2619  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2620 
2621  // The index must be scalar. If it wasn't RegBankSelect should have moved this
2622  // into a waterfall loop.
2623  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2624  return false;
2625 
2626  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2627  *MRI);
2628  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2629  *MRI);
2630  if (!SrcRC || !DstRC)
2631  return false;
2632  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2633  !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2634  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2635  return false;
2636 
2637  MachineBasicBlock *BB = MI.getParent();
2638  const DebugLoc &DL = MI.getDebugLoc();
2639  const bool Is64 = DstTy.getSizeInBits() == 64;
2640 
2641  unsigned SubReg;
2642  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2643  DstTy.getSizeInBits() / 8);
2644 
2645  if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2646  if (DstTy.getSizeInBits() != 32 && !Is64)
2647  return false;
2648 
2649  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2650  .addReg(IdxReg);
2651 
2652  unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2653  BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2654  .addReg(SrcReg, 0, SubReg)
2655  .addReg(SrcReg, RegState::Implicit);
2656  MI.eraseFromParent();
2657  return true;
2658  }
2659 
2660  if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2661  return false;
2662 
2663  if (!STI.useVGPRIndexMode()) {
2664  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2665  .addReg(IdxReg);
2666  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2667  .addReg(SrcReg, 0, SubReg)
2668  .addReg(SrcReg, RegState::Implicit);
2669  MI.eraseFromParent();
2670  return true;
2671  }
2672 
2673  const MCInstrDesc &GPRIDXDesc =
2674  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2675  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2676  .addReg(SrcReg)
2677  .addReg(IdxReg)
2678  .addImm(SubReg);
2679 
2680  MI.eraseFromParent();
2681  return true;
2682 }
2683 
2684 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2685 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2686  MachineInstr &MI) const {
2687  Register DstReg = MI.getOperand(0).getReg();
2688  Register VecReg = MI.getOperand(1).getReg();
2689  Register ValReg = MI.getOperand(2).getReg();
2690  Register IdxReg = MI.getOperand(3).getReg();
2691 
2692  LLT VecTy = MRI->getType(DstReg);
2693  LLT ValTy = MRI->getType(ValReg);
2694  unsigned VecSize = VecTy.getSizeInBits();
2695  unsigned ValSize = ValTy.getSizeInBits();
2696 
2697  const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2698  const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2699  const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2700 
2701  assert(VecTy.getElementType() == ValTy);
2702 
2703  // The index must be scalar. If it wasn't RegBankSelect should have moved this
2704  // into a waterfall loop.
2705  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2706  return false;
2707 
2708  const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2709  *MRI);
2710  const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2711  *MRI);
2712 
2713  if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2714  !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2715  !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2716  !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2717  return false;
2718 
2719  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2720  return false;
2721 
2722  unsigned SubReg;
2723  std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2724  ValSize / 8);
2725 
2726  const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2727  STI.useVGPRIndexMode();
2728 
2729  MachineBasicBlock *BB = MI.getParent();
2730  const DebugLoc &DL = MI.getDebugLoc();
2731 
2732  if (!IndexMode) {
2733  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2734  .addReg(IdxReg);
2735 
2736  const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2737  VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2738  BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2739  .addReg(VecReg)
2740  .addReg(ValReg)
2741  .addImm(SubReg);
2742  MI.eraseFromParent();
2743  return true;
2744  }
2745 
2746  const MCInstrDesc &GPRIDXDesc =
2747  TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2748  BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2749  .addReg(VecReg)
2750  .addReg(ValReg)
2751  .addReg(IdxReg)
2752  .addImm(SubReg);
2753 
2754  MI.eraseFromParent();
2755  return true;
2756 }
2757 
2758 static bool isZeroOrUndef(int X) {
2759  return X == 0 || X == -1;
2760 }
2761 
2762 static bool isOneOrUndef(int X) {
2763  return X == 1 || X == -1;
2764 }
2765 
2766 static bool isZeroOrOneOrUndef(int X) {
2767  return X == 0 || X == 1 || X == -1;
2768 }
2769 
2770 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2771 // 32-bit register.
2772 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2773  ArrayRef<int> Mask) {
2774  NewMask[0] = Mask[0];
2775  NewMask[1] = Mask[1];
2776  if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2777  return Src0;
2778 
2779  assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2780  assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2781 
2782  // Shift the mask inputs to be 0/1.
2783  NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2784  NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2785  return Src1;
2786 }
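// For example, a mask of <3, 2> reads only Src1 and is normalized to <1, 0> on
// Src1, while a mask built from 0, 1 and undef entries is returned unchanged
// and reads Src0.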
2787 
2788 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2789 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2790  MachineInstr &MI) const {
2791  Register DstReg = MI.getOperand(0).getReg();
2792  Register Src0Reg = MI.getOperand(1).getReg();
2793  Register Src1Reg = MI.getOperand(2).getReg();
2794  ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2795 
2796  const LLT V2S16 = LLT::vector(2, 16);
2797  if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2798  return false;
2799 
2800  if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2801  return false;
2802 
2803  assert(ShufMask.size() == 2);
2804  assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2805 
2806  MachineBasicBlock *MBB = MI.getParent();
2807  const DebugLoc &DL = MI.getDebugLoc();
2808 
2809  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2810  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2811  const TargetRegisterClass &RC = IsVALU ?
2812  AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2813 
2814  // Handle the degenerate case which should have folded out.
2815  if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2816  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2817 
2818  MI.eraseFromParent();
2819  return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2820  }
2821 
2822  // A legal VOP3P mask only reads one of the sources.
2823  int Mask[2];
2824  Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2825 
2826  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2827  !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2828  return false;
2829 
2830  // TODO: This also should have been folded out
2831  if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2832  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2833  .addReg(SrcVec);
2834 
2835  MI.eraseFromParent();
2836  return true;
2837  }
2838 
2839  if (Mask[0] == 1 && Mask[1] == -1) {
2840  if (IsVALU) {
2841  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2842  .addImm(16)
2843  .addReg(SrcVec);
2844  } else {
2845  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2846  .addReg(SrcVec)
2847  .addImm(16);
2848  }
2849  } else if (Mask[0] == -1 && Mask[1] == 0) {
2850  if (IsVALU) {
2851  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2852  .addImm(16)
2853  .addReg(SrcVec);
2854  } else {
2855  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2856  .addReg(SrcVec)
2857  .addImm(16);
2858  }
2859  } else if (Mask[0] == 0 && Mask[1] == 0) {
2860  if (IsVALU) {
2861  // Write low half of the register into the high half.
2862  MachineInstr *MovSDWA =
2863  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2864  .addImm(0) // $src0_modifiers
2865  .addReg(SrcVec) // $src0
2866  .addImm(0) // $clamp
2867  .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2868  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2869  .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2870  .addReg(SrcVec, RegState::Implicit);
2871  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2872  } else {
2873  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2874  .addReg(SrcVec)
2875  .addReg(SrcVec);
2876  }
2877  } else if (Mask[0] == 1 && Mask[1] == 1) {
2878  if (IsVALU) {
2879  // Write high half of the register into the low half.
2880  MachineInstr *MovSDWA =
2881  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2882  .addImm(0) // $src0_modifiers
2883  .addReg(SrcVec) // $src0
2884  .addImm(0) // $clamp
2885  .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
2886  .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2887  .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
2888  .addReg(SrcVec, RegState::Implicit);
2889  MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2890  } else {
2891  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2892  .addReg(SrcVec)
2893  .addReg(SrcVec);
2894  }
2895  } else if (Mask[0] == 1 && Mask[1] == 0) {
2896  if (IsVALU) {
2897  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2898  .addReg(SrcVec)
2899  .addReg(SrcVec)
2900  .addImm(16);
2901  } else {
2902  Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2903  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2904  .addReg(SrcVec)
2905  .addImm(16);
2906  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2907  .addReg(TmpReg)
2908  .addReg(SrcVec);
2909  }
2910  } else
2911  llvm_unreachable("all shuffle masks should be handled");
2912 
2913  MI.eraseFromParent();
2914  return true;
2915 }
2916 
2917 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2918  MachineInstr &MI) const {
2919  if (STI.hasGFX90AInsts())
2920  return selectImpl(MI, *CoverageInfo);
2921 
2922  MachineBasicBlock *MBB = MI.getParent();
2923  const DebugLoc &DL = MI.getDebugLoc();
2924 
2925  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2926  Function &F = MBB->getParent()->getFunction();
2927  DiagnosticInfoUnsupported
2928  NoFpRet(F, "return versions of fp atomics not supported",
2929  MI.getDebugLoc(), DS_Error);
2930  F.getContext().diagnose(NoFpRet);
2931  return false;
2932  }
2933 
2934  // FIXME: This is only needed because tablegen requires the number of dst
2935  // operands in the match and replace patterns to be the same. Otherwise these
2936  // patterns could be exported from the SDag path.
2937  MachineOperand &VDataIn = MI.getOperand(1);
2938  MachineOperand &VIndex = MI.getOperand(3);
2939  MachineOperand &VOffset = MI.getOperand(4);
2940  MachineOperand &SOffset = MI.getOperand(5);
2941  int16_t Offset = MI.getOperand(6).getImm();
2942 
2943  bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2944  bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
2945 
2946  unsigned Opcode;
2947  if (HasVOffset) {
2948  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
2949  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
2950  } else {
2951  Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
2952  : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
2953  }
2954 
2955  if (MRI->getType(VDataIn.getReg()).isVector()) {
2956  switch (Opcode) {
2957  case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
2958  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
2959  break;
2960  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
2961  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
2962  break;
2963  case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
2964  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
2965  break;
2966  case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
2967  Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
2968  break;
2969  }
2970  }
2971 
2972  auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
2973  I.add(VDataIn);
2974 
2975  if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
2976  Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
2977  Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
2978  BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
2979  .addReg(VIndex.getReg())
2980  .addImm(AMDGPU::sub0)
2981  .addReg(VOffset.getReg())
2982  .addImm(AMDGPU::sub1);
2983 
2984  I.addReg(IdxReg);
2985  } else if (HasVIndex) {
2986  I.add(VIndex);
2987  } else if (HasVOffset) {
2988  I.add(VOffset);
2989  }
2990 
2991  I.add(MI.getOperand(2)); // rsrc
2992  I.add(SOffset);
2993  I.addImm(Offset);
2994  I.addImm(MI.getOperand(7).getImm()); // cpol
2995  I.cloneMemRefs(MI);
2996 
2997  MI.eraseFromParent();
2998 
2999  return true;
3000 }
3001 
3002 bool AMDGPUInstructionSelector::selectGlobalAtomicFadd(
3003  MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const {
3004 
3005  if (STI.hasGFX90AInsts()) {
3006  // gfx90a adds return versions of the global atomic fadd instructions so no
3007  // special handling is required.
3008  return selectImpl(MI, *CoverageInfo);
3009  }
3010 
3011  MachineBasicBlock *MBB = MI.getParent();
3012  const DebugLoc &DL = MI.getDebugLoc();
3013 
3014  if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3015  Function &F = MBB->getParent()->getFunction();
3016  DiagnosticInfoUnsupported
3017  NoFpRet(F, "return versions of fp atomics not supported",
3018  MI.getDebugLoc(), DS_Error);
3019  F.getContext().diagnose(NoFpRet);
3020  return false;
3021  }
3022 
3023  // FIXME: This is only needed because tablegen requires the number of dst
3024  // operands in the match and replace patterns to be the same. Otherwise these
3025  // patterns could be exported from the SDag path.
3026  auto Addr = selectFlatOffsetImpl(AddrOp, SIInstrFlags::FlatGlobal);
3027 
3028  Register Data = DataOp.getReg();
3029  const unsigned Opc = MRI->getType(Data).isVector() ?
3030  AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3031  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3032  .addReg(Addr.first)
3033  .addReg(Data)
3034  .addImm(Addr.second)
3035  .addImm(0) // cpol
3036  .cloneMemRefs(MI);
3037 
3038  MI.eraseFromParent();
3039  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3040 }
3041 
3042 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3043  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3044  MI.RemoveOperand(1);
3045  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3046  return true;
3047 }
3048 
3049 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3050  if (I.isPHI())
3051  return selectPHI(I);
3052 
3053  if (!I.isPreISelOpcode()) {
3054  if (I.isCopy())
3055  return selectCOPY(I);
3056  return true;
3057  }
3058 
3059  switch (I.getOpcode()) {
3060  case TargetOpcode::G_AND:
3061  case TargetOpcode::G_OR:
3062  case TargetOpcode::G_XOR:
3063  if (selectImpl(I, *CoverageInfo))
3064  return true;
3065  return selectG_AND_OR_XOR(I);
3066  case TargetOpcode::G_ADD:
3067  case TargetOpcode::G_SUB:
3068  if (selectImpl(I, *CoverageInfo))
3069  return true;
3070  return selectG_ADD_SUB(I);
3071  case TargetOpcode::G_UADDO:
3072  case TargetOpcode::G_USUBO:
3073  case TargetOpcode::G_UADDE:
3074  case TargetOpcode::G_USUBE:
3075  return selectG_UADDO_USUBO_UADDE_USUBE(I);
3076  case TargetOpcode::G_INTTOPTR:
3077  case TargetOpcode::G_BITCAST:
3078  case TargetOpcode::G_PTRTOINT:
3079  return selectCOPY(I);
3080  case TargetOpcode::G_CONSTANT:
3081  case TargetOpcode::G_FCONSTANT:
3082  return selectG_CONSTANT(I);
3083  case TargetOpcode::G_FNEG:
3084  if (selectImpl(I, *CoverageInfo))
3085  return true;
3086  return selectG_FNEG(I);
3087  case TargetOpcode::G_FABS:
3088  if (selectImpl(I, *CoverageInfo))
3089  return true;
3090  return selectG_FABS(I);
3091  case TargetOpcode::G_EXTRACT:
3092  return selectG_EXTRACT(I);
3093  case TargetOpcode::G_MERGE_VALUES:
3094  case TargetOpcode::G_BUILD_VECTOR:
3095  case TargetOpcode::G_CONCAT_VECTORS:
3096  return selectG_MERGE_VALUES(I);
3097  case TargetOpcode::G_UNMERGE_VALUES:
3098  return selectG_UNMERGE_VALUES(I);
3099  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3100  return selectG_BUILD_VECTOR_TRUNC(I);
3101  case TargetOpcode::G_PTR_ADD:
3102  return selectG_PTR_ADD(I);
3103  case TargetOpcode::G_IMPLICIT_DEF:
3104  return selectG_IMPLICIT_DEF(I);
3105  case TargetOpcode::G_FREEZE:
3106  return selectCOPY(I);
3107  case TargetOpcode::G_INSERT:
3108  return selectG_INSERT(I);
3109  case TargetOpcode::G_INTRINSIC:
3110  return selectG_INTRINSIC(I);
3111  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3112  return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3113  case TargetOpcode::G_ICMP:
3114  if (selectG_ICMP(I))
3115  return true;
3116  return selectImpl(I, *CoverageInfo);
3117  case TargetOpcode::G_LOAD:
3118  case TargetOpcode::G_STORE:
3119  case TargetOpcode::G_ATOMIC_CMPXCHG:
3120  case TargetOpcode::G_ATOMICRMW_XCHG:
3121  case TargetOpcode::G_ATOMICRMW_ADD:
3122  case TargetOpcode::G_ATOMICRMW_SUB:
3123  case TargetOpcode::G_ATOMICRMW_AND:
3124  case TargetOpcode::G_ATOMICRMW_OR:
3125  case TargetOpcode::G_ATOMICRMW_XOR:
3126  case TargetOpcode::G_ATOMICRMW_MIN:
3127  case TargetOpcode::G_ATOMICRMW_MAX:
3128  case TargetOpcode::G_ATOMICRMW_UMIN:
3129  case TargetOpcode::G_ATOMICRMW_UMAX:
3130  case TargetOpcode::G_ATOMICRMW_FADD:
3131  case AMDGPU::G_AMDGPU_ATOMIC_INC:
3132  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3133  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3134  case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3135  return selectG_LOAD_STORE_ATOMICRMW(I);
3136  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3137  return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3138  case TargetOpcode::G_SELECT:
3139  return selectG_SELECT(I);
3140  case TargetOpcode::G_TRUNC:
3141  return selectG_TRUNC(I);
3142  case TargetOpcode::G_SEXT:
3143  case TargetOpcode::G_ZEXT:
3144  case TargetOpcode::G_ANYEXT:
3145  case TargetOpcode::G_SEXT_INREG:
3146  if (selectImpl(I, *CoverageInfo))
3147  return true;
3148  return selectG_SZA_EXT(I);
3149  case TargetOpcode::G_BRCOND:
3150  return selectG_BRCOND(I);
3151  case TargetOpcode::G_GLOBAL_VALUE:
3152  return selectG_GLOBAL_VALUE(I);
3153  case TargetOpcode::G_PTRMASK:
3154  return selectG_PTRMASK(I);
3155  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3156  return selectG_EXTRACT_VECTOR_ELT(I);
3157  case TargetOpcode::G_INSERT_VECTOR_ELT:
3158  return selectG_INSERT_VECTOR_ELT(I);
3159  case TargetOpcode::G_SHUFFLE_VECTOR:
3160  return selectG_SHUFFLE_VECTOR(I);
3161  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3162  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3163  const AMDGPU::ImageDimIntrinsicInfo *Intr
3164  = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3165  assert(Intr && "not an image intrinsic with image pseudo");
3166  return selectImageIntrinsic(I, Intr);
3167  }
3168  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3169  return selectBVHIntrinsic(I);
3170  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3171  return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3172  default:
3173  return selectImpl(I, *CoverageInfo);
3174  }
3175  return false;
3176 }
3177 
3178 InstructionSelector::ComplexRendererFns
3179 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3180  return {{
3181  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3182  }};
3183 
3184 }
3185 
3186 std::pair<Register, unsigned>
3187 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3188  bool AllowAbs) const {
3189  Register Src = Root.getReg();
3190  Register OrigSrc = Src;
3191  unsigned Mods = 0;
3192  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3193 
3194  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3195  Src = MI->getOperand(1).getReg();
3196  Mods |= SISrcMods::NEG;
3197  MI = getDefIgnoringCopies(Src, *MRI);
3198  }
3199 
3200  if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3201  Src = MI->getOperand(1).getReg();
3202  Mods |= SISrcMods::ABS;
3203  }
3204 
3205  if (Mods != 0 &&
3206  RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3207  MachineInstr *UseMI = Root.getParent();
3208 
3209  // If we looked through copies to find source modifiers on an SGPR operand,
3210  // we now have an SGPR register source. To avoid potentially violating the
3211  // constant bus restriction, we need to insert a copy to a VGPR.
3212  Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3213  BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3214  TII.get(AMDGPU::COPY), VGPRSrc)
3215  .addReg(Src);
3216  Src = VGPRSrc;
3217  }
3218 
3219  return std::make_pair(Src, Mods);
3220 }
3221 
3222 ///
3223 /// This will select either an SGPR or VGPR operand and will save us from
3224 /// having to write an extra tablegen pattern.
3225 InstructionSelector::ComplexRendererFns
3226 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3227  return {{
3228  [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3229  }};
3230 }
3231 
3232 InstructionSelector::ComplexRendererFns
3233 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3234  Register Src;
3235  unsigned Mods;
3236  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3237 
3238  return {{
3239  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3240  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3241  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3242  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3243  }};
3244 }
3245 
3246 InstructionSelector::ComplexRendererFns
3247 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3248  Register Src;
3249  unsigned Mods;
3250  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3251 
3252  return {{
3253  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3254  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3255  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3256  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3257  }};
3258 }
3259 
3260 InstructionSelector::ComplexRendererFns
3261 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3262  return {{
3263  [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3264  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3265  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3266  }};
3267 }
3268 
3269 InstructionSelector::ComplexRendererFns
3270 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3271  Register Src;
3272  unsigned Mods;
3273  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3274 
3275  return {{
3276  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3277  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3278  }};
3279 }
3280 
3281 InstructionSelector::ComplexRendererFns
3282 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3283  Register Src;
3284  unsigned Mods;
3285  std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3286 
3287  return {{
3288  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3289  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3290  }};
3291 }
3292 
3293 InstructionSelector::ComplexRendererFns
3294 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3295  Register Reg = Root.getReg();
3296  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3297  if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3298  Def->getOpcode() == AMDGPU::G_FABS))
3299  return {};
3300  return {{
3301  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3302  }};
3303 }
3304 
3305 std::pair<Register, unsigned>
3306 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3307  Register Src, const MachineRegisterInfo &MRI) const {
3308  unsigned Mods = 0;
3309  MachineInstr *MI = MRI.getVRegDef(Src);
3310 
3311  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3312  // It's possible to see an f32 fneg here, but unlikely.
3313  // TODO: Treat f32 fneg as only high bit.
3314  MRI.getType(Src) == LLT::vector(2, 16)) {
3315  Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3316  Src = MI->getOperand(1).getReg();
3317  MI = MRI.getVRegDef(Src);
3318  }
3319 
3320  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3321 
3322  // Packed instructions do not have abs modifiers.
3323  Mods |= SISrcMods::OP_SEL_1;
3324 
3325  return std::make_pair(Src, Mods);
3326 }
3327 
3328 InstructionSelector::ComplexRendererFns
3329 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3330  MachineRegisterInfo &MRI
3331  = Root.getParent()->getParent()->getParent()->getRegInfo();
3332 
3333  Register Src;
3334  unsigned Mods;
3335  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3336 
3337  return {{
3338  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3339  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3340  }};
3341 }
3342 
3343 InstructionSelector::ComplexRendererFns
3344 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3345  Register Src;
3346  unsigned Mods;
3347  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3348  if (!isKnownNeverNaN(Src, *MRI))
3349  return None;
3350 
3351  return {{
3352  [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3353  [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3354  }};
3355 }
3356 
3357 InstructionSelector::ComplexRendererFns
3358 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3359  // FIXME: Handle op_sel
3360  return {{
3361  [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3362  [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3363  }};
3364 }
3365 
3366 InstructionSelector::ComplexRendererFns
3367 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3368  SmallVector<GEPInfo, 4> AddrInfo;
3369  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3370 
3371  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3372  return None;
3373 
3374  const GEPInfo &GEPInfo = AddrInfo[0];
3375  Optional<int64_t> EncodedImm =
3376  AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3377  if (!EncodedImm)
3378  return None;
3379 
3380  unsigned PtrReg = GEPInfo.SgprParts[0];
3381  return {{
3382  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3383  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3384  }};
3385 }
3386 
3387 InstructionSelector::ComplexRendererFns
3388 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3389  SmallVector<GEPInfo, 4> AddrInfo;
3390  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3391 
3392  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3393  return None;
3394 
3395  const GEPInfo &GEPInfo = AddrInfo[0];
3396  Register PtrReg = GEPInfo.SgprParts[0];
3397  Optional<int64_t> EncodedImm =
3398  AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3399  if (!EncodedImm)
3400  return None;
3401 
3402  return {{
3403  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3404  [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3405  }};
3406 }
3407 
3408 InstructionSelector::ComplexRendererFns
3409 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3410  MachineInstr *MI = Root.getParent();
3411  MachineBasicBlock *MBB = MI->getParent();
3412 
3413  SmallVector<GEPInfo, 4> AddrInfo;
3414  getAddrModeInfo(*MI, *MRI, AddrInfo);
3415 
3416  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3417  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3418  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3419  return None;
3420 
3421  const GEPInfo &GEPInfo = AddrInfo[0];
3422  // SGPR offset is unsigned.
3423  if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3424  return None;
3425 
3426  // If we make it this far we have a load with a 32-bit immediate offset.
3427  // It is OK to select this using an SGPR offset, because we have already
3428  // failed trying to select this load into one of the _IMM variants since
3429  // the _IMM Patterns are considered before the _SGPR patterns.
3430  Register PtrReg = GEPInfo.SgprParts[0];
3431  Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3432  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3433  .addImm(GEPInfo.Imm);
3434  return {{
3435  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3436  [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3437  }};
3438 }
3439 
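// Try to fold a (ptr_add base, constant) address into a base register plus an
// immediate that is legal for the given FLAT instruction variant; otherwise
// return the original address with a zero offset.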
3440 std::pair<Register, int>
3441 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
3442  uint64_t FlatVariant) const {
3443  MachineInstr *MI = Root.getParent();
3444 
3445  auto Default = std::make_pair(Root.getReg(), 0);
3446 
3447  if (!STI.hasFlatInstOffsets())
3448  return Default;
3449 
3450  Register PtrBase;
3451  int64_t ConstOffset;
3452  std::tie(PtrBase, ConstOffset) =
3453  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3454  if (ConstOffset == 0)
3455  return Default;
3456 
3457  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3458  if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
3459  return Default;
3460 
3461  return std::make_pair(PtrBase, ConstOffset);
3462 }
3463 
3464 InstructionSelector::ComplexRendererFns
3465 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3466  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
3467 
3468  return {{
3469  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3470  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3471  }};
3472 }
3473 
3474 InstructionSelector::ComplexRendererFns
3475 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
3476  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
3477 
3478  return {{
3479  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3480  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3481  }};
3482 }
3483 
3484 InstructionSelector::ComplexRendererFns
3485 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
3486  auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
3487 
3488  return {{
3489  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3490  [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3491  }};
3492 }
3493 
3494 /// Match a zero extend from a 32-bit value to 64-bits.
3495 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3496  Register ZExtSrc;
3497  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3498  return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3499 
3500  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3501  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3502  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3503  return false;
3504 
3505  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3506  return Def->getOperand(1).getReg();
3507  }
3508 
3509  return Register();
3510 }
3511 
3512 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
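// Oversized immediates are split into a legal part and a remainder that is
// materialized in the voffset operand; a zero voffset is materialized when the
// address is a plain SGPR base.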
3513 InstructionSelector::ComplexRendererFns
3514 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3515  Register Addr = Root.getReg();
3516  Register PtrBase;
3517  int64_t ConstOffset;
3518  int64_t ImmOffset = 0;
3519 
3520  // Match the immediate offset first, which canonically is moved as low as
3521  // possible.
3522  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3523 
3524  if (ConstOffset != 0) {
3525  if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
3526  SIInstrFlags::FlatGlobal)) {
3527  Addr = PtrBase;
3528  ImmOffset = ConstOffset;
3529  } else if (ConstOffset > 0) {
3530  auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3531  if (!PtrBaseDef)
3532  return None;
3533 
3534  if (isSGPR(PtrBaseDef->Reg)) {
3535  // Offset is too large.
3536  //
3537  // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
3538  // + (large_offset & MaxOffset);
3539  int64_t SplitImmOffset, RemainderOffset;
3540  std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
3541  ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
3542 
3543  if (isUInt<32>(RemainderOffset)) {
3544  MachineInstr *MI = Root.getParent();
3545  MachineBasicBlock *MBB = MI->getParent();
3546  Register HighBits
3547  = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3548 
3549  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3550  HighBits)
3551  .addImm(RemainderOffset);
3552 
3553  return {{
3554  [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
3555  [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
3556  [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3557  }};
3558  }
3559  }
3560  }
3561  }
3562 
3563  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3564  if (!AddrDef)
3565  return None;
3566 
3567  // Match the variable offset.
3568  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
3569  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3570  // drop this.
3571  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3572  AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
3573  return None;
3574 
3575  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3576  // moves required to copy a 64-bit SGPR to VGPR.
3577  const Register SAddr = AddrDef->Reg;
3578  if (!isSGPR(SAddr))
3579  return None;
3580 
3581  MachineInstr *MI = Root.getParent();
3582  MachineBasicBlock *MBB = MI->getParent();
3583  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3584 
3585  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3586  VOffset)
3587  .addImm(0);
3588 
3589  return {{
3590  [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3591  [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
3592  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3593  }};
3594  }
3595 
3596  // Look through the SGPR->VGPR copy.
3597  Register SAddr =
3598  getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3599  if (!SAddr || !isSGPR(SAddr))
3600  return None;
3601 
3602  Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3603 
3604  // It's possible voffset is an SGPR here, but the copy to VGPR will be
3605  // inserted later.
3606  Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3607  if (!VOffset)
3608  return None;
3609 
3610  return {{[=](MachineInstrBuilder &MIB) { // saddr
3611  MIB.addReg(SAddr);
3612  },
3613  [=](MachineInstrBuilder &MIB) { // voffset
3614  MIB.addReg(VOffset);
3615  },
3616  [=](MachineInstrBuilder &MIB) { // offset
3617  MIB.addImm(ImmOffset);
3618  }}};
3619 }
3620 
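// Match a private (scratch) access as an SGPR base plus a legal immediate
// offset, folding a frame index into the saddr operand where possible.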
3621 InstructionSelector::ComplexRendererFns
3622 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3623  Register Addr = Root.getReg();
3624  Register PtrBase;
3625  int64_t ConstOffset;
3626  int64_t ImmOffset = 0;
3627 
3628  // Match the immediate offset first, which canonically is moved as low as
3629  // possible.
3630  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3631 
3632  if (ConstOffset != 0 &&
3633  TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
3634  SIInstrFlags::FlatScratch)) {
3635  Addr = PtrBase;
3636  ImmOffset = ConstOffset;
3637  }
3638 
3639  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3640  if (!AddrDef)
3641  return None;
3642 
3643  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3644  int FI = AddrDef->MI->getOperand(1).getIndex();
3645  return {{
3646  [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3647  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3648  }};
3649  }
3650 
3651  Register SAddr = AddrDef->Reg;
3652 
3653  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3654  Register LHS = AddrDef->MI->getOperand(1).getReg();
3655  Register RHS = AddrDef->MI->getOperand(2).getReg();
3656  auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3657  auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3658 
3659  if (LHSDef && RHSDef &&
3660  LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3661  isSGPR(RHSDef->Reg)) {
3662  int FI = LHSDef->MI->getOperand(1).getIndex();
3663  MachineInstr &I = *Root.getParent();
3664  MachineBasicBlock *BB = I.getParent();
3665  const DebugLoc &DL = I.getDebugLoc();
3666  SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3667 
3668  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
3669  .addFrameIndex(FI)
3670  .addReg(RHSDef->Reg);
3671  }
3672  }
3673 
3674  if (!isSGPR(SAddr))
3675  return None;
3676 
3677  return {{
3678  [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3679  [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3680  }};
3681 }
3682 
3683 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3684  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3685  return PSV && PSV->isStack();
3686 }
3687 
3688 InstructionSelector::ComplexRendererFns
3689 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3690  MachineInstr *MI = Root.getParent();
3691  MachineBasicBlock *MBB = MI->getParent();
3692  MachineFunction *MF = MBB->getParent();
3693  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3694 
3695  int64_t Offset = 0;
3696  if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3697  Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3698  Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3699 
3700  // TODO: Should this be inside the render function? The iterator seems to
3701  // move.
3702  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3703  HighBits)
3704  .addImm(Offset & ~4095);
3705 
3706  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3707  MIB.addReg(Info->getScratchRSrcReg());
3708  },
3709  [=](MachineInstrBuilder &MIB) { // vaddr
3710  MIB.addReg(HighBits);
3711  },
3712  [=](MachineInstrBuilder &MIB) { // soffset
3713  // Use constant zero for soffset and rely on eliminateFrameIndex
3714  // to choose the appropriate frame register if need be.
3715  MIB.addImm(0);
3716  },
3717  [=](MachineInstrBuilder &MIB) { // offset
3718  MIB.addImm(Offset & 4095);
3719  }}};
3720  }
3721 
3722  assert(Offset == 0 || Offset == -1);
3723 
3724  // Try to fold a frame index directly into the MUBUF vaddr field, and any
3725  // offsets.
3726  Optional<int> FI;
3727  Register VAddr = Root.getReg();
3728  if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3729  Register PtrBase;
3730  int64_t ConstOffset;
3731  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
3732  if (ConstOffset != 0) {
3733  if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
3734  (!STI.privateMemoryResourceIsRangeChecked() ||
3735  KnownBits->signBitIsZero(PtrBase))) {
3736  const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
3737  if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3738  FI = PtrBaseDef->getOperand(1).getIndex();
3739  else
3740  VAddr = PtrBase;
3741  Offset = ConstOffset;
3742  }
3743  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3744  FI = RootDef->getOperand(1).getIndex();
3745  }
3746  }
3747 
3748  return {{[=](MachineInstrBuilder &MIB) { // rsrc
3749  MIB.addReg(Info->getScratchRSrcReg());
3750  },
3751  [=](MachineInstrBuilder &MIB) { // vaddr
3752  if (FI.hasValue())
3753  MIB.addFrameIndex(FI.getValue());
3754  else
3755  MIB.addReg(VAddr);
3756  },
3757  [=](MachineInstrBuilder &MIB) { // soffset
3758  // Use constant zero for soffset and rely on eliminateFrameIndex
3759  // to choose the appropriate frame register if need be.
3760  MIB.addImm(0);
3761  },
3762  [=](MachineInstrBuilder &MIB) { // offset
3763  MIB.addImm(Offset);
3764  }}};
3765 }
3766 
3767 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3768  int64_t Offset) const {
3769  if (!isUInt<16>(Offset))
3770  return false;
3771 
3772  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3773  return true;
3774 
3775  // On Southern Islands, instructions with a negative base value and an
3776  // offset don't seem to work.
3777  return KnownBits->signBitIsZero(Base);
3778 }
3779 
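// The two offsets of a DS read2/write2 are encoded as 8-bit values in units of
// Size bytes, so both must be Size-aligned and fit in 8 bits after scaling.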
3780 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3781  int64_t Offset1,
3782  unsigned Size) const {
3783  if (Offset0 % Size != 0 || Offset1 % Size != 0)
3784  return false;
3785  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3786  return false;
3787 
3788  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3789  return true;
3790 
3791  // On Southern Islands, instructions with a negative base value and an
3792  // offset don't seem to work.
3793  return KnownBits->signBitIsZero(Base);
3794 }
3795 
3796 InstructionSelector::ComplexRendererFns
3797 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3798  MachineOperand &Root) const {
3799  MachineInstr *MI = Root.getParent();
3800  MachineBasicBlock *MBB = MI->getParent();
3801 
3802  int64_t Offset = 0;
3803  if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3804  !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3805  return {};
3806 
3807  const MachineFunction *MF = MBB->getParent();
3808  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3809  const MachineMemOperand *MMO = *MI->memoperands_begin();
3810  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3811 
3812  return {{
3813  [=](MachineInstrBuilder &MIB) { // rsrc
3814  MIB.addReg(Info->getScratchRSrcReg());
3815  },
3816  [=](MachineInstrBuilder &MIB) { // soffset
3817  if (isStackPtrRelative(PtrInfo))
3818  MIB.addReg(Info->getStackPtrOffsetReg());
3819  else
3820  MIB.addImm(0);
3821  },
3822  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3823  }};
3824 }
3825 
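// Fold (ptr_add base, constant) into a base register plus a 16-bit DS
// instruction offset when the constant is in range; otherwise use the address
// as-is with a zero offset.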
3826 std::pair<Register, unsigned>
3827 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3828  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3829  if (!RootDef)
3830  return std::make_pair(Root.getReg(), 0);
3831 
3832  int64_t ConstAddr = 0;
3833 
3834  Register PtrBase;
3835  int64_t Offset;
3836  std::tie(PtrBase, Offset) =
3837  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3838 
3839  if (Offset) {
3840  if (isDSOffsetLegal(PtrBase, Offset)) {
3841  // (add n0, c0)
3842  return std::make_pair(PtrBase, Offset);
3843  }
3844  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3845  // TODO
3846 
3847 
3848  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3849  // TODO
3850 
3851  }
3852 
3853  return std::make_pair(Root.getReg(), 0);
3854 }
3855 
3856 InstructionSelector::ComplexRendererFns
3857 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3858  Register Reg;
3859  unsigned Offset;
3860  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3861  return {{
3862  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3863  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3864  }};
3865 }
3866 
3867 InstructionSelector::ComplexRendererFns
3868 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3869  return selectDSReadWrite2(Root, 4);
3870 }
3871 
3872 InstructionSelector::ComplexRendererFns
3873 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3874  return selectDSReadWrite2(Root, 8);
3875 }
3876 
3877 InstructionSelector::ComplexRendererFns
3878 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3879  unsigned Size) const {
3880  Register Reg;
3881  unsigned Offset;
3882  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3883  return {{
3884  [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3885  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3886  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3887  }};
3888 }
3889 
3890 std::pair<Register, unsigned>
3891 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3892  unsigned Size) const {
3893  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3894  if (!RootDef)
3895  return std::make_pair(Root.getReg(), 0);
3896 
3897  int64_t ConstAddr = 0;
3898 
3899  Register PtrBase;
3900  int64_t Offset;
3901  std::tie(PtrBase, Offset) =
3902  getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3903 
3904  if (Offset) {
3905  int64_t OffsetValue0 = Offset;
3906  int64_t OffsetValue1 = Offset + Size;
3907  if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3908  // (add n0, c0)
3909  return std::make_pair(PtrBase, OffsetValue0 / Size);
3910  }
3911  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3912  // TODO
3913 
3914  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3915  // TODO
3916 
3917  }
3918 
3919  return std::make_pair(Root.getReg(), 0);
3920 }
3921 
3922 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3923 /// the base value with the constant offset. There may be intervening copies
3924 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3925 /// not match the pattern.
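/// For example, %p = G_PTR_ADD %base, %c with %c = G_CONSTANT i64 16 returns
/// {%base, 16}.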
3926 std::pair<Register, int64_t>
3927 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3928  Register Root, const MachineRegisterInfo &MRI) const {
3929  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3930  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3931  return {Root, 0};
3932 
3933  MachineOperand &RHS = RootI->getOperand(2);
3934  Optional<ValueAndVReg> MaybeOffset
3935  = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3936  if (!MaybeOffset)
3937  return {Root, 0};
3938  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
3939 }
3940 
3941 static void addZeroImm(MachineInstrBuilder &MIB) {
3942  MIB.addImm(0);
3943 }
3944 
3945 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3946 /// BasePtr is not valid, a null base pointer will be used.
3947 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3948  uint32_t FormatLo, uint32_t FormatHi,
3949  Register BasePtr) {
3950  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3951  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3952  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3953  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3954 
3955  B.buildInstr(AMDGPU::S_MOV_B32)
3956  .addDef(RSrc2)
3957  .addImm(FormatLo);
3958  B.buildInstr(AMDGPU::S_MOV_B32)
3959  .addDef(RSrc3)
3960  .addImm(FormatHi);
3961 
3962  // Build the half of the subregister with the constants before building the
3963  // full 128-bit register. If we are building multiple resource descriptors,
3964  // this will allow CSEing of the 2-component register.
3965  B.buildInstr(AMDGPU::REG_SEQUENCE)
3966  .addDef(RSrcHi)
3967  .addReg(RSrc2)
3968  .addImm(AMDGPU::sub0)
3969  .addReg(RSrc3)
3970  .addImm(AMDGPU::sub1);
3971 
3972  Register RSrcLo = BasePtr;
3973  if (!BasePtr) {
3974  RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3975  B.buildInstr(AMDGPU::S_MOV_B64)
3976  .addDef(RSrcLo)
3977  .addImm(0);
3978  }
3979 
3980  B.buildInstr(AMDGPU::REG_SEQUENCE)
3981  .addDef(RSrc)
3982  .addReg(RSrcLo)
3983  .addImm(AMDGPU::sub0_sub1)
3984  .addReg(RSrcHi)
3985  .addImm(AMDGPU::sub2_sub3);
3986 
3987  return RSrc;
3988 }
3989 
3990 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3991  const SIInstrInfo &TII, Register BasePtr) {
3992  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3993 
3994  // FIXME: Why are half the "default" bits ignored based on the addressing
3995  // mode?
3996  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3997 }
3998 
3999 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
4000  const SIInstrInfo &TII, Register BasePtr) {
4001  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
4002 
4003  // FIXME: Why are half the "default" bits ignored based on the addressing
4004  // mode?
4005  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
4006 }
4007 
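// Decompose a MUBUF address into a base (N0), the operands of an inner
// ptr_add (N2, N3) if one is present, and a 32-bit unsigned immediate offset.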
4008 AMDGPUInstructionSelector::MUBUFAddressData
4009 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
4010  MUBUFAddressData Data;
4011  Data.N0 = Src;
4012 
4013  Register PtrBase;
4014  int64_t Offset;
4015 
4016  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4017  if (isUInt<32>(Offset)) {
4018  Data.N0 = PtrBase;
4019  Data.Offset = Offset;
4020  }
4021 
4022  if (MachineInstr *InputAdd
4023  = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4024  Data.N2 = InputAdd->getOperand(1).getReg();
4025  Data.N3 = InputAdd->getOperand(2).getReg();
4026 
4027  // FIXME: Need to fix extra SGPR->VGPR copies inserted
4028  // FIXME: Don't know that this was defined by operand 0
4029  //
4030  // TODO: Remove this when we have copy folding optimizations after
4031  // RegBankSelect.
4032  Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4033  Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4034  }
4035 
4036  return Data;
4037 }
4038 
4039 /// Return if the addr64 mubuf mode should be used for the given address.
4040 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4041  // (ptr_add N2, N3) -> addr64, or
4042  // (ptr_add (ptr_add N2, N3), C1) -> addr64
4043  if (Addr.N2)
4044  return true;
4045 
4046  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4047  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4048 }
4049 
4050 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4051 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4052 /// component.
4053 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4054  MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4055  if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4056  return;
4057 
4058  // Illegal offset, store it in soffset.
4059  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4060  B.buildInstr(AMDGPU::S_MOV_B32)
4061  .addDef(SOffset)
4062  .addImm(ImmOffset);
4063  ImmOffset = 0;
4064 }
4065 
4066 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4067  MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4068  Register &SOffset, int64_t &Offset) const {
4069  // FIXME: Predicates should stop this from reaching here.
4070  // addr64 bit was removed for volcanic islands.
4071  if (!STI.hasAddr64() || STI.useFlatForGlobal())
4072  return false;
4073 
4074  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4075  if (!shouldUseAddr64(AddrData))
4076  return false;
4077 
4078  Register N0 = AddrData.N0;
4079  Register N2 = AddrData.N2;
4080  Register N3 = AddrData.N3;
4081  Offset = AddrData.Offset;
4082 
4083  // Base pointer for the SRD.
4084  Register SRDPtr;
4085 
4086  if (N2) {
4087  if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4088  assert(N3);
4089  if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4090  // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4091  // addr64, and construct the default resource from a 0 address.
4092  VAddr = N0;
4093  } else {
4094  SRDPtr = N3;
4095  VAddr = N2;
4096  }
4097  } else {
4098  // N2 is not divergent.
4099  SRDPtr = N2;
4100  VAddr = N3;
4101  }
4102  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4103  // Use the default null pointer in the resource
4104  VAddr = N0;
4105  } else {
4106  // N0 -> offset, or
4107  // (N0 + C1) -> offset
4108  SRDPtr = N0;
4109  }
4110 
4111  MachineIRBuilder B(*Root.getParent());
4112  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4113  splitIllegalMUBUFOffset(B, SOffset, Offset);
4114  return true;
4115 }
4116 
4117 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4118  MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4119  int64_t &Offset) const {
4120 
4121  // FIXME: Pattern should not reach here.
4122  if (STI.useFlatForGlobal())
4123  return false;
4124 
4125  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4126  if (shouldUseAddr64(AddrData))
4127  return false;
4128 
4129  // N0 -> offset, or
4130  // (N0 + C1) -> offset
4131  Register SRDPtr = AddrData.N0;
4132  Offset = AddrData.Offset;
4133 
4134  // TODO: Look through extensions for 32-bit soffset.
4135  MachineIRBuilder B(*Root.getParent());
4136 
4137  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4138  splitIllegalMUBUFOffset(B, SOffset, Offset);
4139  return true;
4140 }
4141 
4142 InstructionSelector::ComplexRendererFns
4143 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4144  Register VAddr;
4145  Register RSrcReg;
4146  Register SOffset;
4147  int64_t Offset = 0;
4148 
4149  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4150  return {};
4151 
4152  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4153  // pattern.
4154  return {{
4155  [=](MachineInstrBuilder &MIB) { // rsrc
4156  MIB.addReg(RSrcReg);
4157  },
4158  [=](MachineInstrBuilder &MIB) { // vaddr
4159  MIB.addReg(VAddr);
4160  },
4161  [=](MachineInstrBuilder &MIB) { // soffset
4162  if (SOffset)
4163  MIB.addReg(SOffset);
4164  else
4165  MIB.addImm(0);
4166  },
4167  [=](MachineInstrBuilder &MIB) { // offset
4168  MIB.addImm(Offset);
4169  },
4170  addZeroImm, // cpol
4171  addZeroImm, // tfe
4172  addZeroImm // swz
4173  }};
4174 }
4175 
4176 InstructionSelector::ComplexRendererFns
4177 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4178  Register RSrcReg;
4179  Register SOffset;
4180  int64_t Offset = 0;
4181 
4182  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4183  return {};
4184 
4185  return {{
4186  [=](MachineInstrBuilder &MIB) { // rsrc
4187  MIB.addReg(RSrcReg);
4188  },
4189  [=](MachineInstrBuilder &MIB) { // soffset
4190  if (SOffset)
4191  MIB.addReg(SOffset);
4192  else
4193  MIB.addImm(0);
4194  },
4195  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4196  addZeroImm, // cpol
4197  addZeroImm, // tfe
4198  addZeroImm, // swz
4199  }};
4200 }
4201 
4202 InstructionSelector::ComplexRendererFns
4203 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4204  Register VAddr;
4205  Register RSrcReg;
4206  Register SOffset;
4207  int64_t Offset = 0;
4208 
4209  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4210  return {};
4211 
4212  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4213  // pattern.
4214  return {{
4215  [=](MachineInstrBuilder &MIB) { // rsrc
4216  MIB.addReg(RSrcReg);
4217  },
4218  [=](MachineInstrBuilder &MIB) { // vaddr
4219  MIB.addReg(VAddr);
4220  },
4221  [=](MachineInstrBuilder &MIB) { // soffset
4222  if (SOffset)
4223  MIB.addReg(SOffset);
4224  else
4225  MIB.addImm(0);
4226  },
4227  [=](MachineInstrBuilder &MIB) { // offset
4228  MIB.addImm(Offset);
4229  },
4230  [=](MachineInstrBuilder &MIB) {
4231  MIB.addImm(AMDGPU::CPol::GLC); // cpol
4232  }
4233  }};
4234 }
4235 
4236 InstructionSelector::ComplexRendererFns
4237 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4238  Register RSrcReg;
4239  Register SOffset;
4240  int64_t Offset = 0;
4241 
4242  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4243  return {};
4244 
4245  return {{
4246  [=](MachineInstrBuilder &MIB) { // rsrc
4247  MIB.addReg(RSrcReg);
4248  },
4249  [=](MachineInstrBuilder &MIB) { // soffset
4250  if (SOffset)
4251  MIB.addReg(SOffset);
4252  else
4253  MIB.addImm(0);
4254  },
4255  [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4256  [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
4257  }};
4258 }
4259 
4260 /// Get an immediate that must be 32-bits, and treated as zero extended.
4261 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4262  const MachineRegisterInfo &MRI) {
4263  // getConstantVRegVal sexts any values, so see if that matters.
4264  Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
4265  if (!OffsetVal || !isInt<32>(*OffsetVal))
4266  return None;
4267  return Lo_32(*OffsetVal);
4268 }
4269 
4270 InstructionSelector::ComplexRendererFns
4271 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4272  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4273  if (!OffsetVal)
4274  return {};
4275 
4276  Optional<int64_t> EncodedImm =
4277  AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4278  if (!EncodedImm)
4279  return {};
4280 
4281  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4282 }
4283 
4284 InstructionSelector::ComplexRendererFns
4285 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4286  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4287 
4288  Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4289  if (!OffsetVal)
4290  return {};
4291 
4292  Optional<int64_t> EncodedImm
4293  = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4294  if (!EncodedImm)
4295  return {};
4296 
4297  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
4298 }
4299 
4300 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4301  const MachineInstr &MI,
4302  int OpIdx) const {
4303  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4304  "Expected G_CONSTANT");
4305  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4306 }
4307 
4308 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4309  const MachineInstr &MI,
4310  int OpIdx) const {
4311  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4312  "Expected G_CONSTANT");
4313  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4314 }
4315 
4316 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4317  const MachineInstr &MI,
4318  int OpIdx) const {
4319  assert(OpIdx == -1);
4320 
4321  const MachineOperand &Op = MI.getOperand(1);
4322  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4323  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4324  else {
4325  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4326  MIB.addImm(Op.getCImm()->getSExtValue());
4327  }
4328 }
4329 
4330 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4331  const MachineInstr &MI,
4332  int OpIdx) const {
4333  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4334  "Expected G_CONSTANT");
4335  MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4336 }
4337 
4338 /// This only really exists to satisfy DAG type checking machinery, so is a
4339 /// no-op here.
4340 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4341  const MachineInstr &MI,
4342  int OpIdx) const {
4343  MIB.addImm(MI.getOperand(OpIdx).getImm());
4344 }
4345 
4346 void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
4347  const MachineInstr &MI,
4348  int OpIdx) const {
4349  assert(OpIdx >= 0 && "expected to match an immediate operand");
4350  MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
4351 }
4352 
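// Extract the swz flag (bit 3) from the intrinsic's combined policy immediate.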
4353 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4354  const MachineInstr &MI,
4355  int OpIdx) const {
4356  assert(OpIdx >= 0 && "expected to match an immediate operand");
4357  MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4358 }
4359 
4360 void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
4361  const MachineInstr &MI,
4362  int OpIdx) const {
4363  assert(OpIdx >= 0 && "expected to match an immediate operand");
4364  MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
4365 }
4366 
4367 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4368  const MachineInstr &MI,
4369  int OpIdx) const {
4370  MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4371 }
4372 
4373 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4374  return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4375 }
4376 
4377 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4378  return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4379 }
4380 
4381 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4382  return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4383 }
4384 
4385 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4386  return TII.isInlineConstant(Imm);
4387 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_PREDICATES_INIT
MIPatternMatch.h
addZeroImm
static void addZeroImm(MachineInstrBuilder &MIB)
Definition: AMDGPUInstructionSelector.cpp:3941
llvm::TargetMachine::getOptLevel
CodeGenOpt::Level getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Definition: TargetMachine.cpp:198
llvm::TargetRegisterInfo::getConstrainedRegClassForOperand
virtual const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const
Definition: TargetRegisterInfo.h:1070
sizeToSubRegIndex
static int sizeToSubRegIndex(unsigned Size)
Definition: AMDGPUInstructionSelector.cpp:1808
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4550
llvm::AMDGPUSubtarget::hasInv2PiInlineImm
bool hasInv2PiInlineImm() const
Definition: AMDGPUSubtarget.h:164
llvm::getDefIgnoringCopies
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition: Utils.cpp:396
llvm::AMDGPURegisterBankInfo
Definition: AMDGPURegisterBankInfo.h:42
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:100
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:132
llvm
Definition: AllocatorList.h:23
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
llvm::AMDGPU::MIMGBaseOpcodeInfo::HasD16
bool HasD16
Definition: AMDGPUBaseInfo.h:293
Reg
unsigned Reg
Definition: MachineSink.cpp:1566
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:743
UseMI
MachineInstrBuilder & UseMI
Definition: AArch64ExpandPseudoInsts.cpp:100
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::AMDGPU::MIMGBaseOpcodeInfo::Store
bool Store
Definition: AMDGPUBaseInfo.h:282
llvm::MachineOperand::CreateReg
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
Definition: MachineOperand.h:788
normalizeVOP3PMask
static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1, ArrayRef< int > Mask)
Definition: AMDGPUInstructionSelector.cpp:2772
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1291
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition: MachineRegisterInfo.cpp:158
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:722
llvm::MachineFrameInfo::setReturnAddressIsTaken
void setReturnAddressIsTaken(bool s)
Definition: MachineFrameInfo.h:373
llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition: DiagnosticInfo.h:993
llvm::MIPatternMatch::m_Reg
operand_type_match m_Reg()
Definition: MIPatternMatch.h:106
SIMachineFunctionInfo.h
llvm::GISelKnownBits
Definition: GISelKnownBits.h:29
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:52
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:225
llvm::Function
Definition: Function.h:61
llvm::AMDGPU::getSMRDEncodedOffset
Optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer)
Definition: AMDGPUBaseInfo.cpp:1811
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
llvm::MIPatternMatch::m_GLShr
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Definition: MIPatternMatch.h:300
llvm::RegState::Dead
@ Dead
Unused definition.
Definition: MachineInstrBuilder.h:51
llvm::getOpcodeDef
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition: Utils.cpp:410
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::AMDGPU::MIMGBaseOpcodeInfo::Gather4
bool Gather4
Definition: AMDGPUBaseInfo.h:286
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
GISelKnownBits.h
llvm::AMDGPU::ImageDimIntrinsicInfo
Definition: AMDGPUInstrInfo.h:50
llvm::RegisterBankInfo::getRegBank
RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
Definition: RegisterBankInfo.h:432
llvm::AMDGPU::MIMGDimInfo
Definition: AMDGPUBaseInfo.h:300
llvm::getSrcRegIgnoringCopies
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition: Utils.cpp:403
llvm::MachineRegisterInfo::getUniqueVRegDef
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
Definition: MachineRegisterInfo.cpp:411
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:744
llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition: AMDGPUSubtarget.h:38
llvm::InstructionSelector::setupMF
virtual void setupMF(MachineFunction &mf, GISelKnownBits *KB, CodeGenCoverage &covinfo, ProfileSummaryInfo *psi, BlockFrequencyInfo *bfi)
Setup per-MF selector state.
Definition: InstructionSelector.h:447
llvm::AMDGPU::getSMRDEncodedLiteralOffset32
Optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
Definition: AMDGPUBaseInfo.cpp:1828
llvm::SIInstrFlags::FlatScratch
@ FlatScratch
Definition: SIDefines.h:109
llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition: TargetRegisterInfo.h:231
llvm::SIRegisterInfo::getWaveMaskRegClass
const TargetRegisterClass * getWaveMaskRegClass() const
Definition: SIRegisterInfo.h:283
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:749
getAddrSpace
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:248
llvm::getFunctionLiveInPhysReg
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:617
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:127
llvm::AMDGPUMachineFunction::getLDSSize
unsigned getLDSSize() const
Definition: AMDGPUMachineFunction.h:70
isZeroOrUndef
static bool isZeroOrUndef(int X)
Definition: AMDGPUInstructionSelector.cpp:2758
llvm::AMDGPU::SDWA::UNUSED_PRESERVE
@ UNUSED_PRESERVE
Definition: SIDefines.h:652
llvm::MachineOperand::isCImm
bool isCImm() const
isCImm - Test if this is a MO_CImmediate operand.
Definition: MachineOperand.h:322
llvm::ConstantFP::getValueAPF
const APFloat & getValueAPF() const
Definition: Constants.h:295
llvm::LLT::isValid
bool isValid() const
Definition: LowLevelTypeImpl.h:90
llvm::Optional
Definition: APInt.h:33
llvm::AMDGPUSubtarget::hasSDWA
bool hasSDWA() const
Definition: AMDGPUSubtarget.h:148
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:752
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
MachineIRBuilder.h
llvm::SIInstrInfo::getIndirectRegWriteMovRelPseudo
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
Definition: SIInstrInfo.cpp:1329
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:703
llvm::Lo_32
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:354
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1567
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:141
llvm::GCNSubtarget::hasScalarCompareEq64
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:798
llvm::ValueAndVReg::Value
APInt Value
Definition: Utils.h:175
llvm::AMDGPU::MIMGMIPMappingInfo::NONMIP
MIMGBaseOpcode NONMIP
Definition: AMDGPUBaseInfo.h:326
llvm::Data
@ Data
Definition: SIMachineScheduler.h:56
llvm::PointerUnion::get
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:187
llvm::ARMII::IndexMode
IndexMode
ARM Index Modes.
Definition: ARMBaseInfo.h:177
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:204
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUInstructionSelector.cpp:27
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::constrainSelectedInstRegOperands
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition: Utils.cpp:134
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::getUndefRegState
unsigned getUndefRegState(bool B)
Definition: MachineInstrBuilder.h:515
llvm::GCNSubtarget::unsafeDSOffsetFoldingEnabled
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:401
llvm::GCNSubtarget::hasGWSSemaReleaseAll
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:619
llvm::Optional::hasValue
constexpr bool hasValue() const
Definition: Optional.h:286
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
llvm::SIRegisterInfo::getRegClassForTypeOnBank
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank, const MachineRegisterInfo &MRI) const
Definition: SIRegisterInfo.h:268
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:205
llvm::MachineOperand::isKill
bool isKill() const
Definition: MachineOperand.h:387
llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition: MachineInstrBuilder.h:117
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:380
llvm::AMDGPU::CPol::CPol
CPol
Definition: SIDefines.h:281
getLogicalBitOpcode
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64)
Definition: AMDGPUInstructionSelector.cpp:265
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:565
llvm::MachineOperand::isImplicit
bool isImplicit() const
Definition: MachineOperand.h:377
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:27
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:147
llvm::RegisterBank
This class implements the register bank concept.
Definition: RegisterBank.h:28
llvm::MIPatternMatch::m_GZExt
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
Definition: MIPatternMatch.h:351
llvm::SIInstrInfo::isInlineConstant
bool isInlineConstant(const APInt &Imm) const
Definition: SIInstrInfo.cpp:3313
llvm::MachineOperand::CreateImm
static MachineOperand CreateImm(int64_t Val)
Definition: MachineOperand.h:770
llvm::GCNSubtarget::useVGPRIndexMode
bool useVGPRIndexMode() const
Definition: AMDGPUSubtarget.cpp:652
llvm::GCNSubtarget::hasAddr64
bool hasAddr64() const
Definition: GCNSubtarget.h:312
llvm::AMDGPUInstructionSelector::select
bool select(MachineInstr &I) override
Select the (possibly generic) instruction I to only use target-specific opcodes.
Definition: AMDGPUInstructionSelector.cpp:3049
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::MachineMemOperand::getAddrSpace
unsigned getAddrSpace() const
Definition: MachineMemOperand.h:218
llvm::MachineOperand::getImm
int64_t getImm() const
Definition: MachineOperand.h:534
llvm::GCNSubtarget::useFlatForGlobal
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:460
llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition: MachineFunction.h:653
llvm::PseudoSourceValue::isStack
bool isStack() const
Definition: PseudoSourceValue.h:68
llvm::AMDGPU::MIMGBaseOpcodeInfo
Definition: AMDGPUBaseInfo.h:280
Intr
unsigned Intr
Definition: AMDGPUBaseInfo.cpp:1927
llvm::LLT::getSizeInBits
unsigned getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelTypeImpl.h:109
llvm::MachineMemOperand::getValue
const Value * getValue() const
Return the base address of the memory access.
Definition: MachineMemOperand.h:200
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:488
llvm::CmpInst::ICMP_ULE
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:748
llvm::AMDGPUISD::DS_ORDERED_COUNT
@ DS_ORDERED_COUNT
Definition: AMDGPUISelLowering.h:492
llvm::AMDGPU::MIMGDimInfo::Encoding
uint8_t Encoding
Definition: AMDGPUBaseInfo.h:306
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::SIRegisterInfo::getReturnAddressReg
MCRegister getReturnAddressReg(const MachineFunction &MF) const
Definition: SIRegisterInfo.cpp:2327
llvm::InstructionSelector::CoverageInfo
CodeGenCoverage * CoverageInfo
Definition: InstructionSelector.h:434
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:373
gwsIntrinToOpcode
static unsigned gwsIntrinToOpcode(unsigned IntrID)
Definition: AMDGPUInstructionSelector.cpp:1296
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::MachineOperand::ChangeToImmediate
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
Definition: MachineOperand.cpp:156
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:196
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:49
llvm::MIPatternMatch::m_ZeroInt
SpecificConstantMatch m_ZeroInt()
{ Convenience matchers for specific integer values.
Definition: MIPatternMatch.h:88
llvm::SISrcMods::NEG_HI
@ NEG_HI
Definition: SIDefines.h:202
llvm::M0
unsigned M0(unsigned Val)
Definition: VE.h:371
llvm::AMDGPU::MIMGLZMappingInfo
Definition: AMDGPUBaseInfo.h:319
llvm::Instruction
Definition: Instruction.h:45
llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1070
llvm::AMDGPUTargetMachine::getNullPointerValue
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Definition: AMDGPUTargetMachine.cpp:653
llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition: TargetRegisterInfo.h:136
llvm::report_fatal_error
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
llvm::APInt::getZExtValue
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1631
llvm::APInt::getHighBitsSet
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Get a value with high bits set.
Definition: APInt.h:655
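A small self-contained sketch of the APInt helpers indexed here (the values are chosen only for illustration):
#include "llvm/ADT/APInt.h"
using namespace llvm;
static void apintSketch() {
  // 64-bit value with the top 16 bits set: 0xFFFF000000000000.
  APInt Mask = APInt::getHighBitsSet(/*numBits=*/64, /*hiBitsSet=*/16);
  uint64_t Raw = Mask.getZExtValue();  // 0xFFFF000000000000
  APInt Sra = Mask.ashr(48);           // arithmetic shift replicates the sign bit: all ones
  (void)Raw; (void)Sra;
}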
llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:47
llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition: SIRegisterInfo.cpp:426
llvm::AMDGPU::getMIMGLZMappingInfo
const LLVM_READONLY MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
llvm::LLT::vector
static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelTypeImpl.h:58
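For illustration, a sketch of building and querying a packed low-level type with the signature above (the wrapper function is hypothetical):
#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;
static void lltSketch() {
  LLT V2S32 = LLT::vector(/*NumElements=*/2, /*ScalarSizeInBits=*/32);
  unsigned Bits = V2S32.getSizeInBits();  // 64: two 32-bit lanes
  (void)Bits;
}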
matchZeroExtendFromS32
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg)
Match a zero extend from a 32-bit value to 64-bits.
Definition: AMDGPUInstructionSelector.cpp:3495
llvm::SIInstrFlags::DS
@ DS
Definition: SIDefines.h:52
llvm::MIPatternMatch::m_SpecificICst
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
Definition: MIPatternMatch.h:82
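A hedged sketch of how these matchers are typically combined with mi_match; Reg and MRI stand in for a virtual register and the function's MachineRegisterInfo, and the helper name is illustrative:
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
using namespace MIPatternMatch;
static bool isZeroOrFortyTwo(Register Reg, const MachineRegisterInfo &MRI) {
  // True if Reg is defined by a constant equal to 0 or to 42.
  return mi_match(Reg, MRI, m_ZeroInt()) ||
         mi_match(Reg, MRI, m_SpecificICst(42));
}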
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:29
llvm::PseudoSourceValue
Special value supplied for machine level alias analysis.
Definition: PseudoSourceValue.h:35
llvm::RegState::Implicit
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Definition: MachineInstrBuilder.h:47
llvm::APFloat::bitcastToAPInt
APInt bitcastToAPInt() const
Definition: APFloat.h:1132
llvm::AMDGPU::getMIMGOpcode
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
Definition: AMDGPUBaseInfo.cpp:138
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:374
GET_GLOBALISEL_TEMPORARIES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
llvm::MachineRegisterInfo::getVRegDef
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
Definition: MachineRegisterInfo.cpp:400
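A sketch of the common walk from a virtual register to its defining instruction (the helper name is illustrative):
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;
static bool isConstantZeroDef(Register Reg, const MachineRegisterInfo &MRI) {
  // SSA form: at most one definition, or null for physregs/undefined vregs.
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  return Def && Def->getOpcode() == TargetOpcode::G_CONSTANT &&
         Def->getOperand(1).getCImm()->isZero();
}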
llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition: MachineOperand.h:235
llvm::None
const NoneType None
Definition: None.h:23
llvm::RegState::Kill
@ Kill
The last use of a register.
Definition: MachineInstrBuilder.h:49
llvm::RegisterBankInfo::getSizeInBits
unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition: RegisterBankInfo.cpp:493
llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition: RegisterBankInfo.cpp:132
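A sketch of the usual pattern around this helper: pin a generic vreg to a concrete class and abandon selection if the constraint cannot be satisfied. SReg_32 is only an example class and the wrapper is hypothetical.
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
using namespace llvm;
static bool pinToSGPR32(Register Reg, MachineRegisterInfo &MRI) {
  // Returning false lets the caller fall back to another selection path.
  return RegisterBankInfo::constrainGenericRegister(
             Reg, AMDGPU::SReg_32RegClass, MRI) != nullptr;
}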
llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition: AMDGPUBaseInfo.cpp:1410
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:95
llvm::Hi_32
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:349
llvm::MachineInstrBuilder::cloneMemRefs
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Definition: MachineInstrBuilder.h:214
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:378
llvm::SIInstrInfo::MO_ABS32_LO
@ MO_ABS32_LO
Definition: SIInstrInfo.h:165
llvm::MachineRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
Definition: MachineRegisterInfo.h:634
llvm::APInt::ashr
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:963
llvm::AMDGPUSubtarget::getWavefrontSize
unsigned getWavefrontSize() const
Definition: AMDGPUSubtarget.h:180
llvm::GCNSubtarget::isWave64
bool isWave64() const
Definition: GCNSubtarget.h:1074
llvm::Triple::AMDHSA
@ AMDHSA
Definition: Triple.h:190
llvm::AMDGPUSubtarget::GFX10
@ GFX10
Definition: AMDGPUSubtarget.h:41
llvm::LLT::getAddressSpace
unsigned getAddressSpace() const
Definition: LowLevelTypeImpl.h:178
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:555
llvm::MachineInstrBuilder::addFrameIndex
const MachineInstrBuilder & addFrameIndex(int Idx) const
Definition: MachineInstrBuilder.h:153
llvm::InstructionSelector::MF
MachineFunction * MF
Definition: InstructionSelector.h:436
buildRSRC
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, uint32_t FormatLo, uint32_t FormatHi, Register BasePtr)
Return a resource descriptor for use with an arbitrary 64-bit pointer.
Definition: AMDGPUInstructionSelector.cpp:3947
llvm::AMDGPU::getMIMGDimInfo
const LLVM_READONLY MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
llvm::cl::opt< bool >
llvm::InstructionSelector::isOperandImmEqual
bool isOperandImmEqual(const MachineOperand &MO, int64_t Value, const MachineRegisterInfo &MRI) const
Definition: InstructionSelector.cpp:36
llvm::APFloat
Definition: APFloat.h:701
llvm::GCNSubtarget::usePRTStrictNull
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:488
llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:418
llvm::GlobalValue
Definition: GlobalValue.h:44
llvm::InstructionSelector
Provides the logic to select generic machine instructions.
Definition: InstructionSelector.h:418
llvm::countPopulation
unsigned countPopulation(T Value)
Count the number of set bits in a value.
Definition: MathExtras.h:568
llvm::MachineOperand::isUndef
bool isUndef() const
Definition: MachineOperand.h:392
AMDGPURegisterBankInfo.h
llvm::AMDGPURegisterBankInfo::getRegBankFromRegClass
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
Definition: AMDGPURegisterBankInfo.cpp:276
llvm::isInt< 32 >
constexpr bool isInt< 32 >(int64_t x)
Definition: MathExtras.h:374
llvm::MachineIRBuilder
Helper class to build MachineInstr.
Definition: MachineIRBuilder.h:220
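A sketch of the builder in action; MI and DstReg are assumed to come from the surrounding selection routine:
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;
static void emitCopyOfConst42(MachineInstr &MI, Register DstReg) {
  MachineIRBuilder B(MI);                         // insert before MI, reuse its DebugLoc
  auto K = B.buildConstant(LLT::scalar(32), 42);  // G_CONSTANT into a fresh s32 vreg
  B.buildCopy(DstReg, K.getReg(0));               // COPY the constant into DstReg
}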
llvm::CodeGenCoverage
Definition: CodeGenCoverage.h:20
llvm::isUInt< 16 >
constexpr bool isUInt< 16 >(uint64_t x)
Definition: MathExtras.h:409
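A self-contained sketch of the MathExtras predicates indexed nearby, of the kind used when validating immediates and offsets (the constants are illustrative):
#include "llvm/Support/MathExtras.h"
using namespace llvm;
static void immChecks() {
  int64_t Offset = 0x12345;
  bool FitsS32 = isInt<32>(Offset);            // true: fits a signed 32-bit immediate
  bool FitsU16 = isUInt<16>(Offset);           // false: needs 17 bits
  uint32_t Hi = Hi_32(0xAABBCCDD00112233ULL);  // 0xAABBCCDD
  unsigned Ones = countPopulation(0xF0u);      // 4 set bits
  (void)FitsS32; (void)FitsU16; (void)Hi; (void)Ones;
}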
llvm::AMDGPU::CPol::GLC
@ GLC
Definition: SIDefines.h:282
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:318
llvm::MachineOperand::getCImm
const ConstantInt * getCImm() const
Definition: MachineOperand.h:539
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:64
llvm::MachineInstrBuilder
Definition: MachineInstrBuilder.h:70
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
llvm::Triple::getOS
OSType getOS() const
getOS - Get the parsed operating system type of this triple.
Definition: Triple.h:316
llvm::AMDGPU::getBaseWithConstantOffset
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg)
Returns base register and constant offset.
Definition: AMDGPUGlobalISelUtils.cpp:17
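A hedged sketch of splitting an address into a base register and a constant addend before deciding whether the immediate is foldable; the wrapper and the 16-bit gate are illustrative only.
#include "AMDGPUGlobalISelUtils.h"
#include "llvm/Support/MathExtras.h"
#include <tuple>
using namespace llvm;
static bool offsetFitsU16(MachineRegisterInfo &MRI, Register AddrReg) {
  Register Base;
  unsigned ImmOffset;
  // Peel off a constant addend, if any; otherwise Base == AddrReg and ImmOffset == 0.
  std::tie(Base, ImmOffset) = AMDGPU::getBaseWithConstantOffset(MRI, AddrReg);
  (void)Base;
  return isUInt<16>(ImmOffset);
}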
buildOffsetSrc
static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
Definition: AMDGPUInstructionSelector.cpp:3999
buildAddr64RSrc
static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, const SIInstrInfo &TII, Register BasePtr)
Definition: AMDGPUInstructionSelector.cpp:3990
llvm::PointerUnion::dyn_cast
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:194
llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition: MachineMemOperand.h:37
llvm::GCNSubtarget::hasAddNoCarry
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:627
isConstant
static bool isConstant(const MachineInstr &MI)
Definition: AMDGPUInstructionSelector.cpp:2267
llvm::GCNSubtarget::privateMemoryResourceIsRangeChecked
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:482
llvm::SIRegisterInfo::getRegClassForSizeOnBank
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank, const MachineRegisterInfo &MRI) const
Definition: SIRegisterInfo.cpp:2333
llvm::MachineOperand::isDead
bool isDead() const
Definition: MachineOperand.h:382
llvm::AMDGPUMachineFunction::isEntryFunction
bool isEntryFunction() const
Definition: AMDGPUMachineFunction.h:78
llvm::SIInstrInfo::isLegalFLATOffset
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
Definition: SIInstrInfo.cpp:7302
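A sketch of gating an offset fold on this check. The wrapper and ConstOffset are hypothetical, and SIInstrFlags::FlatGlobal is assumed here to be the FLAT-variant flag for global instructions in this version.
#include "AMDGPU.h"
#include "SIInstrInfo.h"
using namespace llvm;
static bool canFoldGlobalOffset(const SIInstrInfo &TII, int64_t ConstOffset) {
  // Only fold the immediate if the global (FLAT segment) encoding can hold it.
  return TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal);
}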
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440