1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "SIInstrInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "GCNHazardRecognizer.h"
19 #include "SIDefines.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
23 #include "Utils/AMDGPUBaseInfo.h"
24 #include "llvm/ADT/APInt.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/StringRef.h"
47 #include "llvm/IR/DebugLoc.h"
48 #include "llvm/IR/DiagnosticInfo.h"
49 #include "llvm/IR/Function.h"
50 #include "llvm/IR/InlineAsm.h"
51 #include "llvm/IR/LLVMContext.h"
52 #include "llvm/MC/MCInstrDesc.h"
53 #include "llvm/Support/Casting.h"
55 #include "llvm/Support/Compiler.h"
60 #include <cassert>
61 #include <cstdint>
62 #include <iterator>
63 #include <utility>
64 
65 using namespace llvm;
66 
67 #define GET_INSTRINFO_CTOR_DTOR
68 #include "AMDGPUGenInstrInfo.inc"
69 
70 namespace llvm {
71 namespace AMDGPU {
72 #define GET_D16ImageDimIntrinsics_IMPL
73 #define GET_ImageDimIntrinsicTable_IMPL
74 #define GET_RsrcIntrinsics_IMPL
75 #include "AMDGPUGenSearchableTables.inc"
76 }
77 }
78 
79 
80 // Must be at least 4 to be able to branch over minimum unconditional branch
81 // code. This is only for making it possible to write reasonably small tests for
82 // long branches.
83 static cl::opt<unsigned>
84 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
85  cl::desc("Restrict range of branch instructions (DEBUG)"));
86 
87 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
88  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
89  RI(ST), ST(ST) {}
90 
91 //===----------------------------------------------------------------------===//
92 // TargetInstrInfo callbacks
93 //===----------------------------------------------------------------------===//
94 
95 static unsigned getNumOperandsNoGlue(SDNode *Node) {
96  unsigned N = Node->getNumOperands();
97  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
98  --N;
99  return N;
100 }
101 
102 static SDValue findChainOperand(SDNode *Load) {
103  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
104  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
105  return LastOp;
106 }
107 
108 /// Returns true if both nodes have the same value for the given
109 /// operand \p OpName, or if both nodes do not have this operand.
110 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
111  unsigned Opc0 = N0->getMachineOpcode();
112  unsigned Opc1 = N1->getMachineOpcode();
113 
114  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
115  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
116 
117  if (Op0Idx == -1 && Op1Idx == -1)
118  return true;
119 
120 
121  if ((Op0Idx == -1 && Op1Idx != -1) ||
122  (Op1Idx == -1 && Op0Idx != -1))
123  return false;
124 
125  // getNamedOperandIdx returns the index for the MachineInstr's operands,
126  // which includes the result as the first operand. We are indexing into the
127  // MachineSDNode's operands, so we need to skip the result operand to get
128  // the real index.
129  --Op0Idx;
130  --Op1Idx;
131 
132  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
133 }
134 
135 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
136  AliasAnalysis *AA) const {
137  // TODO: The generic check fails for VALU instructions that should be
138  // rematerializable due to implicit reads of exec. We really want all of the
139  // generic logic for this except for this.
140  switch (MI.getOpcode()) {
141  case AMDGPU::V_MOV_B32_e32:
142  case AMDGPU::V_MOV_B32_e64:
143  case AMDGPU::V_MOV_B64_PSEUDO:
144  return true;
145  default:
146  return false;
147  }
148 }
149 
150 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
151  int64_t &Offset0,
152  int64_t &Offset1) const {
153  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
154  return false;
155 
156  unsigned Opc0 = Load0->getMachineOpcode();
157  unsigned Opc1 = Load1->getMachineOpcode();
158 
159  // Make sure both are actually loads.
160  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
161  return false;
162 
163  if (isDS(Opc0) && isDS(Opc1)) {
164 
165  // FIXME: Handle this case:
166  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
167  return false;
168 
169  // Check base reg.
170  if (Load0->getOperand(1) != Load1->getOperand(1))
171  return false;
172 
173  // Check chain.
174  if (findChainOperand(Load0) != findChainOperand(Load1))
175  return false;
176 
177  // Skip read2 / write2 variants for simplicity.
178  // TODO: We should report true if the used offsets are adjacent (excluded
179  // st64 versions).
180  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
181  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
182  return false;
183 
184  Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
185  Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
186  return true;
187  }
188 
189  if (isSMRD(Opc0) && isSMRD(Opc1)) {
190  // Skip time and cache invalidation instructions.
191  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
192  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
193  return false;
194 
195  assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
196 
197  // Check base reg.
198  if (Load0->getOperand(0) != Load1->getOperand(0))
199  return false;
200 
201  const ConstantSDNode *Load0Offset =
202  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
203  const ConstantSDNode *Load1Offset =
204  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
205 
206  if (!Load0Offset || !Load1Offset)
207  return false;
208 
209  // Check chain.
210  if (findChainOperand(Load0) != findChainOperand(Load1))
211  return false;
212 
213  Offset0 = Load0Offset->getZExtValue();
214  Offset1 = Load1Offset->getZExtValue();
215  return true;
216  }
217 
218  // MUBUF and MTBUF can access the same addresses.
219  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
220 
221  // MUBUF and MTBUF have vaddr at different indices.
222  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
223  findChainOperand(Load0) != findChainOperand(Load1) ||
224  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
225  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
226  return false;
227 
228  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
229  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
230 
231  if (OffIdx0 == -1 || OffIdx1 == -1)
232  return false;
233 
234  // getNamedOperandIdx returns the index for MachineInstrs. Since they
235  // include the output in the operand list, but SDNodes don't, we need to
236  // subtract the index by one.
237  --OffIdx0;
238  --OffIdx1;
239 
240  SDValue Off0 = Load0->getOperand(OffIdx0);
241  SDValue Off1 = Load1->getOperand(OffIdx1);
242 
243  // The offset might be a FrameIndexSDNode.
244  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
245  return false;
246 
247  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
248  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
249  return true;
250  }
251 
252  return false;
253 }
254 
255 static bool isStride64(unsigned Opc) {
256  switch (Opc) {
257  case AMDGPU::DS_READ2ST64_B32:
258  case AMDGPU::DS_READ2ST64_B64:
259  case AMDGPU::DS_WRITE2ST64_B32:
260  case AMDGPU::DS_WRITE2ST64_B64:
261  return true;
262  default:
263  return false;
264  }
265 }
266 
267 bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
268  MachineOperand *&BaseOp,
269  int64_t &Offset,
270  const TargetRegisterInfo *TRI) const {
271  unsigned Opc = LdSt.getOpcode();
272 
273  if (isDS(LdSt)) {
274  const MachineOperand *OffsetImm =
275  getNamedOperand(LdSt, AMDGPU::OpName::offset);
276  if (OffsetImm) {
277  // Normal, single offset LDS instruction.
278  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
279  Offset = OffsetImm->getImm();
280  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
281  "operands of type register.");
282  return true;
283  }
284 
285  // The 2 offset instructions use offset0 and offset1 instead. We can treat
286  // these as a load with a single offset if the 2 offsets are consecutive. We
287  // will use this for some partially aligned loads.
288  const MachineOperand *Offset0Imm =
289  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
290  const MachineOperand *Offset1Imm =
291  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
292 
293  uint8_t Offset0 = Offset0Imm->getImm();
294  uint8_t Offset1 = Offset1Imm->getImm();
295 
296  if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
297  // Each of these offsets is in element sized units, so we need to convert
298  // to bytes of the individual reads.
299 
300  unsigned EltSize;
301  if (LdSt.mayLoad())
302  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
303  else {
304  assert(LdSt.mayStore());
305  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
306  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
307  }
308 
309  if (isStride64(Opc))
310  EltSize *= 64;
311 
312  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
313  Offset = EltSize * Offset0;
314  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
315  "operands of type register.");
316  return true;
317  }
318 
319  return false;
320  }
321 
322  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
323  const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
324  if (SOffset && SOffset->isReg())
325  return false;
326 
327  MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
328  if (!AddrReg)
329  return false;
330 
331  const MachineOperand *OffsetImm =
332  getNamedOperand(LdSt, AMDGPU::OpName::offset);
333  BaseOp = AddrReg;
334  Offset = OffsetImm->getImm();
335 
336  if (SOffset) // soffset can be an inline immediate.
337  Offset += SOffset->getImm();
338 
339  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
340  "operands of type register.");
341  return true;
342  }
343 
344  if (isSMRD(LdSt)) {
345  const MachineOperand *OffsetImm =
346  getNamedOperand(LdSt, AMDGPU::OpName::offset);
347  if (!OffsetImm)
348  return false;
349 
350  MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
351  BaseOp = SBaseReg;
352  Offset = OffsetImm->getImm();
353  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
354  "operands of type register.");
355  return true;
356  }
357 
358  if (isFLAT(LdSt)) {
359  MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
360  if (VAddr) {
361  // Can't analyze 2 offsets.
362  if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
363  return false;
364 
365  BaseOp = VAddr;
366  } else {
367  // scratch instructions have either vaddr or saddr.
368  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
369  }
370 
371  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
372  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
373  "operands of type register.");
374  return true;
375  }
376 
377  return false;
378 }
379 
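// Returns true when the two memory instructions are known to use the same
// base address: either their base operands are identical, or each has a
// single memory operand and both resolve to the same underlying IR object
// in the same address space.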
380 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
381  const MachineOperand &BaseOp1,
382  const MachineInstr &MI2,
383  const MachineOperand &BaseOp2) {
384  // Support only base operands with base registers.
385  // Note: this could be extended to support FI operands.
386  if (!BaseOp1.isReg() || !BaseOp2.isReg())
387  return false;
388 
389  if (BaseOp1.isIdenticalTo(BaseOp2))
390  return true;
391 
392  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
393  return false;
394 
395  auto MO1 = *MI1.memoperands_begin();
396  auto MO2 = *MI2.memoperands_begin();
397  if (MO1->getAddrSpace() != MO2->getAddrSpace())
398  return false;
399 
400  auto Base1 = MO1->getValue();
401  auto Base2 = MO2->getValue();
402  if (!Base1 || !Base2)
403  return false;
404  const MachineFunction &MF = *MI1.getParent()->getParent();
405  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
406  Base1 = GetUnderlyingObject(Base1, DL);
407  Base2 = GetUnderlyingObject(Base2, DL);
408 
409  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
410  return false;
411 
412  return Base1 == Base2;
413 }
414 
415 bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
416  MachineOperand &BaseOp2,
417  unsigned NumLoads) const {
418  MachineInstr &FirstLdSt = *BaseOp1.getParent();
419  MachineInstr &SecondLdSt = *BaseOp2.getParent();
420 
421  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
422  return false;
423 
424  const MachineOperand *FirstDst = nullptr;
425  const MachineOperand *SecondDst = nullptr;
426 
427  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
428  (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
429  (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
430  const unsigned MaxGlobalLoadCluster = 6;
431  if (NumLoads > MaxGlobalLoadCluster)
432  return false;
433 
434  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
435  if (!FirstDst)
436  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
437  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
438  if (!SecondDst)
439  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
440  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
441  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
442  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
443  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
444  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
445  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
446  }
447 
448  if (!FirstDst || !SecondDst)
449  return false;
450 
451  // Try to limit clustering based on the total number of bytes loaded
452  // rather than the number of instructions. This is done to help reduce
453  // register pressure. The method used is somewhat inexact, though,
454  // because it assumes that all loads in the cluster will load the
455  // same number of bytes as FirstLdSt.
456 
457  // The unit of this value is bytes.
458  // FIXME: This needs finer tuning.
459  unsigned LoadClusterThreshold = 16;
460 
461  const MachineRegisterInfo &MRI =
462  FirstLdSt.getParent()->getParent()->getRegInfo();
463  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
464 
465  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
466 }
467 
468 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
469 // the first 16 loads will be interleaved with the stores, and the next 16 will
470 // be clustered as expected. It should really split into two 16-store batches.
471 //
472 // Loads are clustered until this returns false, rather than trying to schedule
473 // groups of stores. This also means we have to deal with saying different
474 // address space loads should be clustered, and ones which might cause bank
475 // conflicts.
476 //
477 // This might be deprecated so it might not be worth that much effort to fix.
478 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
479  int64_t Offset0, int64_t Offset1,
480  unsigned NumLoads) const {
481  assert(Offset1 > Offset0 &&
482  "Second offset should be larger than first offset!");
483  // If we have less than 16 loads in a row, and the offsets are within 64
484  // bytes, then schedule together.
485 
486  // A cacheline is 64 bytes (for global memory).
487  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
488 }
489 
490 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
491  MachineBasicBlock::iterator MI,
492  const DebugLoc &DL, unsigned DestReg,
493  unsigned SrcReg, bool KillSrc) {
494  MachineFunction *MF = MBB.getParent();
495  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
496  "illegal SGPR to VGPR copy",
497  DL, DS_Error);
498  LLVMContext &C = MF->getFunction().getContext();
499  C.diagnose(IllegalCopy);
500 
501  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
502  .addReg(SrcReg, getKillRegState(KillSrc));
503 }
504 
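// Copy between physical registers, splitting wide register classes into
// per-subregister S_MOV_B32/S_MOV_B64 or V_MOV_B32 moves and diagnosing
// illegal VGPR-to-SGPR copies via reportIllegalCopy.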
505 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
506  MachineBasicBlock::iterator MI,
507  const DebugLoc &DL, unsigned DestReg,
508  unsigned SrcReg, bool KillSrc) const {
509  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
510 
511  if (RC == &AMDGPU::VGPR_32RegClass) {
512  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
513  AMDGPU::SReg_32RegClass.contains(SrcReg));
514  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
515  .addReg(SrcReg, getKillRegState(KillSrc));
516  return;
517  }
518 
519  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
520  RC == &AMDGPU::SReg_32RegClass) {
521  if (SrcReg == AMDGPU::SCC) {
522  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
523  .addImm(-1)
524  .addImm(0);
525  return;
526  }
527 
528  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
529  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
530  return;
531  }
532 
533  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
534  .addReg(SrcReg, getKillRegState(KillSrc));
535  return;
536  }
537 
538  if (RC == &AMDGPU::SReg_64RegClass) {
539  if (DestReg == AMDGPU::VCC) {
540  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
541  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
542  .addReg(SrcReg, getKillRegState(KillSrc));
543  } else {
544  // FIXME: Hack until VReg_1 removed.
545  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
546  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
547  .addImm(0)
548  .addReg(SrcReg, getKillRegState(KillSrc));
549  }
550 
551  return;
552  }
553 
554  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
555  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
556  return;
557  }
558 
559  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
560  .addReg(SrcReg, getKillRegState(KillSrc));
561  return;
562  }
563 
564  if (DestReg == AMDGPU::SCC) {
565  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
566  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
567  .addReg(SrcReg, getKillRegState(KillSrc))
568  .addImm(0);
569  return;
570  }
571 
572  unsigned EltSize = 4;
573  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
574  if (RI.isSGPRClass(RC)) {
575  if (RI.getRegSizeInBits(*RC) > 32) {
576  Opcode = AMDGPU::S_MOV_B64;
577  EltSize = 8;
578  } else {
579  Opcode = AMDGPU::S_MOV_B32;
580  EltSize = 4;
581  }
582 
583  if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
584  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
585  return;
586  }
587  }
588 
589  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
590  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
591 
592  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
593  unsigned SubIdx;
594  if (Forward)
595  SubIdx = SubIndices[Idx];
596  else
597  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
598 
599  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
600  get(Opcode), RI.getSubReg(DestReg, SubIdx));
601 
602  Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
603 
604  if (Idx == 0)
605  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
606 
607  bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
608  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
609  }
610 }
611 
612 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
613  int NewOpc;
614 
615  // Try to map original to commuted opcode
616  NewOpc = AMDGPU::getCommuteRev(Opcode);
617  if (NewOpc != -1)
618  // Check if the commuted (REV) opcode exists on the target.
619  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
620 
621  // Try to map commuted to original opcode
622  NewOpc = AMDGPU::getCommuteOrig(Opcode);
623  if (NewOpc != -1)
624  // Check if the original (non-REV) opcode exists on the target.
625  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
626 
627  return Opcode;
628 }
629 
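// Materialize an immediate into DestReg with a single S_MOV/V_MOV when the
// destination class allows it; otherwise write the value into the first
// sub-register and zero into the remaining ones.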
630 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
631  MachineBasicBlock::iterator MI,
632  const DebugLoc &DL, unsigned DestReg,
633  int64_t Value) const {
634  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
635  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
636  if (RegClass == &AMDGPU::SReg_32RegClass ||
637  RegClass == &AMDGPU::SGPR_32RegClass ||
638  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
639  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
640  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
641  .addImm(Value);
642  return;
643  }
644 
645  if (RegClass == &AMDGPU::SReg_64RegClass ||
646  RegClass == &AMDGPU::SGPR_64RegClass ||
647  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
648  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
649  .addImm(Value);
650  return;
651  }
652 
653  if (RegClass == &AMDGPU::VGPR_32RegClass) {
654  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
655  .addImm(Value);
656  return;
657  }
658  if (RegClass == &AMDGPU::VReg_64RegClass) {
659  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
660  .addImm(Value);
661  return;
662  }
663 
664  unsigned EltSize = 4;
665  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
666  if (RI.isSGPRClass(RegClass)) {
667  if (RI.getRegSizeInBits(*RegClass) > 32) {
668  Opcode = AMDGPU::S_MOV_B64;
669  EltSize = 8;
670  } else {
671  Opcode = AMDGPU::S_MOV_B32;
672  EltSize = 4;
673  }
674  }
675 
676  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
677  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
678  int64_t IdxValue = Idx == 0 ? Value : 0;
679 
680  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
681  get(Opcode), RI.getSubReg(DestReg, Idx));
682  Builder.addImm(IdxValue);
683  }
684 }
685 
686 const TargetRegisterClass *
687 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
688  return &AMDGPU::VGPR_32RegClass;
689 }
690 
691 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
692  MachineBasicBlock::iterator I,
693  const DebugLoc &DL, unsigned DstReg,
694  ArrayRef<MachineOperand> Cond,
695  unsigned TrueReg,
696  unsigned FalseReg) const {
697  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
698  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
699  "Not a VGPR32 reg");
700 
701  if (Cond.size() == 1) {
702  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
703  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
704  .add(Cond[0]);
705  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
706  .addReg(FalseReg)
707  .addReg(TrueReg)
708  .addReg(SReg);
709  } else if (Cond.size() == 2) {
710  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
711  switch (Cond[0].getImm()) {
712  case SIInstrInfo::SCC_TRUE: {
713  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
714  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
715  .addImm(-1)
716  .addImm(0);
717  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
718  .addReg(FalseReg)
719  .addReg(TrueReg)
720  .addReg(SReg);
721  break;
722  }
723  case SIInstrInfo::SCC_FALSE: {
724  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
725  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
726  .addImm(0)
727  .addImm(-1);
728  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
729  .addReg(FalseReg)
730  .addReg(TrueReg)
731  .addReg(SReg);
732  break;
733  }
734  case SIInstrInfo::VCCNZ: {
735  MachineOperand RegOp = Cond[1];
736  RegOp.setImplicit(false);
737  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
738  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
739  .add(RegOp);
740  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
741  .addReg(FalseReg)
742  .addReg(TrueReg)
743  .addReg(SReg);
744  break;
745  }
746  case SIInstrInfo::VCCZ: {
747  MachineOperand RegOp = Cond[1];
748  RegOp.setImplicit(false);
749  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
750  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
751  .add(RegOp);
752  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
753  .addReg(TrueReg)
754  .addReg(FalseReg)
755  .addReg(SReg);
756  break;
757  }
758  case SIInstrInfo::EXECNZ: {
759  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
760  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
761  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
762  .addImm(0);
763  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
764  .addImm(-1)
765  .addImm(0);
766  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
767  .addReg(FalseReg)
768  .addReg(TrueReg)
769  .addReg(SReg);
770  break;
771  }
772  case SIInstrInfo::EXECZ: {
773  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
774  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
775  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
776  .addImm(0);
777  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
778  .addImm(0)
779  .addImm(-1);
780  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
781  .addReg(FalseReg)
782  .addReg(TrueReg)
783  .addReg(SReg);
784  llvm_unreachable("Unhandled branch predicate EXECZ");
785  break;
786  }
787  default:
788  llvm_unreachable("invalid branch predicate");
789  }
790  } else {
791  llvm_unreachable("Can only handle Cond size 1 or 2");
792  }
793 }
794 
795 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
796  MachineBasicBlock::iterator I,
797  const DebugLoc &DL,
798  unsigned SrcReg, int Value) const {
799  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
800  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
801  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
802  .addImm(Value)
803  .addReg(SrcReg);
804 
805  return Reg;
806 }
807 
808 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
809  MachineBasicBlock::iterator I,
810  const DebugLoc &DL,
811  unsigned SrcReg, int Value) const {
812  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
813  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
814  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
815  .addImm(Value)
816  .addReg(SrcReg);
817 
818  return Reg;
819 }
820 
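// Pick the move opcode matching DstRC: S_MOV_B32/V_MOV_B32_e32 for 32-bit
// classes, S_MOV_B64 or V_MOV_B64_PSEUDO for 64-bit classes, COPY otherwise.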
821 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
822 
823  if (RI.getRegSizeInBits(*DstRC) == 32) {
824  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
825  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
826  return AMDGPU::S_MOV_B64;
827  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
828  return AMDGPU::V_MOV_B64_PSEUDO;
829  }
830  return AMDGPU::COPY;
831 }
832 
833 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
834  switch (Size) {
835  case 4:
836  return AMDGPU::SI_SPILL_S32_SAVE;
837  case 8:
838  return AMDGPU::SI_SPILL_S64_SAVE;
839  case 16:
840  return AMDGPU::SI_SPILL_S128_SAVE;
841  case 32:
842  return AMDGPU::SI_SPILL_S256_SAVE;
843  case 64:
844  return AMDGPU::SI_SPILL_S512_SAVE;
845  default:
846  llvm_unreachable("unknown register size");
847  }
848 }
849 
850 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
851  switch (Size) {
852  case 4:
853  return AMDGPU::SI_SPILL_V32_SAVE;
854  case 8:
855  return AMDGPU::SI_SPILL_V64_SAVE;
856  case 12:
857  return AMDGPU::SI_SPILL_V96_SAVE;
858  case 16:
859  return AMDGPU::SI_SPILL_V128_SAVE;
860  case 32:
861  return AMDGPU::SI_SPILL_V256_SAVE;
862  case 64:
863  return AMDGPU::SI_SPILL_V512_SAVE;
864  default:
865  llvm_unreachable("unknown register size");
866  }
867 }
868 
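// Spill SrcReg to stack slot FrameIndex using the SGPR or VGPR spill pseudo
// that matches the spill size, adding the scratch rsrc and frame offset
// registers the later expansion needs.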
869 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
870  MachineBasicBlock::iterator MI,
871  unsigned SrcReg, bool isKill,
872  int FrameIndex,
873  const TargetRegisterClass *RC,
874  const TargetRegisterInfo *TRI) const {
875  MachineFunction *MF = MBB.getParent();
876  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
877  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
878  const DebugLoc &DL = MBB.findDebugLoc(MI);
879 
880  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
881  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
882  MachinePointerInfo PtrInfo
883  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
884  MachineMemOperand *MMO
885  = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
886  Size, Align);
887  unsigned SpillSize = TRI->getSpillSize(*RC);
888 
889  if (RI.isSGPRClass(RC)) {
890  MFI->setHasSpilledSGPRs();
891 
892  // We are only allowed to create one new instruction when spilling
893  // registers, so we need to use pseudo instruction for spilling SGPRs.
894  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
895 
896  // The SGPR spill/restore instructions only work on numbered SGPRs, so we
897  // need to make sure we are using the correct register class.
898  if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
899  MachineRegisterInfo &MRI = MF->getRegInfo();
900  MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
901  }
902 
903  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
904  .addReg(SrcReg, getKillRegState(isKill)) // data
905  .addFrameIndex(FrameIndex) // addr
906  .addMemOperand(MMO)
907  .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
908  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
909  // Add the scratch resource registers as implicit uses because we may end up
910  // needing them, and need to ensure that the reserved registers are
911  // correctly handled.
912 
913  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
914  if (ST.hasScalarStores()) {
915  // m0 is used for offset to scalar stores if used to spill.
916  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
917  }
918 
919  return;
920  }
921 
922  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
923 
924  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
925  MFI->setHasSpilledVGPRs();
926  BuildMI(MBB, MI, DL, get(Opcode))
927  .addReg(SrcReg, getKillRegState(isKill)) // data
928  .addFrameIndex(FrameIndex) // addr
929  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
930  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
931  .addImm(0) // offset
932  .addMemOperand(MMO);
933 }
934 
935 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
936  switch (Size) {
937  case 4:
938  return AMDGPU::SI_SPILL_S32_RESTORE;
939  case 8:
940  return AMDGPU::SI_SPILL_S64_RESTORE;
941  case 16:
942  return AMDGPU::SI_SPILL_S128_RESTORE;
943  case 32:
944  return AMDGPU::SI_SPILL_S256_RESTORE;
945  case 64:
946  return AMDGPU::SI_SPILL_S512_RESTORE;
947  default:
948  llvm_unreachable("unknown register size");
949  }
950 }
951 
952 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
953  switch (Size) {
954  case 4:
955  return AMDGPU::SI_SPILL_V32_RESTORE;
956  case 8:
957  return AMDGPU::SI_SPILL_V64_RESTORE;
958  case 12:
959  return AMDGPU::SI_SPILL_V96_RESTORE;
960  case 16:
961  return AMDGPU::SI_SPILL_V128_RESTORE;
962  case 32:
963  return AMDGPU::SI_SPILL_V256_RESTORE;
964  case 64:
965  return AMDGPU::SI_SPILL_V512_RESTORE;
966  default:
967  llvm_unreachable("unknown register size");
968  }
969 }
970 
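// Reload DestReg from stack slot FrameIndex using the matching SGPR or VGPR
// restore pseudo, mirroring storeRegToStackSlot.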
971 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
972  MachineBasicBlock::iterator MI,
973  unsigned DestReg, int FrameIndex,
974  const TargetRegisterClass *RC,
975  const TargetRegisterInfo *TRI) const {
976  MachineFunction *MF = MBB.getParent();
977  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
978  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
979  const DebugLoc &DL = MBB.findDebugLoc(MI);
980  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
981  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
982  unsigned SpillSize = TRI->getSpillSize(*RC);
983 
984  MachinePointerInfo PtrInfo
985  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
986 
987  MachineMemOperand *MMO = MF->getMachineMemOperand(
988  PtrInfo, MachineMemOperand::MOLoad, Size, Align);
989 
990  if (RI.isSGPRClass(RC)) {
991  MFI->setHasSpilledSGPRs();
992 
993  // FIXME: Maybe this should not include a memoperand because it will be
994  // lowered to non-memory instructions.
995  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
996  if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
997  MachineRegisterInfo &MRI = MF->getRegInfo();
998  MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
999  }
1000 
1001  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
1002  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
1003  .addFrameIndex(FrameIndex) // addr
1004  .addMemOperand(MMO)
1005  .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
1006  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1007 
1008  if (ST.hasScalarStores()) {
1009  // m0 is used for offset to scalar stores if used to spill.
1010  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1011  }
1012 
1013  return;
1014  }
1015 
1016  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1017 
1018  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1019  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1020  .addFrameIndex(FrameIndex) // vaddr
1021  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1022  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1023  .addImm(0) // offset
1024  .addMemOperand(MMO);
1025 }
1026 
1027 /// \param Offset Offset in bytes of the FrameIndex being spilled
1028 unsigned SIInstrInfo::calculateLDSSpillAddress(
1029  MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1030  unsigned FrameOffset, unsigned Size) const {
1031  MachineFunction *MF = MBB.getParent();
1032  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1033  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1034  const DebugLoc &DL = MBB.findDebugLoc(MI);
1035  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1036  unsigned WavefrontSize = ST.getWavefrontSize();
1037 
1038  unsigned TIDReg = MFI->getTIDReg();
1039  if (!MFI->hasCalculatedTID()) {
1040  MachineBasicBlock &Entry = MBB.getParent()->front();
1041  MachineBasicBlock::iterator Insert = Entry.front();
1042  const DebugLoc &DL = Insert->getDebugLoc();
1043 
1044  TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1045  *MF);
1046  if (TIDReg == AMDGPU::NoRegister)
1047  return TIDReg;
1048 
1049  if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
1050  WorkGroupSize > WavefrontSize) {
1051  unsigned TIDIGXReg
1052  = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1053  unsigned TIDIGYReg
1054  = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1055  unsigned TIDIGZReg
1056  = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1057  unsigned InputPtrReg =
1058  MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1059  for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1060  if (!Entry.isLiveIn(Reg))
1061  Entry.addLiveIn(Reg);
1062  }
1063 
1064  RS->enterBasicBlock(Entry);
1065  // FIXME: Can we scavenge an SReg_64 and access the subregs?
1066  unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1067  unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1068  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1069  .addReg(InputPtrReg)
1070  .addImm(SI::KernelInputOffsets::NGROUPS_Z);
1071  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1072  .addReg(InputPtrReg)
1073  .addImm(SI::KernelInputOffsets::NGROUPS_Y);
1074 
1075  // NGROUPS.X * NGROUPS.Y
1076  BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1077  .addReg(STmp1)
1078  .addReg(STmp0);
1079  // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1080  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1081  .addReg(STmp1)
1082  .addReg(TIDIGXReg);
1083  // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1084  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1085  .addReg(STmp0)
1086  .addReg(TIDIGYReg)
1087  .addReg(TIDReg);
1088  // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1089  getAddNoCarry(Entry, Insert, DL, TIDReg)
1090  .addReg(TIDReg)
1091  .addReg(TIDIGZReg);
1092  } else {
1093  // Get the wave id
1094  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1095  TIDReg)
1096  .addImm(-1)
1097  .addImm(0);
1098 
1099  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1100  TIDReg)
1101  .addImm(-1)
1102  .addReg(TIDReg);
1103  }
1104 
1105  BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1106  TIDReg)
1107  .addImm(2)
1108  .addReg(TIDReg);
1109  MFI->setTIDReg(TIDReg);
1110  }
1111 
1112  // Add FrameIndex to LDS offset
1113  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1114  getAddNoCarry(MBB, MI, DL, TmpReg)
1115  .addImm(LDSOffset)
1116  .addReg(TIDReg);
1117 
1118  return TmpReg;
1119 }
1120 
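// Insert S_NOP instructions covering Count wait states before MI; each S_NOP
// immediate N encodes N + 1 wait states, so at most 8 per instruction.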
1121 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1122  MachineBasicBlock::iterator MI,
1123  int Count) const {
1124  DebugLoc DL = MBB.findDebugLoc(MI);
1125  while (Count > 0) {
1126  int Arg;
1127  if (Count >= 8)
1128  Arg = 7;
1129  else
1130  Arg = Count - 1;
1131  Count -= 8;
1132  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1133  .addImm(Arg);
1134  }
1135 }
1136 
1137 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1138  MachineBasicBlock::iterator MI) const {
1139  insertWaitStates(MBB, MI, 1);
1140 }
1141 
1142 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1143  auto MF = MBB.getParent();
1144  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1145 
1146  assert(Info->isEntryFunction());
1147 
1148  if (MBB.succ_empty()) {
1149  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1150  if (HasNoTerminator)
1151  BuildMI(MBB, MBB.end(), DebugLoc(),
1152  get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1153  }
1154 }
1155 
1156 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1157  switch (MI.getOpcode()) {
1158  default: return 1; // FIXME: Do wait states equal cycles?
1159 
1160  case AMDGPU::S_NOP:
1161  return MI.getOperand(0).getImm() + 1;
1162  }
1163 }
1164 
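// Expand pseudo instructions that survive register allocation (terminator
// moves, V_MOV_B64_PSEUDO, V_SET_INACTIVE_*, V_MOVRELD_*, SI_PC_ADD_REL_OFFSET,
// EXIT_WWM and load bundles) into real instructions.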
1165 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1166  MachineBasicBlock &MBB = *MI.getParent();
1167  DebugLoc DL = MBB.findDebugLoc(MI);
1168  switch (MI.getOpcode()) {
1169  default: return TargetInstrInfo::expandPostRAPseudo(MI);
1170  case AMDGPU::S_MOV_B64_term:
1171  // This is only a terminator to get the correct spill code placement during
1172  // register allocation.
1173  MI.setDesc(get(AMDGPU::S_MOV_B64));
1174  break;
1175 
1176  case AMDGPU::S_XOR_B64_term:
1177  // This is only a terminator to get the correct spill code placement during
1178  // register allocation.
1179  MI.setDesc(get(AMDGPU::S_XOR_B64));
1180  break;
1181 
1182  case AMDGPU::S_ANDN2_B64_term:
1183  // This is only a terminator to get the correct spill code placement during
1184  // register allocation.
1185  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1186  break;
1187 
1188  case AMDGPU::V_MOV_B64_PSEUDO: {
1189  unsigned Dst = MI.getOperand(0).getReg();
1190  unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1191  unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1192 
1193  const MachineOperand &SrcOp = MI.getOperand(1);
1194  // FIXME: Will this work for 64-bit floating point immediates?
1195  assert(!SrcOp.isFPImm());
1196  if (SrcOp.isImm()) {
1197  APInt Imm(64, SrcOp.getImm());
1198  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1199  .addImm(Imm.getLoBits(32).getZExtValue())
1200  .addReg(Dst, RegState::Implicit | RegState::Define);
1201  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1202  .addImm(Imm.getHiBits(32).getZExtValue())
1203  .addReg(Dst, RegState::Implicit | RegState::Define);
1204  } else {
1205  assert(SrcOp.isReg());
1206  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1207  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1208  .addReg(Dst, RegState::Implicit | RegState::Define);
1209  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1210  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1211  .addReg(Dst, RegState::Implicit | RegState::Define);
1212  }
1213  MI.eraseFromParent();
1214  break;
1215  }
1216  case AMDGPU::V_SET_INACTIVE_B32: {
1217  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1218  .addReg(AMDGPU::EXEC);
1219  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1220  .add(MI.getOperand(2));
1221  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1222  .addReg(AMDGPU::EXEC);
1223  MI.eraseFromParent();
1224  break;
1225  }
1226  case AMDGPU::V_SET_INACTIVE_B64: {
1227  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1228  .addReg(AMDGPU::EXEC);
1229  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1230  MI.getOperand(0).getReg())
1231  .add(MI.getOperand(2));
1232  expandPostRAPseudo(*Copy);
1233  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1234  .addReg(AMDGPU::EXEC);
1235  MI.eraseFromParent();
1236  break;
1237  }
1238  case AMDGPU::V_MOVRELD_B32_V1:
1239  case AMDGPU::V_MOVRELD_B32_V2:
1240  case AMDGPU::V_MOVRELD_B32_V4:
1241  case AMDGPU::V_MOVRELD_B32_V8:
1242  case AMDGPU::V_MOVRELD_B32_V16: {
1243  const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1244  unsigned VecReg = MI.getOperand(0).getReg();
1245  bool IsUndef = MI.getOperand(1).isUndef();
1246  unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1247  assert(VecReg == MI.getOperand(1).getReg());
1248 
1249  MachineInstr *MovRel =
1250  BuildMI(MBB, MI, DL, MovRelDesc)
1251  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1252  .add(MI.getOperand(2))
1253  .addReg(VecReg, RegState::ImplicitDefine)
1254  .addReg(VecReg,
1255  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1256 
1257  const int ImpDefIdx =
1258  MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1259  const int ImpUseIdx = ImpDefIdx + 1;
1260  MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1261 
1262  MI.eraseFromParent();
1263  break;
1264  }
1265  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1266  MachineFunction &MF = *MBB.getParent();
1267  unsigned Reg = MI.getOperand(0).getReg();
1268  unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1269  unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1270 
1271  // Create a bundle so these instructions won't be re-ordered by the
1272  // post-RA scheduler.
1273  MIBundleBuilder Bundler(MBB, MI);
1274  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1275 
1276  // Add 32-bit offset from this instruction to the start of the
1277  // constant data.
1278  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1279  .addReg(RegLo)
1280  .add(MI.getOperand(1)));
1281 
1282  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1283  .addReg(RegHi);
1284  if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1285  MIB.addImm(0);
1286  else
1287  MIB.add(MI.getOperand(2));
1288 
1289  Bundler.append(MIB);
1290  finalizeBundle(MBB, Bundler.begin());
1291 
1292  MI.eraseFromParent();
1293  break;
1294  }
1295  case AMDGPU::EXIT_WWM: {
1296  // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1297  // is exited.
1298  MI.setDesc(get(AMDGPU::S_MOV_B64));
1299  break;
1300  }
1301  case TargetOpcode::BUNDLE: {
1302  if (!MI.mayLoad())
1303  return false;
1304 
1305  // If it is a load it must be a memory clause
1306  for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1307  I->isBundledWithSucc(); ++I) {
1308  I->unbundleFromSucc();
1309  for (MachineOperand &MO : I->operands())
1310  if (MO.isReg())
1311  MO.setIsInternalRead(false);
1312  }
1313 
1314  MI.eraseFromParent();
1315  break;
1316  }
1317  }
1318  return true;
1319 }
1320 
1321 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1322  MachineOperand &Src0,
1323  unsigned Src0OpName,
1324  MachineOperand &Src1,
1325  unsigned Src1OpName) const {
1326  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1327  if (!Src0Mods)
1328  return false;
1329 
1330  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1331  assert(Src1Mods &&
1332  "All commutable instructions have both src0 and src1 modifiers");
1333 
1334  int Src0ModsVal = Src0Mods->getImm();
1335  int Src1ModsVal = Src1Mods->getImm();
1336 
1337  Src1Mods->setImm(Src0ModsVal);
1338  Src0Mods->setImm(Src1ModsVal);
1339  return true;
1340 }
1341 
1342 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1343  MachineOperand &RegOp,
1344  MachineOperand &NonRegOp) {
1345  unsigned Reg = RegOp.getReg();
1346  unsigned SubReg = RegOp.getSubReg();
1347  bool IsKill = RegOp.isKill();
1348  bool IsDead = RegOp.isDead();
1349  bool IsUndef = RegOp.isUndef();
1350  bool IsDebug = RegOp.isDebug();
1351 
1352  if (NonRegOp.isImm())
1353  RegOp.ChangeToImmediate(NonRegOp.getImm());
1354  else if (NonRegOp.isFI())
1355  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1356  else
1357  return nullptr;
1358 
1359  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1360  NonRegOp.setSubReg(SubReg);
1361 
1362  return &MI;
1363 }
1364 
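// Commute the two source operands of MI (register, immediate or frame index),
// swap their source modifiers, and switch to the opcode returned by
// commuteOpcode.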
1364 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI,
1365  bool NewMI,
1366  unsigned Src0Idx,
1367  unsigned Src1Idx) const {
1368  assert(!NewMI && "this should never be used");
1369 
1370  unsigned Opc = MI.getOpcode();
1371  int CommutedOpcode = commuteOpcode(Opc);
1372  if (CommutedOpcode == -1)
1373  return nullptr;
1374 
1375  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1376  static_cast<int>(Src0Idx) &&
1377  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1378  static_cast<int>(Src1Idx) &&
1379  "inconsistency with findCommutedOpIndices");
1380 
1381  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1382  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1383 
1384  MachineInstr *CommutedMI = nullptr;
1385  if (Src0.isReg() && Src1.isReg()) {
1386  if (isOperandLegal(MI, Src1Idx, &Src0)) {
1387  // Be sure to copy the source modifiers to the right place.
1388  CommutedMI
1389  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1390  }
1391 
1392  } else if (Src0.isReg() && !Src1.isReg()) {
1393  // src0 should always be able to support any operand type, so no need to
1394  // check operand legality.
1395  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1396  } else if (!Src0.isReg() && Src1.isReg()) {
1397  if (isOperandLegal(MI, Src1Idx, &Src0))
1398  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1399  } else {
1400  // FIXME: Found two non registers to commute. This does happen.
1401  return nullptr;
1402  }
1403 
1404  if (CommutedMI) {
1405  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1406  Src1, AMDGPU::OpName::src1_modifiers);
1407 
1408  CommutedMI->setDesc(get(CommutedOpcode));
1409  }
1410 
1411  return CommutedMI;
1412 }
1413 
1414 // This needs to be implemented because the source modifiers may be inserted
1415 // between the true commutable operands, and the base
1416 // TargetInstrInfo::commuteInstruction uses it.
1417 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1418  unsigned &SrcOpIdx1) const {
1419  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
1420 }
1421 
1422 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
1423  unsigned &SrcOpIdx1) const {
1424  if (!Desc.isCommutable())
1425  return false;
1426 
1427  unsigned Opc = Desc.getOpcode();
1428  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1429  if (Src0Idx == -1)
1430  return false;
1431 
1432  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1433  if (Src1Idx == -1)
1434  return false;
1435 
1436  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1437 }
1438 
1439 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1440  int64_t BrOffset) const {
1441  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1442  // block is unanalyzable.
1443  assert(BranchOp != AMDGPU::S_SETPC_B64);
1444 
1445  // Convert to dwords.
1446  BrOffset /= 4;
1447 
1448  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1449  // from the next instruction.
1450  BrOffset -= 1;
1451 
1452  return isIntN(BranchOffsetBits, BrOffset);
1453 }
1454 
1455 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1456  const MachineInstr &MI) const {
1457  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1458  // This would be a difficult analysis to perform, but can always be legal so
1459  // there's no need to analyze it.
1460  return nullptr;
1461  }
1462 
1463  return MI.getOperand(0).getMBB();
1464 }
1465 
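// Expand an out-of-range unconditional branch into s_getpc_b64 followed by a
// 64-bit add or subtract of the block offset and s_setpc_b64, scavenging an
// SGPR pair for the computed address.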
1466 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1467  MachineBasicBlock &DestBB,
1468  const DebugLoc &DL,
1469  int64_t BrOffset,
1470  RegScavenger *RS) const {
1471  assert(RS && "RegScavenger required for long branching");
1472  assert(MBB.empty() &&
1473  "new block should be inserted for expanding unconditional branch");
1474  assert(MBB.pred_size() == 1);
1475 
1476  MachineFunction *MF = MBB.getParent();
1477  MachineRegisterInfo &MRI = MF->getRegInfo();
1478 
1479  // FIXME: Virtual register workaround for RegScavenger not working with empty
1480  // blocks.
1481  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1482 
1483  auto I = MBB.end();
1484 
1485  // We need to compute the offset relative to the instruction immediately after
1486  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1487  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1488 
1489  // TODO: Handle > 32-bit block address.
1490  if (BrOffset >= 0) {
1491  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1492  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1493  .addReg(PCReg, 0, AMDGPU::sub0)
1495  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1496  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1497  .addReg(PCReg, 0, AMDGPU::sub1)
1498  .addImm(0);
1499  } else {
1500  // Backwards branch.
1501  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1502  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1503  .addReg(PCReg, 0, AMDGPU::sub0)
1505  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1506  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1507  .addReg(PCReg, 0, AMDGPU::sub1)
1508  .addImm(0);
1509  }
1510 
1511  // Insert the indirect branch after the other terminator.
1512  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1513  .addReg(PCReg);
1514 
1515  // FIXME: If spilling is necessary, this will fail because this scavenger has
1516  // no emergency stack slots. It is non-trivial to spill in this situation,
1517  // because the restore code needs to be specially placed after the
1518  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1519  // block.
1520  //
1521  // If a spill is needed for the pc register pair, we need to insert a spill
1522  // restore block right before the destination block, and insert a short branch
1523  // into the old destination block's fallthrough predecessor.
1524  // e.g.:
1525  //
1526  // s_cbranch_scc0 skip_long_branch:
1527  //
1528  // long_branch_bb:
1529  // spill s[8:9]
1530  // s_getpc_b64 s[8:9]
1531  // s_add_u32 s8, s8, restore_bb
1532  // s_addc_u32 s9, s9, 0
1533  // s_setpc_b64 s[8:9]
1534  //
1535  // skip_long_branch:
1536  // foo;
1537  //
1538  // .....
1539  //
1540  // dest_bb_fallthrough_predecessor:
1541  // bar;
1542  // s_branch dest_bb
1543  //
1544  // restore_bb:
1545  // restore s[8:9]
1546  // fallthrough dest_bb
1547  ///
1548  // dest_bb:
1549  // buzz;
1550 
1551  RS->enterBasicBlockEnd(MBB);
1552  unsigned Scav = RS->scavengeRegisterBackwards(
1553  AMDGPU::SReg_64RegClass,
1554  MachineBasicBlock::iterator(GetPC), false, 0);
1555  MRI.replaceRegWith(PCReg, Scav);
1556  MRI.clearVirtRegs();
1557  RS->setRegUsed(Scav);
1558 
1559  return 4 + 8 + 4 + 4;
1560 }
1561 
1562 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1563  switch (Cond) {
1564  case SIInstrInfo::SCC_TRUE:
1565  return AMDGPU::S_CBRANCH_SCC1;
1566  case SIInstrInfo::SCC_FALSE:
1567  return AMDGPU::S_CBRANCH_SCC0;
1568  case SIInstrInfo::VCCNZ:
1569  return AMDGPU::S_CBRANCH_VCCNZ;
1570  case SIInstrInfo::VCCZ:
1571  return AMDGPU::S_CBRANCH_VCCZ;
1572  case SIInstrInfo::EXECNZ:
1573  return AMDGPU::S_CBRANCH_EXECNZ;
1574  case SIInstrInfo::EXECZ:
1575  return AMDGPU::S_CBRANCH_EXECZ;
1576  default:
1577  llvm_unreachable("invalid branch predicate");
1578  }
1579 }
1580 
1581 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1582  switch (Opcode) {
1583  case AMDGPU::S_CBRANCH_SCC0:
1584  return SCC_FALSE;
1585  case AMDGPU::S_CBRANCH_SCC1:
1586  return SCC_TRUE;
1587  case AMDGPU::S_CBRANCH_VCCNZ:
1588  return VCCNZ;
1589  case AMDGPU::S_CBRANCH_VCCZ:
1590  return VCCZ;
1591  case AMDGPU::S_CBRANCH_EXECNZ:
1592  return EXECNZ;
1593  case AMDGPU::S_CBRANCH_EXECZ:
1594  return EXECZ;
1595  default:
1596  return INVALID_BR;
1597  }
1598 }
1599 
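// Analyze the terminators starting at I: record an unconditional S_BRANCH
// target in TBB, or a conditional branch target in TBB/FBB with its predicate
// and condition register in Cond; returns true if the branch sequence cannot
// be analyzed.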
1600 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1601  MachineBasicBlock::iterator I,
1602  MachineBasicBlock *&TBB,
1603  MachineBasicBlock *&FBB,
1604  SmallVectorImpl<MachineOperand> &Cond,
1605  bool AllowModify) const {
1606  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1607  // Unconditional Branch
1608  TBB = I->getOperand(0).getMBB();
1609  return false;
1610  }
1611 
1612  MachineBasicBlock *CondBB = nullptr;
1613 
1614  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1615  CondBB = I->getOperand(1).getMBB();
1616  Cond.push_back(I->getOperand(0));
1617  } else {
1618  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1619  if (Pred == INVALID_BR)
1620  return true;
1621 
1622  CondBB = I->getOperand(0).getMBB();
1623  Cond.push_back(MachineOperand::CreateImm(Pred));
1624  Cond.push_back(I->getOperand(1)); // Save the branch register.
1625  }
1626  ++I;
1627 
1628  if (I == MBB.end()) {
1629  // Conditional branch followed by fall-through.
1630  TBB = CondBB;
1631  return false;
1632  }
1633 
1634  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1635  TBB = CondBB;
1636  FBB = I->getOperand(0).getMBB();
1637  return false;
1638  }
1639 
1640  return true;
1641 }
1642 
1643 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1644  MachineBasicBlock *&FBB,
1645  SmallVectorImpl<MachineOperand> &Cond,
1646  bool AllowModify) const {
1647  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1648  auto E = MBB.end();
1649  if (I == E)
1650  return false;
1651 
1652  // Skip over the instructions that are artificially terminators for special
1653  // exec management.
1654  while (I != E && !I->isBranch() && !I->isReturn() &&
1655  I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
1656  switch (I->getOpcode()) {
1657  case AMDGPU::SI_MASK_BRANCH:
1658  case AMDGPU::S_MOV_B64_term:
1659  case AMDGPU::S_XOR_B64_term:
1660  case AMDGPU::S_ANDN2_B64_term:
1661  break;
1662  case AMDGPU::SI_IF:
1663  case AMDGPU::SI_ELSE:
1664  case AMDGPU::SI_KILL_I1_TERMINATOR:
1665  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1666  // FIXME: It's messy that these need to be considered here at all.
1667  return true;
1668  default:
1669  llvm_unreachable("unexpected non-branch terminator inst");
1670  }
1671 
1672  ++I;
1673  }
1674 
1675  if (I == E)
1676  return false;
1677 
1678  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1679  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1680 
1681  ++I;
1682 
1683  // TODO: Should be able to treat as fallthrough?
1684  if (I == MBB.end())
1685  return true;
1686 
1687  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1688  return true;
1689 
1690  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1691 
1692  // Specifically handle the case where the conditional branch is to the same
1693  // destination as the mask branch. e.g.
1694  //
1695  // si_mask_branch BB8
1696  // s_cbranch_execz BB8
1697  // s_cbranch BB9
1698  //
1699  // This is required to understand divergent loops which may need the branches
1700  // to be relaxed.
1701  if (TBB != MaskBrDest || Cond.empty())
1702  return true;
1703 
1704  auto Pred = Cond[0].getImm();
1705  return (Pred != EXECZ && Pred != EXECNZ);
1706 }
1707 
1708 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1709  int *BytesRemoved) const {
1710  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1711 
1712  unsigned Count = 0;
1713  unsigned RemovedSize = 0;
1714  while (I != MBB.end()) {
1715  MachineBasicBlock::iterator Next = std::next(I);
1716  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1717  I = Next;
1718  continue;
1719  }
1720 
1721  RemovedSize += getInstSizeInBytes(*I);
1722  I->eraseFromParent();
1723  ++Count;
1724  I = Next;
1725  }
1726 
1727  if (BytesRemoved)
1728  *BytesRemoved = RemovedSize;
1729 
1730  return Count;
1731 }
1732 
1733 // Copy the flags onto the implicit condition register operand.
1734 static void preserveCondRegFlags(MachineOperand &CondReg,
1735  const MachineOperand &OrigCond) {
1736  CondReg.setIsUndef(OrigCond.isUndef());
1737  CondReg.setIsKill(OrigCond.isKill());
1738 }
1739 
1740 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1741  MachineBasicBlock *TBB,
1742  MachineBasicBlock *FBB,
1743  ArrayRef<MachineOperand> Cond,
1744  const DebugLoc &DL,
1745  int *BytesAdded) const {
1746  if (!FBB && Cond.empty()) {
1747  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1748  .addMBB(TBB);
1749  if (BytesAdded)
1750  *BytesAdded = 4;
1751  return 1;
1752  }
1753 
1754  if (Cond.size() == 1 && Cond[0].isReg()) {
1755  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1756  .add(Cond[0])
1757  .addMBB(TBB);
1758  return 1;
1759  }
1760 
1761  assert(TBB && Cond[0].isImm());
1762 
1763  unsigned Opcode
1764  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1765 
1766  if (!FBB) {
1767  Cond[1].isUndef();
1768  MachineInstr *CondBr =
1769  BuildMI(&MBB, DL, get(Opcode))
1770  .addMBB(TBB);
1771 
1772  // Copy the flags onto the implicit condition register operand.
1773  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1774 
1775  if (BytesAdded)
1776  *BytesAdded = 4;
1777  return 1;
1778  }
1779 
1780  assert(TBB && FBB);
1781 
1782  MachineInstr *CondBr =
1783  BuildMI(&MBB, DL, get(Opcode))
1784  .addMBB(TBB);
1785  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1786  .addMBB(FBB);
1787 
1788  MachineOperand &CondReg = CondBr->getOperand(1);
1789  CondReg.setIsUndef(Cond[1].isUndef());
1790  CondReg.setIsKill(Cond[1].isKill());
1791 
1792  if (BytesAdded)
1793  *BytesAdded = 8;
1794 
1795  return 2;
1796 }
1797 
1798 bool SIInstrInfo::reverseBranchCondition(
1799  SmallVectorImpl<MachineOperand> &Cond) const {
1800  if (Cond.size() != 2) {
1801  return true;
1802  }
1803 
1804  if (Cond[0].isImm()) {
1805  Cond[0].setImm(-Cond[0].getImm());
1806  return false;
1807  }
1808 
1809  return true;
1810 }
1811 
1812 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1813  ArrayRef<MachineOperand> Cond,
1814  unsigned TrueReg, unsigned FalseReg,
1815  int &CondCycles,
1816  int &TrueCycles, int &FalseCycles) const {
1817  switch (Cond[0].getImm()) {
1818  case VCCNZ:
1819  case VCCZ: {
1820  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1821  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1822  assert(MRI.getRegClass(FalseReg) == RC);
1823 
1824  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1825  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1826 
1827  // Limit to equal cost for branch vs. N v_cndmask_b32s.
1828  return !RI.isSGPRClass(RC) && NumInsts <= 6;
1829  }
1830  case SCC_TRUE:
1831  case SCC_FALSE: {
1832  // FIXME: We could insert for VGPRs if we could replace the original compare
1833  // with a vector one.
1834  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1835  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1836  assert(MRI.getRegClass(FalseReg) == RC);
1837 
1838  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1839 
1840  // Multiples of 8 can do s_cselect_b64
1841  if (NumInsts % 2 == 0)
1842  NumInsts /= 2;
1843 
1844  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1845  return RI.isSGPRClass(RC);
1846  }
1847  default:
1848  return false;
1849  }
1850 }
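// Worked example of the cost model above (illustrative): selecting between two
// 128-bit VGPR values needs four v_cndmask_b32 instructions (128 / 32), while
// selecting between two 128-bit SGPR values on SCC needs only two
// s_cselect_b64 instructions, since each one handles 64 bits.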
1851 
1852 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1853  MachineBasicBlock::iterator I, const DebugLoc &DL,
1854  unsigned DstReg, ArrayRef<MachineOperand> Cond,
1855  unsigned TrueReg, unsigned FalseReg) const {
1856  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1857  if (Pred == VCCZ || Pred == SCC_FALSE) {
1858  Pred = static_cast<BranchPredicate>(-Pred);
1859  std::swap(TrueReg, FalseReg);
1860  }
1861 
1862  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1863  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1864  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1865 
1866  if (DstSize == 32) {
1867  unsigned SelOp = Pred == SCC_TRUE ?
1868  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1869 
1870  // Instruction's operands are backwards from what is expected.
1871  MachineInstr *Select =
1872  BuildMI(MBB, I, DL, get(SelOp), DstReg)
1873  .addReg(FalseReg)
1874  .addReg(TrueReg);
1875 
1876  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1877  return;
1878  }
1879 
1880  if (DstSize == 64 && Pred == SCC_TRUE) {
1881  MachineInstr *Select =
1882  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1883  .addReg(FalseReg)
1884  .addReg(TrueReg);
1885 
1886  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1887  return;
1888  }
1889 
1890  static const int16_t Sub0_15[] = {
1891  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1892  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1893  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1894  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1895  };
1896 
1897  static const int16_t Sub0_15_64[] = {
1898  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1899  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1900  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1901  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1902  };
1903 
1904  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1905  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1906  const int16_t *SubIndices = Sub0_15;
1907  int NElts = DstSize / 32;
1908 
1909  // 64-bit select is only available for SALU.
1910  if (Pred == SCC_TRUE) {
1911  SelOp = AMDGPU::S_CSELECT_B64;
1912  EltRC = &AMDGPU::SGPR_64RegClass;
1913  SubIndices = Sub0_15_64;
1914 
1915  assert(NElts % 2 == 0);
1916  NElts /= 2;
1917  }
1918 
1919  MachineInstrBuilder MIB = BuildMI(
1920  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1921 
1922  I = MIB->getIterator();
1923 
1924  SmallVector<unsigned, 8> Regs;
1925  for (int Idx = 0; Idx != NElts; ++Idx) {
1926  unsigned DstElt = MRI.createVirtualRegister(EltRC);
1927  Regs.push_back(DstElt);
1928 
1929  unsigned SubIdx = SubIndices[Idx];
1930 
1931  MachineInstr *Select =
1932  BuildMI(MBB, I, DL, get(SelOp), DstElt)
1933  .addReg(FalseReg, 0, SubIdx)
1934  .addReg(TrueReg, 0, SubIdx);
1935  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1936 
1937  MIB.addReg(DstElt)
1938  .addImm(SubIdx);
1939  }
1940 }
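// Illustrative expansion (rough MIR sketch, register names assumed): a 64-bit
// VGPR select on a VCC condition becomes one v_cndmask_b32 per 32-bit
// sub-register, stitched back together with REG_SEQUENCE:
//
//   %lo:vgpr_32 = V_CNDMASK_B32_e32 %false.sub0, %true.sub0, implicit $vcc
//   %hi:vgpr_32 = V_CNDMASK_B32_e32 %false.sub1, %true.sub1, implicit $vcc
//   %dst:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1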
1941 
1942 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1943  switch (MI.getOpcode()) {
1944  case AMDGPU::V_MOV_B32_e32:
1945  case AMDGPU::V_MOV_B32_e64:
1946  case AMDGPU::V_MOV_B64_PSEUDO: {
1947  // If there are additional implicit register operands, this may be used for
1948  // register indexing so the source register operand isn't simply copied.
1949  unsigned NumOps = MI.getDesc().getNumOperands() +
1950  MI.getDesc().getNumImplicitUses();
1951 
1952  return MI.getNumOperands() == NumOps;
1953  }
1954  case AMDGPU::S_MOV_B32:
1955  case AMDGPU::S_MOV_B64:
1956  case AMDGPU::COPY:
1957  return true;
1958  default:
1959  return false;
1960  }
1961 }
1962 
1964  unsigned Kind) const {
1965  switch(Kind) {
1976  }
1977  return AMDGPUAS::FLAT_ADDRESS;
1978 }
1979 
1980 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
1981  unsigned Opc = MI.getOpcode();
1982  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1983  AMDGPU::OpName::src0_modifiers);
1984  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1985  AMDGPU::OpName::src1_modifiers);
1986  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1987  AMDGPU::OpName::src2_modifiers);
1988 
1989  MI.RemoveOperand(Src2ModIdx);
1990  MI.RemoveOperand(Src1ModIdx);
1991  MI.RemoveOperand(Src0ModIdx);
1992 }
1993 
1994 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1995  unsigned Reg, MachineRegisterInfo *MRI) const {
1996  if (!MRI->hasOneNonDBGUse(Reg))
1997  return false;
1998 
1999  switch (DefMI.getOpcode()) {
2000  default:
2001  return false;
2002  case AMDGPU::S_MOV_B64:
2003  // TODO: We could fold 64-bit immediates, but this gets complicated
2004  // when there are sub-registers.
2005  return false;
2006 
2007  case AMDGPU::V_MOV_B32_e32:
2008  case AMDGPU::S_MOV_B32:
2009  break;
2010  }
2011 
2012  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2013  assert(ImmOp);
2014  // FIXME: We could handle FrameIndex values here.
2015  if (!ImmOp->isImm())
2016  return false;
2017 
2018  unsigned Opc = UseMI.getOpcode();
2019  if (Opc == AMDGPU::COPY) {
2020  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
2021  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2022  UseMI.setDesc(get(NewOpc));
2023  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2024  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2025  return true;
2026  }
2027 
2028  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2029  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2030  // Don't fold if we are using source or output modifiers. The new VOP2
2031  // instructions don't have them.
2032  if (hasAnyModifiersSet(UseMI))
2033  return false;
2034 
2035  // If this is a free constant, there's no reason to do this.
2036  // TODO: We could fold this here instead of letting SIFoldOperands do it
2037  // later.
2038  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2039 
2040  // Any src operand can be used for the legality check.
2041  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2042  return false;
2043 
2044  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2045  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2046  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2047 
2048  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2049  // We should only expect these to be on src0 due to canonicalizations.
2050  if (Src0->isReg() && Src0->getReg() == Reg) {
2051  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2052  return false;
2053 
2054  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2055  return false;
2056 
2057  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2058 
2059  const int64_t Imm = ImmOp->getImm();
2060 
2061  // FIXME: This would be a lot easier if we could return a new instruction
2062  // instead of having to modify in place.
2063 
2064  // Remove these first since they are at the end.
2065  UseMI.RemoveOperand(
2066  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2067  UseMI.RemoveOperand(
2068  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2069 
2070  unsigned Src1Reg = Src1->getReg();
2071  unsigned Src1SubReg = Src1->getSubReg();
2072  Src0->setReg(Src1Reg);
2073  Src0->setSubReg(Src1SubReg);
2074  Src0->setIsKill(Src1->isKill());
2075 
2076  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2077  Opc == AMDGPU::V_MAC_F16_e64)
2078  UseMI.untieRegOperand(
2079  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2080 
2081  Src1->ChangeToImmediate(Imm);
2082 
2083  removeModOperands(UseMI);
2084  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2085 
2086  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2087  if (DeleteDef)
2088  DefMI.eraseFromParent();
2089 
2090  return true;
2091  }
2092 
2093  // Added part is the constant: Use v_madak_{f16, f32}.
2094  if (Src2->isReg() && Src2->getReg() == Reg) {
2095  // Not allowed to use constant bus for another operand.
2096  // We can however allow an inline immediate as src0.
2097  bool Src0Inlined = false;
2098  if (Src0->isReg()) {
2099  // Try to inline constant if possible.
2100  // If the def is a move-immediate and this is its only use,
2101  // folding it here saves a VGPR.
2102  MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2103  if (Def && Def->isMoveImmediate() &&
2104  isInlineConstant(Def->getOperand(1)) &&
2105  MRI->hasOneUse(Src0->getReg())) {
2106  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2107  Src0Inlined = true;
2108  } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
2109  RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
2110  (RI.isVirtualRegister(Src0->getReg()) &&
2111  RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2112  return false;
2113  // VGPR is okay as Src0 - fallthrough
2114  }
2115 
2116  if (Src1->isReg() && !Src0Inlined) {
2117  // We have one slot for inlinable constant so far - try to fill it
2118  MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2119  if (Def && Def->isMoveImmediate() &&
2120  isInlineConstant(Def->getOperand(1)) &&
2121  MRI->hasOneUse(Src1->getReg()) &&
2122  commuteInstruction(UseMI)) {
2123  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2124  } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
2125  RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2126  (RI.isVirtualRegister(Src1->getReg()) &&
2127  RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2128  return false;
2129  // VGPR is okay as Src1 - fallthrough
2130  }
2131 
2132  const int64_t Imm = ImmOp->getImm();
2133 
2134  // FIXME: This would be a lot easier if we could return a new instruction
2135  // instead of having to modify in place.
2136 
2137  // Remove these first since they are at the end.
2138  UseMI.RemoveOperand(
2139  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2140  UseMI.RemoveOperand(
2141  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2142 
2143  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2144  Opc == AMDGPU::V_MAC_F16_e64)
2145  UseMI.untieRegOperand(
2146  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2147 
2148  // ChangeToImmediate() adds Src2 back to the instruction.
2149  Src2->ChangeToImmediate(Imm);
2150 
2151  // These come before src2.
2152  removeModOperands(UseMI);
2153  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2154 
2155  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2156  if (DeleteDef)
2157  DefMI.eraseFromParent();
2158 
2159  return true;
2160  }
2161  }
2162 
2163  return false;
2164 }
2165 
2166 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2167  int WidthB, int OffsetB) {
2168  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2169  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2170  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2171  return LowOffset + LowWidth <= HighOffset;
2172 }
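// Worked example (illustrative): accesses at (Offset 0, Width 8) and
// (Offset 8, Width 4) do not overlap because 0 + 8 <= 8, while (Offset 0,
// Width 8) and (Offset 4, Width 4) do overlap because 0 + 8 > 4.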
2173 
2174 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2175  MachineInstr &MIb) const {
2176  MachineOperand *BaseOp0, *BaseOp1;
2177  int64_t Offset0, Offset1;
2178 
2179  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
2180  getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
2181  if (!BaseOp0->isIdenticalTo(*BaseOp1))
2182  return false;
2183 
2184  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2185  // FIXME: Handle ds_read2 / ds_write2.
2186  return false;
2187  }
2188  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2189  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2190  if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2191  return true;
2192  }
2193  }
2194 
2195  return false;
2196 }
2197 
2198 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2199  MachineInstr &MIb,
2200  AliasAnalysis *AA) const {
2201  assert((MIa.mayLoad() || MIa.mayStore()) &&
2202  "MIa must load from or modify a memory location");
2203  assert((MIb.mayLoad() || MIb.mayStore()) &&
2204  "MIb must load from or modify a memory location");
2205 
2206  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2207  return false;
2208 
2209  // XXX - Can we relax this between address spaces?
2210  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2211  return false;
2212 
2213  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2214  const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2215  const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2216  if (MMOa->getValue() && MMOb->getValue()) {
2217  MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2218  MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2219  if (!AA->alias(LocA, LocB))
2220  return true;
2221  }
2222  }
2223 
2224  // TODO: Should we check the address space from the MachineMemOperand? That
2225  // would allow us to distinguish objects we know don't alias based on the
2226  // underlying address space, even if it was lowered to a different one,
2227  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2228  // buffer.
2229  if (isDS(MIa)) {
2230  if (isDS(MIb))
2231  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2232 
2233  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2234  }
2235 
2236  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2237  if (isMUBUF(MIb) || isMTBUF(MIb))
2238  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2239 
2240  return !isFLAT(MIb) && !isSMRD(MIb);
2241  }
2242 
2243  if (isSMRD(MIa)) {
2244  if (isSMRD(MIb))
2245  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2246 
2247  return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
2248  }
2249 
2250  if (isFLAT(MIa)) {
2251  if (isFLAT(MIb))
2252  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2253 
2254  return false;
2255  }
2256 
2257  return false;
2258 }
2259 
2260 static int64_t getFoldableImm(const MachineOperand* MO) {
2261  if (!MO->isReg())
2262  return false;
2263  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2264  const MachineRegisterInfo &MRI = MF->getRegInfo();
2265  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2266  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2267  Def->getOperand(1).isImm())
2268  return Def->getOperand(1).getImm();
2269  return AMDGPU::NoRegister;
2270 }
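// Note: both early exits above evaluate to 0 (false and AMDGPU::NoRegister),
// so callers that test the result with "if (auto Imm = getFoldableImm(...))"
// treat a foldable immediate of 0 the same as "no foldable immediate".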
2271 
2272 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2273  MachineInstr &MI,
2274  LiveVariables *LV) const {
2275  unsigned Opc = MI.getOpcode();
2276  bool IsF16 = false;
2277  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2278 
2279  switch (Opc) {
2280  default:
2281  return nullptr;
2282  case AMDGPU::V_MAC_F16_e64:
2283  IsF16 = true;
2284  LLVM_FALLTHROUGH;
2285  case AMDGPU::V_MAC_F32_e64:
2286  case AMDGPU::V_FMAC_F32_e64:
2287  break;
2288  case AMDGPU::V_MAC_F16_e32:
2289  IsF16 = true;
2290  LLVM_FALLTHROUGH;
2291  case AMDGPU::V_MAC_F32_e32:
2292  case AMDGPU::V_FMAC_F32_e32: {
2293  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2294  AMDGPU::OpName::src0);
2295  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2296  if (!Src0->isReg() && !Src0->isImm())
2297  return nullptr;
2298 
2299  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2300  return nullptr;
2301 
2302  break;
2303  }
2304  }
2305 
2306  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2307  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2308  const MachineOperand *Src0Mods =
2309  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2310  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2311  const MachineOperand *Src1Mods =
2312  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2313  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2314  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2315  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2316 
2317  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2318  // If we have an SGPR input, we will violate the constant bus restriction.
2319  (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2320  if (auto Imm = getFoldableImm(Src2)) {
2321  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2322  get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2323  .add(*Dst)
2324  .add(*Src0)
2325  .add(*Src1)
2326  .addImm(Imm);
2327  }
2328  if (auto Imm = getFoldableImm(Src1)) {
2329  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2330  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2331  .add(*Dst)
2332  .add(*Src0)
2333  .addImm(Imm)
2334  .add(*Src2);
2335  }
2336  if (auto Imm = getFoldableImm(Src0)) {
2337  if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2338  AMDGPU::OpName::src0), Src1))
2339  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2340  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2341  .add(*Dst)
2342  .add(*Src1)
2343  .addImm(Imm)
2344  .add(*Src2);
2345  }
2346  }
2347 
2348  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2349  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2350  (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2351  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2352  .add(*Dst)
2353  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2354  .add(*Src0)
2355  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2356  .add(*Src1)
2357  .addImm(0) // Src mods
2358  .add(*Src2)
2359  .addImm(Clamp ? Clamp->getImm() : 0)
2360  .addImm(Omod ? Omod->getImm() : 0);
2361 }
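// Illustrative sketch of the conversion (rough MIR, register names assumed):
//
//   %d = V_MAC_F32_e32 %a, %b, %d (tied)        ; two-address accumulate
//   =>  %d2 = V_MAD_F32 0, %a, 0, %b, 0, %d, 0, 0
//
// and, when one source is defined by a v_mov_b32 of a plain immediate:
//
//   =>  %d2 = V_MADAK_F32 %a, %b, <imm>         ; added constant (src2 folded)
//   =>  %d2 = V_MADMK_F32 %a, <imm>, %d         ; multiplied constant (src1 folded)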
2362 
2363 // It's not generally safe to move VALU instructions across these, since doing so
2364 // will start using the register as a base index rather than accessing it directly.
2365 // XXX - Why isn't hasSideEffects sufficient for these?
2366 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2367  switch (MI.getOpcode()) {
2368  case AMDGPU::S_SET_GPR_IDX_ON:
2369  case AMDGPU::S_SET_GPR_IDX_MODE:
2370  case AMDGPU::S_SET_GPR_IDX_OFF:
2371  return true;
2372  default:
2373  return false;
2374  }
2375 }
2376 
2377 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2378  const MachineBasicBlock *MBB,
2379  const MachineFunction &MF) const {
2380  // XXX - Do we want the SP check in the base implementation?
2381 
2382  // Target-independent instructions do not have an implicit-use of EXEC, even
2383  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2384  // boundaries prevents incorrect movements of such instructions.
2385  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2386  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2387  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2388  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2389  changesVGPRIndexingMode(MI);
2390 }
2391 
2392 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
2393  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
2394  Opcode == AMDGPU::DS_GWS_INIT ||
2395  Opcode == AMDGPU::DS_GWS_SEMA_V ||
2396  Opcode == AMDGPU::DS_GWS_SEMA_BR ||
2397  Opcode == AMDGPU::DS_GWS_SEMA_P ||
2398  Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
2399  Opcode == AMDGPU::DS_GWS_BARRIER;
2400 }
2401 
2402 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2403  unsigned Opcode = MI.getOpcode();
2404 
2405  if (MI.mayStore() && isSMRD(MI))
2406  return true; // scalar store or atomic
2407 
2408  // These instructions cause shader I/O that may cause hardware lockups
2409  // when executed with an empty EXEC mask.
2410  //
2411  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2412  // EXEC = 0, but checking for that case here seems not worth it
2413  // given the typical code patterns.
2414  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2415  Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2416  Opcode == AMDGPU::DS_ORDERED_COUNT)
2417  return true;
2418 
2419  if (MI.isInlineAsm())
2420  return true; // conservative assumption
2421 
2422  // These are like SALU instructions in terms of effects, so it's questionable
2423  // whether we should return true for those.
2424  //
2425  // However, executing them with EXEC = 0 causes them to operate on undefined
2426  // data, which we avoid by returning true here.
2427  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2428  return true;
2429 
2430  return false;
2431 }
2432 
2433 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2434  switch (Imm.getBitWidth()) {
2435  case 32:
2436  return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2437  ST.hasInv2PiInlineImm());
2438  case 64:
2439  return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2440  ST.hasInv2PiInlineImm());
2441  case 16:
2442  return ST.has16BitInsts() &&
2443  AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2444  ST.hasInv2PiInlineImm());
2445  default:
2446  llvm_unreachable("invalid bitwidth");
2447  }
2448 }
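// For reference (illustrative, not exhaustive): the hardware inline constants
// cover the integers -16..64 and the floating-point values 0.0, +-0.5, +-1.0,
// +-2.0 and +-4.0, plus 1/(2*pi) on subtargets reporting hasInv2PiInlineImm().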
2449 
2450 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2451  uint8_t OperandType) const {
2452  if (!MO.isImm() ||
2453  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2454  OperandType > AMDGPU::OPERAND_SRC_LAST)
2455  return false;
2456 
2457  // MachineOperand provides no way to tell the true operand size, since it only
2458  // records a 64-bit value. We need to know the size to determine if a 32-bit
2459  // floating point immediate bit pattern is legal for an integer immediate. It
2460  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2461 
2462  int64_t Imm = MO.getImm();
2463  switch (OperandType) {
2468  int32_t Trunc = static_cast<int32_t>(Imm);
2470  }
2476  ST.hasInv2PiInlineImm());
2481  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2482  // A few special case instructions have 16-bit operands on subtargets
2483  // where 16-bit instructions are not legal.
2484  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2485  // constants in these cases
2486  int16_t Trunc = static_cast<int16_t>(Imm);
2487  return ST.has16BitInsts() &&
2489  }
2490 
2491  return false;
2492  }
2495  if (isUInt<16>(Imm)) {
2496  int16_t Trunc = static_cast<int16_t>(Imm);
2497  return ST.has16BitInsts() &&
2499  }
2500  if (!(Imm & 0xffff)) {
2501  return ST.has16BitInsts() &&
2503  }
2504  uint32_t Trunc = static_cast<uint32_t>(Imm);
2506  }
2507  default:
2508  llvm_unreachable("invalid bitwidth");
2509  }
2510 }
2511 
2513  const MCOperandInfo &OpInfo) const {
2514  switch (MO.getType()) {
2516  return false;
2518  return !isInlineConstant(MO, OpInfo);
2524  return true;
2525  default:
2526  llvm_unreachable("unexpected operand type");
2527  }
2528 }
2529 
2530 static bool compareMachineOp(const MachineOperand &Op0,
2531  const MachineOperand &Op1) {
2532  if (Op0.getType() != Op1.getType())
2533  return false;
2534 
2535  switch (Op0.getType()) {
2536  case MachineOperand::MO_Register:
2537  return Op0.getReg() == Op1.getReg();
2538  case MachineOperand::MO_Immediate:
2539  return Op0.getImm() == Op1.getImm();
2540  default:
2541  llvm_unreachable("Didn't expect to be comparing these operand types");
2542  }
2543 }
2544 
2545 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2546  const MachineOperand &MO) const {
2547  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2548 
2549  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2550 
2551  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2552  return true;
2553 
2554  if (OpInfo.RegClass < 0)
2555  return false;
2556 
2557  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2558  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2559 
2560  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2561 }
2562 
2563 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2564  int Op32 = AMDGPU::getVOPe32(Opcode);
2565  if (Op32 == -1)
2566  return false;
2567 
2568  return pseudoToMCOpcode(Op32) != -1;
2569 }
2570 
2571 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2572  // The src0_modifier operand is present on all instructions
2573  // that have modifiers.
2574 
2575  return AMDGPU::getNamedOperandIdx(Opcode,
2576  AMDGPU::OpName::src0_modifiers) != -1;
2577 }
2578 
2579 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2580  unsigned OpName) const {
2581  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2582  return Mods && Mods->getImm();
2583 }
2584 
2585 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2586  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2587  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2588  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2589  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2590  hasModifiersSet(MI, AMDGPU::OpName::omod);
2591 }
2592 
2593 bool SIInstrInfo::canShrink(const MachineInstr &MI,
2594  const MachineRegisterInfo &MRI) const {
2595  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2596  // Can't shrink instruction with three operands.
2597  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
2598  // a special case for it. It can only be shrunk if the third operand
2599  // is vcc. We should handle this the same way we handle vopc, by adding
2600  // a register allocation hint pre-regalloc and then doing the shrinking
2601  // post-regalloc.
2602  if (Src2) {
2603  switch (MI.getOpcode()) {
2604  default: return false;
2605 
2606  case AMDGPU::V_ADDC_U32_e64:
2607  case AMDGPU::V_SUBB_U32_e64:
2608  case AMDGPU::V_SUBBREV_U32_e64: {
2609  const MachineOperand *Src1
2610  = getNamedOperand(MI, AMDGPU::OpName::src1);
2611  if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2612  return false;
2613  // Additional verification is needed for sdst/src2.
2614  return true;
2615  }
2616  case AMDGPU::V_MAC_F32_e64:
2617  case AMDGPU::V_MAC_F16_e64:
2618  case AMDGPU::V_FMAC_F32_e64:
2619  if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2620  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2621  return false;
2622  break;
2623 
2624  case AMDGPU::V_CNDMASK_B32_e64:
2625  break;
2626  }
2627  }
2628 
2629  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2630  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2631  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2632  return false;
2633 
2634  // We don't need to check src0; all input types are legal, so just make sure
2635  // src0 isn't using any modifiers.
2636  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2637  return false;
2638 
2639  // Can it be shrunk to a valid 32 bit opcode?
2640  if (!hasVALU32BitEncoding(MI.getOpcode()))
2641  return false;
2642 
2643  // Check output modifiers
2644  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2645  !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2646 }
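// Illustrative examples (assumed operands): a v_add_f32_e64 whose src1 is a
// VGPR and which has no source modifiers, clamp or omod can shrink to
// v_add_f32_e32, while one with an SGPR src1, or with any modifier set,
// must stay in the 64-bit encoding.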
2647 
2648 // Set VCC operand with all flags from \p Orig, except for setting it as
2649 // implicit.
2650 static void copyFlagsToImplicitVCC(MachineInstr &MI,
2651  const MachineOperand &Orig) {
2652 
2653  for (MachineOperand &Use : MI.implicit_operands()) {
2654  if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2655  Use.setIsUndef(Orig.isUndef());
2656  Use.setIsKill(Orig.isKill());
2657  return;
2658  }
2659  }
2660 }
2661 
2662 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
2663  unsigned Op32) const {
2664  MachineBasicBlock *MBB = MI.getParent();
2665  MachineInstrBuilder Inst32 =
2666  BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2667 
2668  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2669  // For VOPC instructions, this is replaced by an implicit def of vcc.
2670  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
2671  if (Op32DstIdx != -1) {
2672  // dst
2673  Inst32.add(MI.getOperand(0));
2674  } else {
2675  assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2676  "Unexpected case");
2677  }
2678 
2679  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
2680 
2681  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2682  if (Src1)
2683  Inst32.add(*Src1);
2684 
2685  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2686 
2687  if (Src2) {
2688  int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
2689  if (Op32Src2Idx != -1) {
2690  Inst32.add(*Src2);
2691  } else {
2692  // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
2693  // replaced with an implicit read of vcc. This was already added
2694  // during the initial BuildMI, so find it to preserve the flags.
2695  copyFlagsToImplicitVCC(*Inst32, *Src2);
2696  }
2697  }
2698 
2699  return Inst32;
2700 }
2701 
2702 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2703  const MachineOperand &MO,
2704  const MCOperandInfo &OpInfo) const {
2705  // Literal constants use the constant bus.
2706  //if (isLiteralConstantLike(MO, OpInfo))
2707  // return true;
2708  if (MO.isImm())
2709  return !isInlineConstant(MO, OpInfo);
2710 
2711  if (!MO.isReg())
2712  return true; // Misc other operands like FrameIndex
2713 
2714  if (!MO.isUse())
2715  return false;
2716 
2717  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2718  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2719 
2720  // FLAT_SCR is just an SGPR pair.
2721  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2722  return true;
2723 
2724  // EXEC register uses the constant bus.
2725  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2726  return true;
2727 
2728  // SGPRs use the constant bus
2729  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2730  (!MO.isImplicit() &&
2731  (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2732  AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2733 }
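// Illustrative examples (assumed operands): in "v_add_f32 v0, s0, v1" the
// SGPR s0 occupies the constant bus; a 32-bit literal such as 0x12345678 does
// as well, but a hardware inline constant (e.g. 1.0) does not, which is what
// the isInlineConstant() check above expresses.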
2734 
2735 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2736  for (const MachineOperand &MO : MI.implicit_operands()) {
2737  // We only care about reads.
2738  if (MO.isDef())
2739  continue;
2740 
2741  switch (MO.getReg()) {
2742  case AMDGPU::VCC:
2743  case AMDGPU::M0:
2744  case AMDGPU::FLAT_SCR:
2745  return MO.getReg();
2746 
2747  default:
2748  break;
2749  }
2750  }
2751 
2752  return AMDGPU::NoRegister;
2753 }
2754 
2755 static bool shouldReadExec(const MachineInstr &MI) {
2756  if (SIInstrInfo::isVALU(MI)) {
2757  switch (MI.getOpcode()) {
2758  case AMDGPU::V_READLANE_B32:
2759  case AMDGPU::V_READLANE_B32_si:
2760  case AMDGPU::V_READLANE_B32_vi:
2761  case AMDGPU::V_WRITELANE_B32:
2762  case AMDGPU::V_WRITELANE_B32_si:
2763  case AMDGPU::V_WRITELANE_B32_vi:
2764  return false;
2765  }
2766 
2767  return true;
2768  }
2769 
2770  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2771  SIInstrInfo::isSALU(MI) ||
2772  SIInstrInfo::isSMRD(MI))
2773  return false;
2774 
2775  return true;
2776 }
2777 
2778 static bool isSubRegOf(const SIRegisterInfo &TRI,
2779  const MachineOperand &SuperVec,
2780  const MachineOperand &SubReg) {
2781  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2782  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2783 
2784  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2785  SubReg.getReg() == SuperVec.getReg();
2786 }
2787 
2788 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2789  StringRef &ErrInfo) const {
2790  uint16_t Opcode = MI.getOpcode();
2791  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2792  return true;
2793 
2794  const MachineFunction *MF = MI.getParent()->getParent();
2795  const MachineRegisterInfo &MRI = MF->getRegInfo();
2796 
2797  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2798  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2799  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2800 
2801  // Make sure the number of operands is correct.
2802  const MCInstrDesc &Desc = get(Opcode);
2803  if (!Desc.isVariadic() &&
2804  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2805  ErrInfo = "Instruction has wrong number of operands.";
2806  return false;
2807  }
2808 
2809  if (MI.isInlineAsm()) {
2810  // Verify register classes for inlineasm constraints.
2811  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2812  I != E; ++I) {
2813  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2814  if (!RC)
2815  continue;
2816 
2817  const MachineOperand &Op = MI.getOperand(I);
2818  if (!Op.isReg())
2819  continue;
2820 
2821  unsigned Reg = Op.getReg();
2822  if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2823  ErrInfo = "inlineasm operand has incorrect register class.";
2824  return false;
2825  }
2826  }
2827 
2828  return true;
2829  }
2830 
2831  // Make sure the register classes are correct.
2832  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2833  if (MI.getOperand(i).isFPImm()) {
2834  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2835  "all fp values to integers.";
2836  return false;
2837  }
2838 
2839  int RegClass = Desc.OpInfo[i].RegClass;
2840 
2841  switch (Desc.OpInfo[i].OperandType) {
2843  if (MI.getOperand(i).isImm()) {
2844  ErrInfo = "Illegal immediate value for operand.";
2845  return false;
2846  }
2847  break;
2850  break;
2857  const MachineOperand &MO = MI.getOperand(i);
2858  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2859  ErrInfo = "Illegal immediate value for operand.";
2860  return false;
2861  }
2862  break;
2863  }
2866  // Check if this operand is an immediate.
2867  // FrameIndex operands will be replaced by immediates, so they are
2868  // allowed.
2869  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2870  ErrInfo = "Expected immediate, but got non-immediate";
2871  return false;
2872  }
2874  default:
2875  continue;
2876  }
2877 
2878  if (!MI.getOperand(i).isReg())
2879  continue;
2880 
2881  if (RegClass != -1) {
2882  unsigned Reg = MI.getOperand(i).getReg();
2883  if (Reg == AMDGPU::NoRegister ||
2884  TargetRegisterInfo::isVirtualRegister(Reg))
2885  continue;
2886 
2887  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2888  if (!RC->contains(Reg)) {
2889  ErrInfo = "Operand has incorrect register class.";
2890  return false;
2891  }
2892  }
2893  }
2894 
2895  // Verify SDWA
2896  if (isSDWA(MI)) {
2897  if (!ST.hasSDWA()) {
2898  ErrInfo = "SDWA is not supported on this target";
2899  return false;
2900  }
2901 
2902  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2903 
2904  const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2905 
2906  for (int OpIdx : OpIndices) {
2907  if (OpIdx == -1)
2908  continue;
2909  const MachineOperand &MO = MI.getOperand(OpIdx);
2910 
2911  if (!ST.hasSDWAScalar()) {
2912  // Only VGPRS on VI
2913  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2914  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2915  return false;
2916  }
2917  } else {
2918  // No immediates on GFX9
2919  if (!MO.isReg()) {
2920  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2921  return false;
2922  }
2923  }
2924  }
2925 
2926  if (!ST.hasSDWAOmod()) {
2927  // No omod allowed on VI
2928  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2929  if (OMod != nullptr &&
2930  (!OMod->isImm() || OMod->getImm() != 0)) {
2931  ErrInfo = "OMod not allowed in SDWA instructions on VI";
2932  return false;
2933  }
2934  }
2935 
2936  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2937  if (isVOPC(BasicOpcode)) {
2938  if (!ST.hasSDWASdst() && DstIdx != -1) {
2939  // Only vcc allowed as dst on VI for VOPC
2940  const MachineOperand &Dst = MI.getOperand(DstIdx);
2941  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2942  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2943  return false;
2944  }
2945  } else if (!ST.hasSDWAOutModsVOPC()) {
2946  // No clamp allowed on GFX9 for VOPC
2947  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2948  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2949  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2950  return false;
2951  }
2952 
2953  // No omod allowed on GFX9 for VOPC
2954  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2955  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2956  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2957  return false;
2958  }
2959  }
2960  }
2961 
2962  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2963  if (DstUnused && DstUnused->isImm() &&
2964  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2965  const MachineOperand &Dst = MI.getOperand(DstIdx);
2966  if (!Dst.isReg() || !Dst.isTied()) {
2967  ErrInfo = "Dst register should have tied register";
2968  return false;
2969  }
2970 
2971  const MachineOperand &TiedMO =
2972  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2973  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2974  ErrInfo =
2975  "Dst register should be tied to implicit use of preserved register";
2976  return false;
2977  } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2978  Dst.getReg() != TiedMO.getReg()) {
2979  ErrInfo = "Dst register should use same physical register as preserved";
2980  return false;
2981  }
2982  }
2983  }
2984 
2985  // Verify MIMG
2986  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
2987  // Ensure that the return type used is large enough for all the options
2988  // being used. TFE/LWE require an extra result register.
2989  const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
2990  if (DMask) {
2991  uint64_t DMaskImm = DMask->getImm();
2992  uint32_t RegCount =
2993  isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
2994  const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
2995  const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
2996  const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
2997 
2998  // Adjust for packed 16 bit values
2999  if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3000  RegCount >>= 1;
3001 
3002  // Adjust if using LWE or TFE
3003  if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3004  RegCount += 1;
3005 
3006  const uint32_t DstIdx =
3007  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3008  const MachineOperand &Dst = MI.getOperand(DstIdx);
3009  if (Dst.isReg()) {
3010  const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3011  uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3012  if (RegCount > DstSize) {
3013  ErrInfo = "MIMG instruction returns too many registers for dst "
3014  "register class";
3015  return false;
3016  }
3017  }
3018  }
3019  }
3020 
3021  // Verify VOP*. Ignore multiple sgpr operands on writelane.
3022  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
3023  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3024  // Only look at the true operands. Only a real operand can use the constant
3025  // bus, and we don't want to check pseudo-operands like the source modifier
3026  // flags.
3027  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3028 
3029  unsigned ConstantBusCount = 0;
3030  unsigned LiteralCount = 0;
3031 
3032  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3033  ++ConstantBusCount;
3034 
3035  unsigned SGPRUsed = findImplicitSGPRRead(MI);
3036  if (SGPRUsed != AMDGPU::NoRegister)
3037  ++ConstantBusCount;
3038 
3039  for (int OpIdx : OpIndices) {
3040  if (OpIdx == -1)
3041  break;
3042  const MachineOperand &MO = MI.getOperand(OpIdx);
3043  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3044  if (MO.isReg()) {
3045  if (MO.getReg() != SGPRUsed)
3046  ++ConstantBusCount;
3047  SGPRUsed = MO.getReg();
3048  } else {
3049  ++ConstantBusCount;
3050  ++LiteralCount;
3051  }
3052  }
3053  }
3054  if (ConstantBusCount > 1) {
3055  ErrInfo = "VOP* instruction uses the constant bus more than once";
3056  return false;
3057  }
3058 
3059  if (isVOP3(MI) && LiteralCount) {
3060  ErrInfo = "VOP3 instruction uses literal";
3061  return false;
3062  }
3063  }
3064 
3065  // Verify misc. restrictions on specific instructions.
3066  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
3067  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
3068  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3069  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3070  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
3071  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
3072  if (!compareMachineOp(Src0, Src1) &&
3073  !compareMachineOp(Src0, Src2)) {
3074  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3075  return false;
3076  }
3077  }
3078  }
3079 
3080  if (isSOPK(MI)) {
3081  int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
3082  if (sopkIsZext(MI)) {
3083  if (!isUInt<16>(Imm)) {
3084  ErrInfo = "invalid immediate for SOPK instruction";
3085  return false;
3086  }
3087  } else {
3088  if (!isInt<16>(Imm)) {
3089  ErrInfo = "invalid immediate for SOPK instruction";
3090  return false;
3091  }
3092  }
3093  }
3094 
3095  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3096  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3097  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3098  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3099  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3100  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3101 
3102  const unsigned StaticNumOps = Desc.getNumOperands() +
3103  Desc.getNumImplicitUses();
3104  const unsigned NumImplicitOps = IsDst ? 2 : 1;
3105 
3106  // Allow additional implicit operands. This allows a fixup done by the post
3107  // RA scheduler where the main implicit operand is killed and implicit-defs
3108  // are added for sub-registers that remain live after this instruction.
3109  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3110  ErrInfo = "missing implicit register operands";
3111  return false;
3112  }
3113 
3114  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3115  if (IsDst) {
3116  if (!Dst->isUse()) {
3117  ErrInfo = "v_movreld_b32 vdst should be a use operand";
3118  return false;
3119  }
3120 
3121  unsigned UseOpIdx;
3122  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3123  UseOpIdx != StaticNumOps + 1) {
3124  ErrInfo = "movrel implicit operands should be tied";
3125  return false;
3126  }
3127  }
3128 
3129  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3130  const MachineOperand &ImpUse
3131  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3132  if (!ImpUse.isReg() || !ImpUse.isUse() ||
3133  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3134  ErrInfo = "src0 should be subreg of implicit vector use";
3135  return false;
3136  }
3137  }
3138 
3139  // Make sure we aren't losing exec uses in the td files. This mostly requires
3140  // being careful when using let Uses to try to add other use registers.
3141  if (shouldReadExec(MI)) {
3142  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3143  ErrInfo = "VALU instruction does not implicitly read exec mask";
3144  return false;
3145  }
3146  }
3147 
3148  if (isSMRD(MI)) {
3149  if (MI.mayStore()) {
3150  // The register offset form of scalar stores may only use m0 as the
3151  // soffset register.
3152  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3153  if (Soff && Soff->getReg() != AMDGPU::M0) {
3154  ErrInfo = "scalar stores must use m0 as offset register";
3155  return false;
3156  }
3157  }
3158  }
3159 
3160  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3161  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3162  if (Offset->getImm() != 0) {
3163  ErrInfo = "subtarget does not support offsets in flat instructions";
3164  return false;
3165  }
3166  }
3167 
3168  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3169  if (DppCt) {
3170  using namespace AMDGPU::DPP;
3171 
3172  unsigned DC = DppCt->getImm();
3173  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3174  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3179  ErrInfo = "Invalid dpp_ctrl value";
3180  return false;
3181  }
3182  }
3183 
3184  return true;
3185 }
3186 
3187 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3188  switch (MI.getOpcode()) {
3189  default: return AMDGPU::INSTRUCTION_LIST_END;
3190  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3191  case AMDGPU::COPY: return AMDGPU::COPY;
3192  case AMDGPU::PHI: return AMDGPU::PHI;
3193  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3194  case AMDGPU::WQM: return AMDGPU::WQM;
3195  case AMDGPU::WWM: return AMDGPU::WWM;
3196  case AMDGPU::S_MOV_B32:
3197  return MI.getOperand(1).isReg() ?
3198  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3199  case AMDGPU::S_ADD_I32:
3200  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3201  case AMDGPU::S_ADDC_U32:
3202  return AMDGPU::V_ADDC_U32_e32;
3203  case AMDGPU::S_SUB_I32:
3204  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
3205  // FIXME: These are not consistently handled, and selected when the carry is
3206  // used.
3207  case AMDGPU::S_ADD_U32:
3208  return AMDGPU::V_ADD_I32_e32;
3209  case AMDGPU::S_SUB_U32:
3210  return AMDGPU::V_SUB_I32_e32;
3211  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3212  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3213  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3214  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3215  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3216  case AMDGPU::S_XNOR_B32:
3217  return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
3218  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3219  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3220  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3221  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3222  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3223  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3224  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3225  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3226  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3227  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3228  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3229  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3230  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3231  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3232  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3233  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3234  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3235  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3236  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3237  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3238  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3239  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3240  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3241  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3242  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3243  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3244  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3245  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3246  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3247  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3248  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3249  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3250  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3251  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3252  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3253  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3254  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3255  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3256  }
3257 }
3258 
3259 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3260  unsigned OpNo) const {
3261  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3262  const MCInstrDesc &Desc = get(MI.getOpcode());
3263  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3264  Desc.OpInfo[OpNo].RegClass == -1) {
3265  unsigned Reg = MI.getOperand(OpNo).getReg();
3266 
3267  if (TargetRegisterInfo::isVirtualRegister(Reg))
3268  return MRI.getRegClass(Reg);
3269  return RI.getPhysRegClass(Reg);
3270  }
3271 
3272  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3273  return RI.getRegClass(RCID);
3274 }
3275 
3276 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
3277  switch (MI.getOpcode()) {
3278  case AMDGPU::COPY:
3279  case AMDGPU::REG_SEQUENCE:
3280  case AMDGPU::PHI:
3281  case AMDGPU::INSERT_SUBREG:
3282  return RI.hasVGPRs(getOpRegClass(MI, 0));
3283  default:
3284  return RI.hasVGPRs(getOpRegClass(MI, OpNo));
3285  }
3286 }
3287 
3288 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3289  MachineBasicBlock::iterator I = MI;
3290  MachineBasicBlock *MBB = MI.getParent();
3291  MachineOperand &MO = MI.getOperand(OpIdx);
3292  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3293  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3294  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3295  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3296  if (MO.isReg())
3297  Opcode = AMDGPU::COPY;
3298  else if (RI.isSGPRClass(RC))
3299  Opcode = AMDGPU::S_MOV_B32;
3300 
3301  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3302  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3303  VRC = &AMDGPU::VReg_64RegClass;
3304  else
3305  VRC = &AMDGPU::VGPR_32RegClass;
3306 
3307  unsigned Reg = MRI.createVirtualRegister(VRC);
3308  DebugLoc DL = MBB->findDebugLoc(I);
3309  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3310  MO.ChangeToRegister(Reg, false);
3311 }
3312 
3313 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3314  MachineRegisterInfo &MRI,
3315  MachineOperand &SuperReg,
3316  const TargetRegisterClass *SuperRC,
3317  unsigned SubIdx,
3318  const TargetRegisterClass *SubRC)
3319  const {
3320  MachineBasicBlock *MBB = MI->getParent();
3321  DebugLoc DL = MI->getDebugLoc();
3322  unsigned SubReg = MRI.createVirtualRegister(SubRC);
3323 
3324  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3325  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3326  .addReg(SuperReg.getReg(), 0, SubIdx);
3327  return SubReg;
3328  }
3329 
3330  // Just in case the super register is itself a sub-register, copy it to a new
3331  // value so we don't need to worry about merging its subreg index with the
3332  // SubIdx passed to this function. The register coalescer should be able to
3333  // eliminate this extra copy.
3334  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3335 
3336  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3337  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3338 
3339  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3340  .addReg(NewSuperReg, 0, SubIdx);
3341 
3342  return SubReg;
3343 }
3344 
3345 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3346  MachineBasicBlock::iterator MII,
3347  MachineRegisterInfo &MRI,
3348  MachineOperand &Op,
3349  const TargetRegisterClass *SuperRC,
3350  unsigned SubIdx,
3351  const TargetRegisterClass *SubRC) const {
3352  if (Op.isImm()) {
3353  if (SubIdx == AMDGPU::sub0)
3354  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3355  if (SubIdx == AMDGPU::sub1)
3356  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3357 
3358  llvm_unreachable("Unhandled register index for immediate");
3359  }
3360 
3361  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3362  SubIdx, SubRC);
3363  return MachineOperand::CreateReg(SubReg, false);
3364 }
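// Worked example for the immediate case (illustrative): splitting the 64-bit
// immediate 0x0000000100000002 yields 0x2 for sub0 (low 32 bits) and 0x1 for
// sub1 (high 32 bits), matching the casts above.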
3365 
3366 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3367 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3368  assert(Inst.getNumExplicitOperands() == 3);
3369  MachineOperand Op1 = Inst.getOperand(1);
3370  Inst.RemoveOperand(1);
3371  Inst.addOperand(Op1);
3372 }
3373 
3374 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3375  const MCOperandInfo &OpInfo,
3376  const MachineOperand &MO) const {
3377  if (!MO.isReg())
3378  return false;
3379 
3380  unsigned Reg = MO.getReg();
3381  const TargetRegisterClass *RC =
3382  TargetRegisterInfo::isVirtualRegister(Reg) ?
3383  MRI.getRegClass(Reg) :
3384  RI.getPhysRegClass(Reg);
3385 
3386  const SIRegisterInfo *TRI =
3387  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3388  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3389 
3390  // In order to be legal, the common sub-class must be equal to the
3391  // class of the current operand. For example:
3392  //
3393  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3394  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3395  //
3396  // s_sendmsg 0, s0 ; Operand defined as m0reg
3397  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3398 
3399  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3400 }
3401 
3402 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3403  const MCOperandInfo &OpInfo,
3404  const MachineOperand &MO) const {
3405  if (MO.isReg())
3406  return isLegalRegOperand(MRI, OpInfo, MO);
3407 
3408  // Handle non-register types that are treated like immediates.
3409  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3410  return true;
3411 }
3412 
3413 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3414  const MachineOperand *MO) const {
3415  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3416  const MCInstrDesc &InstDesc = MI.getDesc();
3417  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3418  const TargetRegisterClass *DefinedRC =
3419  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3420  if (!MO)
3421  MO = &MI.getOperand(OpIdx);
3422 
3423  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3424 
3425  RegSubRegPair SGPRUsed;
3426  if (MO->isReg())
3427  SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3428 
3429  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3430  if (i == OpIdx)
3431  continue;
3432  const MachineOperand &Op = MI.getOperand(i);
3433  if (Op.isReg()) {
3434  if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3435  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3436  return false;
3437  }
3438  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3439  return false;
3440  }
3441  }
3442  }
3443 
3444  if (MO->isReg()) {
3445  assert(DefinedRC);
3446  return isLegalRegOperand(MRI, OpInfo, *MO);
3447  }
3448 
3449  // Handle non-register types that are treated like immediates.
3450  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3451 
3452  if (!DefinedRC) {
3453  // This operand expects an immediate.
3454  return true;
3455  }
3456 
3457  return isImmOperandLegal(MI, OpIdx, *MO);
3458 }
3459 
3460 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3461  MachineInstr &MI) const {
3462  unsigned Opc = MI.getOpcode();
3463  const MCInstrDesc &InstrDesc = get(Opc);
3464 
3465  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3466  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3467 
3468  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3469  // we need to only have one constant bus use.
3470  //
3471  // Note we do not need to worry about literal constants here. They are
3472  // disabled for the operand type for instructions because they will always
3473  // violate the one constant bus use rule.
3474  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3475  if (HasImplicitSGPR) {
3476  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3477  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3478 
3479  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3480  legalizeOpWithMove(MI, Src0Idx);
3481  }
3482 
3483  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3484  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3485  // src0/src1 with V_READFIRSTLANE.
3486  if (Opc == AMDGPU::V_WRITELANE_B32) {
3487  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3488  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3489  const DebugLoc &DL = MI.getDebugLoc();
3490  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3491  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3492  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3493  .add(Src0);
3494  Src0.ChangeToRegister(Reg, false);
3495  }
3496  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3497  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3498  const DebugLoc &DL = MI.getDebugLoc();
3499  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3500  .add(Src1);
3501  Src1.ChangeToRegister(Reg, false);
3502  }
3503  return;
3504  }
3505 
3506  // VOP2 src0 instructions support all operand types, so we don't need to check
3507  // their legality. If src1 is already legal, we don't need to do anything.
3508  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3509  return;
3510 
3511  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3512  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3513  // select is uniform.
3514  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3515  RI.isVGPR(MRI, Src1.getReg())) {
3516  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3517  const DebugLoc &DL = MI.getDebugLoc();
3518  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3519  .add(Src1);
3520  Src1.ChangeToRegister(Reg, false);
3521  return;
3522  }
3523 
3524  // We do not use commuteInstruction here because it is too aggressive and will
3525  // commute if it is possible. We only want to commute here if it improves
3526  // legality. This can be called a fairly large number of times so don't waste
3527  // compile time pointlessly swapping and checking legality again.
3528  if (HasImplicitSGPR || !MI.isCommutable()) {
3529  legalizeOpWithMove(MI, Src1Idx);
3530  return;
3531  }
3532 
3533  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3534  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3535 
3536  // If src0 can be used as src1, commuting will make the operands legal.
3537  // Otherwise we have to give up and insert a move.
3538  //
3539  // TODO: Other immediate-like operand kinds could be commuted if there was a
3540  // MachineOperand::ChangeTo* for them.
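 // e.g. (illustrative) v_add_f32_e32 v0, v1, s2 has an SGPR in src1, which
 // VOP2 does not allow; commuting it to v_add_f32_e32 v0, s2, v1 is legal
 // because src0 may read an SGPR.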
3541  if ((!Src1.isImm() && !Src1.isReg()) ||
3542  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3543  legalizeOpWithMove(MI, Src1Idx);
3544  return;
3545  }
3546 
3547  int CommutedOpc = commuteOpcode(MI);
3548  if (CommutedOpc == -1) {
3549  legalizeOpWithMove(MI, Src1Idx);
3550  return;
3551  }
3552 
3553  MI.setDesc(get(CommutedOpc));
3554 
3555  unsigned Src0Reg = Src0.getReg();
3556  unsigned Src0SubReg = Src0.getSubReg();
3557  bool Src0Kill = Src0.isKill();
3558 
3559  if (Src1.isImm())
3560  Src0.ChangeToImmediate(Src1.getImm());
3561  else if (Src1.isReg()) {
3562  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3563  Src0.setSubReg(Src1.getSubReg());
3564  } else
3565  llvm_unreachable("Should only have register or immediate operands");
3566 
3567  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3568  Src1.setSubReg(Src0SubReg);
3569 }
3570 
3571 // Legalize VOP3 operands. Because all operand types are supported for any
3572 // operand, and since literal constants are not allowed and should never be
3573 // seen, we only need to worry about inserting copies if we use multiple SGPR
3574 // operands.
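 // Illustrative sketch: V_FMA_F32 v0, s2, s3, v1 reads two different SGPRs,
 // so one of them (say s3) is copied into a VGPR via legalizeOpWithMove,
 // leaving a single SGPR / constant bus read.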
3575 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3576  MachineInstr &MI) const {
3577  unsigned Opc = MI.getOpcode();
3578 
3579  int VOP3Idx[3] = {
3580  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3581  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3582  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3583  };
3584 
3585  // Find the one SGPR operand we are allowed to use.
3586  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3587 
3588  for (unsigned i = 0; i < 3; ++i) {
3589  int Idx = VOP3Idx[i];
3590  if (Idx == -1)
3591  break;
3592  MachineOperand &MO = MI.getOperand(Idx);
3593 
3594  // We should never see a VOP3 instruction with an illegal immediate operand.
3595  if (!MO.isReg())
3596  continue;
3597 
3598  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3599  continue; // VGPRs are legal
3600 
3601  if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3602  SGPRReg = MO.getReg();
3603  // We can use one SGPR in each VOP3 instruction.
3604  continue;
3605  }
3606 
3607  // If we make it this far, then the operand is not legal and we must
3608  // legalize it.
3609  legalizeOpWithMove(MI, Idx);
3610  }
3611 }
3612 
3613 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3614  MachineRegisterInfo &MRI) const {
3615  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3616  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3617  unsigned DstReg = MRI.createVirtualRegister(SRC);
3618  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3619 
3620  if (SubRegs == 1) {
3621  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3622  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3623  .addReg(SrcReg);
3624  return DstReg;
3625  }
3626 
3627  SmallVector<unsigned, 8> SRegs;
3628  for (unsigned i = 0; i < SubRegs; ++i) {
3629  unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3630  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3631  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3632  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3633  SRegs.push_back(SGPR);
3634  }
3635 
3636  MachineInstrBuilder MIB =
3637  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3638  get(AMDGPU::REG_SEQUENCE), DstReg);
3639  for (unsigned i = 0; i < SubRegs; ++i) {
3640  MIB.addReg(SRegs[i]);
3641  MIB.addImm(RI.getSubRegFromChannel(i));
3642  }
3643  return DstReg;
3644 }
3645 
3646 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3647  MachineInstr &MI) const {
3648 
3649  // If the pointer is stored in VGPRs, then we need to move it to
3650  // SGPRs using v_readfirstlane. This is safe because we only select
3651  // loads with uniform pointers to SMRD instructions, so we know the
3652  // pointer value is uniform.
3653  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3654  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3655  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3656  SBase->setReg(SGPR);
3657  }
3658  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3659  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3660  unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3661  SOff->setReg(SGPR);
3662  }
3663 }
3664 
3665 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3666  MachineBasicBlock::iterator I,
3667  const TargetRegisterClass *DstRC,
3668  MachineOperand &Op,
3669  MachineRegisterInfo &MRI,
3670  const DebugLoc &DL) const {
3671  unsigned OpReg = Op.getReg();
3672  unsigned OpSubReg = Op.getSubReg();
3673 
3674  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3675  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3676 
3677  // Check if operand is already the correct register class.
3678  if (DstRC == OpRC)
3679  return;
3680 
3681  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3682  MachineInstr *Copy =
3683  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3684 
3685  Op.setReg(DstReg);
3686  Op.setSubReg(0);
3687 
3688  MachineInstr *Def = MRI.getVRegDef(OpReg);
3689  if (!Def)
3690  return;
3691 
3692  // Try to eliminate the copy if it is copying an immediate value.
3693  if (Def->isMoveImmediate())
3694  FoldImmediate(*Copy, *Def, OpReg, &MRI);
3695 }
3696 
3697 // Emit the actual waterfall loop, executing the wrapped instruction for each
3698 // unique value of \p Rsrc across all lanes. In the best case we execute 1
3699 // iteration, in the worst case we execute 64 (once per lane).
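 // Rough shape of the emitted loop (pseudocode sketch, not literal MIR):
 //   loop:
 //     sRsrc    = v_readfirstlane(vRsrc)        ; one 32-bit read per subreg
 //     cond     = all 128 bits of sRsrc == vRsrc per lane (2x V_CMP_EQ_U64)
 //     saveexec = exec; exec &= cond            ; S_AND_SAVEEXEC_B64
 //     ... wrapped instruction runs for the matching lanes ...
 //     exec     = exec ^ saveexec               ; retire the finished lanes
 //     s_cbranch_execnz loop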
3700 static void
3701 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
3702  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3703  const DebugLoc &DL, MachineOperand &Rsrc) {
3704  MachineBasicBlock::iterator I = LoopBB.begin();
3705 
3706  unsigned VRsrc = Rsrc.getReg();
3707  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
3708 
3709  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3710  unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3711  unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3712  unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3713  unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3714  unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3715  unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3716  unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3717  unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3718 
3719  // Beginning of the loop, read the next Rsrc variant.
3720  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
3721  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
3722  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
3723  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
3724  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
3725  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
3726  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
3727  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
3728 
3729  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
3730  .addReg(SRsrcSub0)
3731  .addImm(AMDGPU::sub0)
3732  .addReg(SRsrcSub1)
3733  .addImm(AMDGPU::sub1)
3734  .addReg(SRsrcSub2)
3735  .addImm(AMDGPU::sub2)
3736  .addReg(SRsrcSub3)
3737  .addImm(AMDGPU::sub3);
3738 
3739  // Update Rsrc operand to use the SGPR Rsrc.
3740  Rsrc.setReg(SRsrc);
3741  Rsrc.setIsKill(true);
3742 
3743  // Identify all lanes with identical Rsrc operands in their VGPRs.
3744  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
3745  .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
3746  .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
3747  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
3748  .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
3749  .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
3750  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
3751  .addReg(CondReg0)
3752  .addReg(CondReg1);
3753 
3754  MRI.setSimpleHint(SaveExec, AndCond);
3755 
3756  // Update EXEC to matching lanes, saving original to SaveExec.
3757  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
3758  .addReg(AndCond, RegState::Kill);
3759 
3760  // The original instruction is here; we insert the terminators after it.
3761  I = LoopBB.end();
3762 
3763  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3764  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
3765  .addReg(AMDGPU::EXEC)
3766  .addReg(SaveExec);
3767  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
3768 }
3769 
3770 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
3771 // with SGPRs by iterating over all unique values across all lanes.
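 // Resulting control flow (informal): the current block is split so that
 //   MBB -> LoopBB -> RemainderBB, with LoopBB also looping back to itself
 // while unprocessed lanes remain; EXEC is saved up front and restored at the
 // start of RemainderBB.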
3772 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
3773  MachineOperand &Rsrc, MachineDominatorTree *MDT) {
3774  MachineBasicBlock &MBB = *MI.getParent();
3775  MachineFunction &MF = *MBB.getParent();
3776  MachineRegisterInfo &MRI = MF.getRegInfo();
3777  MachineBasicBlock::iterator I(&MI);
3778  const DebugLoc &DL = MI.getDebugLoc();
3779 
3780  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3781 
3782  // Save the EXEC mask
3783  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
3784  .addReg(AMDGPU::EXEC);
3785 
3786  // Killed uses in the instruction we are waterfalling around will be
3787  // incorrect due to the added control-flow.
3788  for (auto &MO : MI.uses()) {
3789  if (MO.isReg() && MO.isUse()) {
3790  MRI.clearKillFlags(MO.getReg());
3791  }
3792  }
3793 
3794  // To insert the loop we need to split the block. Move everything after this
3795  // point to a new block, and insert a new empty block between the two.
3796  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
3797  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
3798  MachineFunction::iterator MBBI(MBB);
3799  ++MBBI;
3800 
3801  MF.insert(MBBI, LoopBB);
3802  MF.insert(MBBI, RemainderBB);
3803 
3804  LoopBB->addSuccessor(LoopBB);
3805  LoopBB->addSuccessor(RemainderBB);
3806 
3807  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
3808  MachineBasicBlock::iterator J = I++;
3809  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3810  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3811  LoopBB->splice(LoopBB->begin(), &MBB, J);
3812 
3813  MBB.addSuccessor(LoopBB);
3814 
3815  // Update dominators. We know that MBB immediately dominates LoopBB, that
3816  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
3817  // dominates all of the successors transferred to it from MBB that MBB used
3818  // to dominate.
3819  if (MDT) {
3820  MDT->addNewBlock(LoopBB, &MBB);
3821  MDT->addNewBlock(RemainderBB, LoopBB);
3822  for (auto &Succ : RemainderBB->successors()) {
3823  if (MDT->dominates(&MBB, Succ)) {
3824  MDT->changeImmediateDominator(Succ, RemainderBB);
3825  }
3826  }
3827  }
3828 
3829  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
3830 
3831  // Restore the EXEC mask
3832  MachineBasicBlock::iterator First = RemainderBB->begin();
3833  BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3834  .addReg(SaveExec);
3835 }
3836 
3837 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
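 // Layout of the replacement descriptor built below (informal sketch):
 //   NewSRsrc = { sub0_sub1 = 0 (64-bit base), sub2 = RSRC_DATA_FORMAT[31:0],
 //                sub3 = RSRC_DATA_FORMAT[63:32] }
 // while the original 64-bit base pointer is returned separately (RsrcPtr) so
 // the caller can fold it into VAddr.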
3838 static std::tuple<unsigned, unsigned>
3839 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
3840  MachineBasicBlock &MBB = *MI.getParent();
3841  MachineFunction &MF = *MBB.getParent();
3842  MachineRegisterInfo &MRI = MF.getRegInfo();
3843 
3844  // Extract the ptr from the resource descriptor.
3845  unsigned RsrcPtr =
3846  TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3847  AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3848 
3849  // Create an empty resource descriptor
3850  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3851  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3852  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3853  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3854  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3855 
3856  // Zero64 = 0
3857  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3858  .addImm(0);
3859 
3860  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3861  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3862  .addImm(RsrcDataFormat & 0xFFFFFFFF);
3863 
3864  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3865  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3866  .addImm(RsrcDataFormat >> 32);
3867 
3868  // NewSRsrc = {Zero64, SRsrcFormat}
3869  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3870  .addReg(Zero64)
3871  .addImm(AMDGPU::sub0_sub1)
3872  .addReg(SRsrcFormatLo)
3873  .addImm(AMDGPU::sub2)
3874  .addReg(SRsrcFormatHi)
3875  .addImm(AMDGPU::sub3);
3876 
3877  return std::make_tuple(RsrcPtr, NewSRsrc);
3878 }
3879 
3880 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3881  MachineDominatorTree *MDT) const {
3882  MachineFunction &MF = *MI.getParent()->getParent();
3883  MachineRegisterInfo &MRI = MF.getRegInfo();
3884 
3885  // Legalize VOP2
3886  if (isVOP2(MI) || isVOPC(MI)) {
3887  legalizeOperandsVOP2(MRI, MI);
3888  return;
3889  }
3890 
3891  // Legalize VOP3
3892  if (isVOP3(MI)) {
3893  legalizeOperandsVOP3(MRI, MI);
3894  return;
3895  }
3896 
3897  // Legalize SMRD
3898  if (isSMRD(MI)) {
3899  legalizeOperandsSMRD(MRI, MI);
3900  return;
3901  }
3902 
3903  // Legalize REG_SEQUENCE and PHI
3904  // The register class of the operands must be the same type as the register
3905  // class of the output.
3906  if (MI.getOpcode() == AMDGPU::PHI) {
3907  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3908  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3909  if (!MI.getOperand(i).isReg() ||
3911  continue;
3912  const TargetRegisterClass *OpRC =
3913  MRI.getRegClass(MI.getOperand(i).getReg());
3914  if (RI.hasVGPRs(OpRC)) {
3915  VRC = OpRC;
3916  } else {
3917  SRC = OpRC;
3918  }
3919  }
3920 
3921  // If any of the operands are VGPR registers, then they all must be VGPRs;
3922  // otherwise we will create illegal VGPR->SGPR copies when legalizing
3923  // them.
3924  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3925  if (!VRC) {
3926  assert(SRC);
3927  VRC = RI.getEquivalentVGPRClass(SRC);
3928  }
3929  RC = VRC;
3930  } else {
3931  RC = SRC;
3932  }
3933 
3934  // Update all the operands so they have the same type.
3935  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3936  MachineOperand &Op = MI.getOperand(I);
3938  continue;
3939 
3940  // MI is a PHI instruction.
3941  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3942  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3943 
3944  // Avoid creating no-op copies with the same src and dst reg class. These
3945  // confuse some of the machine passes.
3946  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3947  }
3948  }
3949 
3950  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3951  // VGPR dest type and SGPR sources, insert copies so all operands are
3952  // VGPRs. This seems to help operand folding / the register coalescer.
3953  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3954  MachineBasicBlock *MBB = MI.getParent();
3955  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3956  if (RI.hasVGPRs(DstRC)) {
3957  // Update all the operands so they are VGPR register classes. These may
3958  // not be the same register class because REG_SEQUENCE supports mixing
3959  // subregister index types e.g. sub0_sub1 + sub2 + sub3
3960  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3961  MachineOperand &Op = MI.getOperand(I);
3963  continue;
3964 
3965  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3966  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3967  if (VRC == OpRC)
3968  continue;
3969 
3970  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3971  Op.setIsKill();
3972  }
3973  }
3974 
3975  return;
3976  }
3977 
3978  // Legalize INSERT_SUBREG
3979  // src0 must have the same register class as dst
3980  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3981  unsigned Dst = MI.getOperand(0).getReg();
3982  unsigned Src0 = MI.getOperand(1).getReg();
3983  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3984  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3985  if (DstRC != Src0RC) {
3986  MachineBasicBlock *MBB = MI.getParent();
3987  MachineOperand &Op = MI.getOperand(1);
3988  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3989  }
3990  return;
3991  }
3992 
3993  // Legalize SI_INIT_M0
3994  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3995  MachineOperand &Src = MI.getOperand(0);
3996  if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3997  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3998  return;
3999  }
4000 
4001  // Legalize MIMG and MUBUF/MTBUF for shaders.
4002  //
4003  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
4004  // scratch memory access. In both cases, the legalization never involves
4005  // conversion to the addr64 form.
4006  if (isMIMG(MI) ||
4008  (isMUBUF(MI) || isMTBUF(MI)))) {
4009  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
4010  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
4011  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
4012  SRsrc->setReg(SGPR);
4013  }
4014 
4015  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
4016  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
4017  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
4018  SSamp->setReg(SGPR);
4019  }
4020  return;
4021  }
4022 
4023  // Legalize MUBUF* instructions.
4024  int RsrcIdx =
4025  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
4026  if (RsrcIdx != -1) {
4027  // We have an MUBUF instruction
4028  MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4029  unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4030  if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4031  RI.getRegClass(RsrcRC))) {
4032  // The operands are legal.
4033  // FIXME: We may need to legalize operands besides srsrc.
4034  return;
4035  }
4036 
4037  // Legalize a VGPR Rsrc.
4038  //
4039  // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4040  // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4041  // a zero-value SRsrc.
4042  //
4043  // If the instruction is _OFFSET (both idxen and offen disabled), and we
4044  // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4045  // above.
4046  //
4047  // Otherwise we are on non-ADDR64 hardware, and/or we have
4048  // idxen/offen/bothen and we fall back to a waterfall loop.
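 // Informal summary of the cases handled below:
 //   _ADDR64 variant              -> add RsrcPtr into VAddr, use zero-base SRsrc
 //   _OFFSET variant + ADDR64 HW  -> rewrite to the _ADDR64 opcode, then as above
 //   everything else              -> waterfall loop over unique Rsrc values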
4049 
4050  MachineBasicBlock &MBB = *MI.getParent();
4051 
4052  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4053  if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4054  // This is already an ADDR64 instruction so we need to add the pointer
4055  // extracted from the resource descriptor to the current value of VAddr.
4056  unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4057  unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4058  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4059 
4060  unsigned RsrcPtr, NewSRsrc;
4061  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4062 
4063  // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4064  DebugLoc DL = MI.getDebugLoc();
4065  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
4066  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4067  .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
4068 
4069  // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4070  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
4071  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4072  .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
4073 
4074  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4075  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4076  .addReg(NewVAddrLo)
4077  .addImm(AMDGPU::sub0)
4078  .addReg(NewVAddrHi)
4079  .addImm(AMDGPU::sub1);
4080 
4081  VAddr->setReg(NewVAddr);
4082  Rsrc->setReg(NewSRsrc);
4083  } else if (!VAddr && ST.hasAddr64()) {
4084  // This instruction is the _OFFSET variant, so we need to convert it to
4085  // ADDR64.
4086  assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4087  < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4088  "FIXME: Need to emit flat atomics here");
4089 
4090  unsigned RsrcPtr, NewSRsrc;
4091  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4092 
4093  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4094  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4095  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4096  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4097  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4098 
4099  // Atomics with return have an additional tied operand and are
4100  // missing some of the special bits.
4101  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4102  MachineInstr *Addr64;
4103 
4104  if (!VDataIn) {
4105  // Regular buffer load / store.
4106  MachineInstrBuilder MIB =
4107  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4108  .add(*VData)
4109  .addReg(NewVAddr)
4110  .addReg(NewSRsrc)
4111  .add(*SOffset)
4112  .add(*Offset);
4113 
4114  // Atomics do not have this operand.
4115  if (const MachineOperand *GLC =
4116  getNamedOperand(MI, AMDGPU::OpName::glc)) {
4117  MIB.addImm(GLC->getImm());
4118  }
4119 
4120  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4121 
4122  if (const MachineOperand *TFE =
4123  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4124  MIB.addImm(TFE->getImm());
4125  }
4126 
4127  MIB.cloneMemRefs(MI);
4128  Addr64 = MIB;
4129  } else {
4130  // Atomics with return.
4131  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4132  .add(*VData)
4133  .add(*VDataIn)
4134  .addReg(NewVAddr)
4135  .addReg(NewSRsrc)
4136  .add(*SOffset)
4137  .add(*Offset)
4138  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4139  .cloneMemRefs(MI);
4140  }
4141 
4142  MI.removeFromParent();
4143 
4144  // NewVAddr = {RsrcPtr:sub1, RsrcPtr:sub0}, i.e. the base pointer extracted from Rsrc
4145  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4146  NewVAddr)
4147  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4148  .addImm(AMDGPU::sub0)
4149  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4150  .addImm(AMDGPU::sub1);
4151  } else {
4152  // This is another variant; legalize Rsrc with a waterfall loop from VGPRs
4153  // to SGPRs.
4154  loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4155  }
4156  }
4157 }
4158 
4159 void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
4160  MachineDominatorTree *MDT) const {
4161  SetVectorType Worklist;
4162  Worklist.insert(&TopInst);
4163 
4164  while (!Worklist.empty()) {
4165  MachineInstr &Inst = *Worklist.pop_back_val();
4166  MachineBasicBlock *MBB = Inst.getParent();
4167  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4168 
4169  unsigned Opcode = Inst.getOpcode();
4170  unsigned NewOpcode = getVALUOp(Inst);
4171 
4172  // Handle some special cases
4173  switch (Opcode) {
4174  default:
4175  break;
4176  case AMDGPU::S_ADD_U64_PSEUDO:
4177  case AMDGPU::S_SUB_U64_PSEUDO:
4178  splitScalar64BitAddSub(Worklist, Inst, MDT);
4179  Inst.eraseFromParent();
4180  continue;
4181  case AMDGPU::S_ADD_I32:
4182  case AMDGPU::S_SUB_I32:
4183  // FIXME: The u32 versions currently selected use the carry.
4184  if (moveScalarAddSub(Worklist, Inst, MDT))
4185  continue;
4186 
4187  // Default handling
4188  break;
4189  case AMDGPU::S_AND_B64:
4190  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
4191  Inst.eraseFromParent();
4192  continue;
4193 
4194  case AMDGPU::S_OR_B64:
4195  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
4196  Inst.eraseFromParent();
4197  continue;
4198 
4199  case AMDGPU::S_XOR_B64:
4200  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
4201  Inst.eraseFromParent();
4202  continue;
4203 
4204  case AMDGPU::S_NAND_B64:
4205  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
4206  Inst.eraseFromParent();
4207  continue;
4208 
4209  case AMDGPU::S_NOR_B64:
4210  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
4211  Inst.eraseFromParent();
4212  continue;
4213 
4214  case AMDGPU::S_XNOR_B64:
4215  if (ST.hasDLInsts())
4216  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4217  else
4218  splitScalar64BitXnor(Worklist, Inst, MDT);
4219  Inst.eraseFromParent();
4220  continue;
4221 
4222  case AMDGPU::S_ANDN2_B64:
4223  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
4224  Inst.eraseFromParent();
4225  continue;
4226 
4227  case AMDGPU::S_ORN2_B64:
4228  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
4229  Inst.eraseFromParent();
4230  continue;
4231 
4232  case AMDGPU::S_NOT_B64:
4233  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
4234  Inst.eraseFromParent();
4235  continue;
4236 
4237  case AMDGPU::S_BCNT1_I32_B64:
4238  splitScalar64BitBCNT(Worklist, Inst);
4239  Inst.eraseFromParent();
4240  continue;
4241 
4242  case AMDGPU::S_BFE_I64:
4243  splitScalar64BitBFE(Worklist, Inst);
4244  Inst.eraseFromParent();
4245  continue;
4246 
4247  case AMDGPU::S_LSHL_B32:
4248  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4249  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4250  swapOperands(Inst);
4251  }
4252  break;
4253  case AMDGPU::S_ASHR_I32:
4254  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4255  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4256  swapOperands(Inst);
4257  }
4258  break;
4259  case AMDGPU::S_LSHR_B32:
4260  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4261  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4262  swapOperands(Inst);
4263  }
4264  break;
4265  case AMDGPU::S_LSHL_B64:
4266  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4267  NewOpcode = AMDGPU::V_LSHLREV_B64;
4268  swapOperands(Inst);
4269  }
4270  break;
4271  case AMDGPU::S_ASHR_I64:
4272  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4273  NewOpcode = AMDGPU::V_ASHRREV_I64;
4274  swapOperands(Inst);
4275  }
4276  break;
4277  case AMDGPU::S_LSHR_B64:
4278  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4279  NewOpcode = AMDGPU::V_LSHRREV_B64;
4280  swapOperands(Inst);
4281  }
4282  break;
4283 
4284  case AMDGPU::S_ABS_I32:
4285  lowerScalarAbs(Worklist, Inst);
4286  Inst.eraseFromParent();
4287  continue;
4288 
4289  case AMDGPU::S_CBRANCH_SCC0:
4290  case AMDGPU::S_CBRANCH_SCC1:
4291  // Clear unused bits of vcc
4292  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4293  AMDGPU::VCC)
4294  .addReg(AMDGPU::EXEC)
4295  .addReg(AMDGPU::VCC);
4296  break;
4297 
4298  case AMDGPU::S_BFE_U64:
4299  case AMDGPU::S_BFM_B64:
4300  llvm_unreachable("Moving this op to VALU not implemented");
4301 
4302  case AMDGPU::S_PACK_LL_B32_B16:
4303  case AMDGPU::S_PACK_LH_B32_B16:
4304  case AMDGPU::S_PACK_HH_B32_B16:
4305  movePackToVALU(Worklist, MRI, Inst);
4306  Inst.eraseFromParent();
4307  continue;
4308 
4309  case AMDGPU::S_XNOR_B32:
4310  lowerScalarXnor(Worklist, Inst);
4311  Inst.eraseFromParent();
4312  continue;
4313 
4314  case AMDGPU::S_NAND_B32:
4315  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
4316  Inst.eraseFromParent();
4317  continue;
4318 
4319  case AMDGPU::S_NOR_B32:
4320  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
4321  Inst.eraseFromParent();
4322  continue;
4323 
4324  case AMDGPU::S_ANDN2_B32:
4325  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
4326  Inst.eraseFromParent();
4327  continue;
4328 
4329  case AMDGPU::S_ORN2_B32:
4330  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
4331  Inst.eraseFromParent();
4332  continue;
4333  }
4334 
4335  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4336  // We cannot move this instruction to the VALU, so we should try to
4337  // legalize its operands instead.
4338  legalizeOperands(Inst, MDT);
4339  continue;
4340  }
4341 
4342  // Use the new VALU Opcode.
4343  const MCInstrDesc &NewDesc = get(NewOpcode);
4344  Inst.setDesc(NewDesc);
4345 
4346  // Remove any references to SCC. Vector instructions can't read from it, and
4347  // we're just about to add the implicit use / defs of VCC, and we don't want
4348  // both.
4349  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4350  MachineOperand &Op = Inst.getOperand(i);
4351  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4352  Inst.RemoveOperand(i);
4353  addSCCDefUsersToVALUWorklist(Inst, Worklist);
4354  }
4355  }
4356 
4357  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4358  // We are converting these to a BFE, so we need to add the missing
4359  // operands for the size and offset.
4360  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4361  Inst.addOperand(MachineOperand::CreateImm(0));
4362  Inst.addOperand(MachineOperand::CreateImm(Size));
4363 
4364  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4365  // The VALU version adds the second operand to the result, so insert an
4366  // extra 0 operand.
4367  Inst.addOperand(MachineOperand::CreateImm(0));
4368  }
4369 
4371 
4372  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4373  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4374  // If we need to move this to VGPRs, we need to unpack the second operand
4375  // back into the 2 separate ones for bit offset and width.
4376  assert(OffsetWidthOp.isImm() &&
4377  "Scalar BFE is only implemented for constant width and offset");
4378  uint32_t Imm = OffsetWidthOp.getImm();
4379 
4380  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4381  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
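 // Worked example with a hypothetical operand: Imm = 0x100008 gives
 // Offset = 0x100008 & 0x3f = 8 and BitWidth = (0x100008 & 0x7f0000) >> 16 = 16.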
4382  Inst.RemoveOperand(2); // Remove old immediate.
4383  Inst.addOperand(MachineOperand::CreateImm(Offset));
4384  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4385  }
4386 
4387  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4388  unsigned NewDstReg = AMDGPU::NoRegister;
4389  if (HasDst) {
4390  unsigned DstReg = Inst.getOperand(0).getReg();
4392  continue;
4393 
4394  // Update the destination register class.
4395  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4396  if (!NewDstRC)
4397  continue;
4398 
4399  if (Inst.isCopy() &&
4401  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4402  // Instead of creating a copy where src and dst are the same register
4403  // class, we just replace all uses of dst with src. These kinds of
4404  // copies interfere with the heuristics MachineSink uses to decide
4405  // whether or not to split a critical edge, since the pass assumes
4406  // that copies will end up as machine instructions and not be
4407  // eliminated.
4408  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4409  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4410  MRI.clearKillFlags(Inst.getOperand(1).getReg());
4411  Inst.getOperand(0).setReg(DstReg);
4412 
4413  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4414  // these are deleted later, but at -O0 it would leave a suspicious
4415  // looking illegal copy of an undef register.
4416  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4417  Inst.RemoveOperand(I);
4418  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4419  continue;
4420  }
4421 
4422  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4423  MRI.replaceRegWith(DstReg, NewDstReg);
4424  }
4425 
4426  // Legalize the operands
4427  legalizeOperands(Inst, MDT);
4428 
4429  if (HasDst)
4430  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4431  }
4432 }
4433 
4434 // Add/sub require special handling to deal with carry outs.
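 // e.g. (sketch) on subtargets with hasAddNoCarry(), an s_add_i32 whose SCC
 // result has no users can be rewritten in place to v_add_u32_e64 with its
 // SCC def operand dropped, as done below.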
4435 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4436  MachineDominatorTree *MDT) const {
4437  if (ST.hasAddNoCarry()) {
4438  // Assume there is no user of scc since we don't select this in that case.
4439  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4440  // is used.
4441 
4442  MachineBasicBlock &MBB = *Inst.getParent();
4443  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4444 
4445  unsigned OldDstReg = Inst.getOperand(0).getReg();
4446  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4447 
4448  unsigned Opc = Inst.getOpcode();
4449  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4450 
4451  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4452  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4453 
4454  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4455  Inst.RemoveOperand(3);
4456 
4457  Inst.setDesc(get(NewOpc));
4458  Inst.addImplicitDefUseOperands(*MBB.getParent());
4459  MRI.replaceRegWith(OldDstReg, ResultReg);
4460  legalizeOperands(Inst, MDT);
4461 
4462  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4463  return true;
4464  }
4465 
4466  return false;
4467 }
4468 
4469 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4470  MachineInstr &Inst) const {
4471  MachineBasicBlock &MBB = *Inst.getParent();
4472  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4473  MachineBasicBlock::iterator MII = Inst;
4474  DebugLoc DL = Inst.getDebugLoc();
4475 
4476  MachineOperand &Dest = Inst.getOperand(0);
4477  MachineOperand &Src = Inst.getOperand(1);
4478  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4479  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4480 
4481  unsigned SubOp = ST.hasAddNoCarry() ?
4482  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4483 
4484  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4485  .addImm(0)
4486  .addReg(Src.getReg());
4487 
4488  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4489  .addReg(Src.getReg())
4490  .addReg(TmpReg);
4491 
4492  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4493  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4494 }
4495 
4496 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4497  MachineInstr &Inst) const {
4498  MachineBasicBlock &MBB = *Inst.getParent();
4499  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4500  MachineBasicBlock::iterator MII = Inst;
4501  const DebugLoc &DL = Inst.getDebugLoc();
4502 
4503  MachineOperand &Dest = Inst.getOperand(0);
4504  MachineOperand &Src0 = Inst.getOperand(1);
4505  MachineOperand &Src1 = Inst.getOperand(2);
4506 
4507  if (ST.hasDLInsts()) {
4508  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4509  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4510  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4511 
4512  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4513  .add(Src0)
4514  .add(Src1);
4515 
4516  MRI.replaceRegWith(Dest.getReg(), NewDest);
4517  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4518  } else {
4519  // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
4520  // invert either source and then perform the XOR. If either source is a
4521  // scalar register, then we can leave the inversion on the scalar unit to
4522  // achieve a better distribution of scalar and vector instructions.
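 // Quick check of the identity on 4-bit values (higher bits behave the same):
 //   x = 0b1010, y = 0b0110: ~(x ^ y) ends in 0011, and (~x) ^ y =
 //   ...0101 ^ 0110 also ends in 0011.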
4523  bool Src0IsSGPR = Src0.isReg() &&
4524  RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
4525  bool Src1IsSGPR = Src1.isReg() &&
4526  RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
4527  MachineInstr *Not = nullptr;
4528  MachineInstr *Xor = nullptr;
4529  unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4530  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4531 
4532  // Build a pair of scalar instructions and add them to the work list.
4533  // The next iteration over the work list will lower these to the vector
4534  // unit as necessary.
4535  if (Src0IsSGPR) {
4536  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4537  .add(Src0);
4538  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4539  .addReg(Temp)
4540  .add(Src1);
4541  } else if (Src1IsSGPR) {
4542  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4543  .add(Src1);
4544  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4545  .add(Src0)
4546  .addReg(Temp);
4547  } else {
4548  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
4549  .add(Src0)
4550  .add(Src1);
4551  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4552  .addReg(Temp);
4553  Worklist.insert(Not);
4554  }
4555 
4556  MRI.replaceRegWith(Dest.getReg(), NewDest);
4557 
4558  Worklist.insert(Xor);
4559 
4560  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4561  }
4562 }
4563 
4564 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
4565  MachineInstr &Inst,
4566  unsigned Opcode) const {
4567  MachineBasicBlock &MBB = *Inst.getParent();
4568  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4569  MachineBasicBlock::iterator MII = Inst;
4570  const DebugLoc &DL = Inst.getDebugLoc();
4571 
4572  MachineOperand &Dest = Inst.getOperand(0);
4573  MachineOperand &Src0 = Inst.getOperand(1);
4574  MachineOperand &Src1 = Inst.getOperand(2);
4575 
4576  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4577  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4578 
4579  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
4580  .add(Src0)
4581  .add(Src1);
4582 
4583  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4584  .addReg(Interm);
4585 
4586  Worklist.insert(&Op);
4587  Worklist.insert(&Not);
4588 
4589  MRI.replaceRegWith(Dest.getReg(), NewDest);
4590  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4591 }
4592 
4593 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
4594  MachineInstr &Inst,
4595  unsigned Opcode) const {
4596  MachineBasicBlock &MBB = *Inst.getParent();
4597  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4598  MachineBasicBlock::iterator MII = Inst;
4599  const DebugLoc &DL = Inst.getDebugLoc();
4600 
4601  MachineOperand &Dest = Inst.getOperand(0);
4602  MachineOperand &Src0 = Inst.getOperand(1);
4603  MachineOperand &Src1 = Inst.getOperand(2);
4604 
4605  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4606  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4607 
4608  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
4609  .add(Src1);
4610 
4611  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
4612  .add(Src0)
4613  .addReg(Interm);
4614 
4615  Worklist.insert(&Not);
4616  Worklist.insert(&Op);
4617 
4618  MRI.replaceRegWith(Dest.getReg(), NewDest);
4619  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4620 }
4621 
4622 void SIInstrInfo::splitScalar64BitUnaryOp(
4623  SetVectorType &Worklist, MachineInstr &Inst,
4624  unsigned Opcode) const {
4625  MachineBasicBlock &MBB = *Inst.getParent();
4626  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4627 
4628  MachineOperand &Dest = Inst.getOperand(0);
4629  MachineOperand &Src0 = Inst.getOperand(1);
4630  DebugLoc DL = Inst.getDebugLoc();
4631 
4632  MachineBasicBlock::iterator MII = Inst;
4633 
4634  const MCInstrDesc &InstDesc = get(Opcode);
4635  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4636  MRI.getRegClass(Src0.getReg()) :
4637  &AMDGPU::SGPR_32RegClass;
4638 
4639  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4640 
4641  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4642  AMDGPU::sub0, Src0SubRC);
4643 
4644  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4645  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4646  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4647 
4648  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4649  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4650 
4651  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4652  AMDGPU::sub1, Src0SubRC);
4653 
4654  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4655  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4656 
4657  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4658  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4659  .addReg(DestSub0)
4660  .addImm(AMDGPU::sub0)
4661  .addReg(DestSub1)
4662  .addImm(AMDGPU::sub1);
4663 
4664  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4665 
4666  Worklist.insert(&LoHalf);
4667  Worklist.insert(&HiHalf);
4668 
4669  // We don't need to legalizeOperands here because for a single operand, src0
4670  // will support any kind of input.
4671 
4672  // Move all users of this moved value.
4673  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4674 }
4675 
4676 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
4677  MachineInstr &Inst,
4678  MachineDominatorTree *MDT) const {
4679  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4680 
4681  MachineBasicBlock &MBB = *Inst.getParent();
4682  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4683 
4684  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4685  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4686  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4687 
4688  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4689  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4690 
4691  MachineOperand &Dest = Inst.getOperand(0);
4692  MachineOperand &Src0 = Inst.getOperand(1);
4693  MachineOperand &Src1 = Inst.getOperand(2);
4694  const DebugLoc &DL = Inst.getDebugLoc();
4695  MachineBasicBlock::iterator MII = Inst;
4696 
4697  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4698  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4699  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4700  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4701 
4702  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4703  AMDGPU::sub0, Src0SubRC);
4704  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4705  AMDGPU::sub0, Src1SubRC);
4706 
4707 
4708  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4709  AMDGPU::sub1, Src0SubRC);
4710  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4711  AMDGPU::sub1, Src1SubRC);
4712 
4713  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4714  MachineInstr *LoHalf =
4715  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4716  .addReg(CarryReg, RegState::Define)
4717  .add(SrcReg0Sub0)
4718  .add(SrcReg1Sub0);
4719 
4720  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4721  MachineInstr *HiHalf =
4722  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4723  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4724  .add(SrcReg0Sub1)
4725  .add(SrcReg1Sub1)
4726  .addReg(CarryReg, RegState::Kill);
4727 
4728  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4729  .addReg(DestSub0)
4730  .addImm(AMDGPU::sub0)
4731  .addReg(DestSub1)
4732  .addImm(AMDGPU::sub1);
4733 
4734  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4735 
4736  // Try to legalize the operands in case we need to swap the order to keep it
4737  // valid.
4738  legalizeOperands(*LoHalf, MDT);
4739  legalizeOperands(*HiHalf, MDT);
4740 
4741  // Move all users of this moved value.
4742  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4743 }
4744 
4745 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4746  MachineInstr &Inst, unsigned Opcode,
4747  MachineDominatorTree *MDT) const {
4748  MachineBasicBlock &MBB = *Inst.getParent();
4749  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4750 
4751  MachineOperand &Dest = Inst.getOperand(0);
4752  MachineOperand &Src0 = Inst.getOperand(1);
4753  MachineOperand &Src1 = Inst.getOperand(2);
4754  DebugLoc DL = Inst.getDebugLoc();
4755 
4756  MachineBasicBlock::iterator MII = Inst;
4757 
4758  const MCInstrDesc &InstDesc = get(Opcode);
4759  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4760  MRI.getRegClass(Src0.getReg()) :
4761  &AMDGPU::SGPR_32RegClass;
4762 
4763  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4764  const TargetRegisterClass *Src1RC = Src1.isReg() ?
4765  MRI.getRegClass(Src1.getReg()) :
4766  &AMDGPU::SGPR_32RegClass;
4767 
4768  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4769 
4770  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4771  AMDGPU::sub0, Src0SubRC);
4772  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4773  AMDGPU::sub0, Src1SubRC);
4774  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4775  AMDGPU::sub1, Src0SubRC);
4776  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4777  AMDGPU::sub1, Src1SubRC);
4778 
4779  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4780  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4781  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4782 
4783  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4784  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4785  .add(SrcReg0Sub0)
4786  .add(SrcReg1Sub0);
4787 
4788  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4789  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4790  .add(SrcReg0Sub1)
4791  .add(SrcReg1Sub1);
4792 
4793  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4794  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4795  .addReg(DestSub0)
4796  .addImm(AMDGPU::sub0)
4797  .addReg(DestSub1)
4798  .addImm(AMDGPU::sub1);
4799 
4800  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4801 
4802  Worklist.insert(&LoHalf);
4803  Worklist.insert(&HiHalf);
4804 
4805  // Move all users of this moved value.
4806  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4807 }
4808 
4809 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
4810  MachineInstr &Inst,
4811  MachineDominatorTree *MDT) const {
4812  MachineBasicBlock &MBB = *Inst.getParent();
4813  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4814 
4815  MachineOperand &Dest = Inst.getOperand(0);
4816  MachineOperand &Src0 = Inst.getOperand(1);
4817  MachineOperand &Src1 = Inst.getOperand(2);
4818  const DebugLoc &DL = Inst.getDebugLoc();
4819 
4820  MachineBasicBlock::iterator MII = Inst;
4821 
4822  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4823 
4824  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4825 
4826  MachineOperand* Op0;
4827  MachineOperand* Op1;
4828 
4829  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
4830  Op0 = &Src0;
4831  Op1 = &Src1;
4832  } else {
4833  Op0 = &Src1;
4834  Op1 = &Src0;
4835  }
4836 
4837  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
4838  .add(*Op0);
4839 
4840  unsigned NewDest = MRI.createVirtualRegister(DestRC);
4841 
4842  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
4843  .addReg(Interm)
4844  .add(*Op1);
4845 
4846  MRI.replaceRegWith(Dest.getReg(), NewDest);
4847 
4848  Worklist.insert(&Xor);
4849 }
4850 
4851 void SIInstrInfo::splitScalar64BitBCNT(
4852  SetVectorType &Worklist, MachineInstr &Inst) const {
4853  MachineBasicBlock &MBB = *Inst.getParent();
4854  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4855 
4856  MachineBasicBlock::iterator MII = Inst;
4857  const DebugLoc &DL = Inst.getDebugLoc();
4858 
4859  MachineOperand &Dest = Inst.getOperand(0);
4860  MachineOperand &Src = Inst.getOperand(1);
4861 
4862  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4863  const TargetRegisterClass *SrcRC = Src.isReg() ?
4864  MRI.getRegClass(Src.getReg()) :
4865  &AMDGPU::SGPR_32RegClass;
4866 
4867  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4868  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4869 
4870  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4871 
4872  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4873  AMDGPU::sub0, SrcSubRC);
4874  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4875  AMDGPU::sub1, SrcSubRC);
4876 
4877  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4878 
4879  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4880 
4881  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4882 
4883  // We don't need to legalize operands here. src0 for either instruction can be
4884  // an SGPR, and the second input is unused or determined here.
4885  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4886 }
4887 
4888 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4889  MachineInstr &Inst) const {
4890  MachineBasicBlock &MBB = *Inst.getParent();
4891  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4892  MachineBasicBlock::iterator MII = Inst;
4893  const DebugLoc &DL = Inst.getDebugLoc();
4894 
4895  MachineOperand &Dest = Inst.getOperand(0);
4896  uint32_t Imm = Inst.getOperand(2).getImm();
4897  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4898  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4899 
4900  (void) Offset;
4901 
4902  // Only sext_inreg cases handled.
4903  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4904  Offset == 0 && "Not implemented");
4905 
4906  if (BitWidth < 32) {
4907  unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4908  unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4909  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4910 
4911  BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4912  .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4913  .addImm(0)
4914  .addImm(BitWidth);
4915 
4916  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4917  .addImm(31)
4918  .addReg(MidRegLo);
4919 
4920  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4921  .addReg(MidRegLo)
4922  .addImm(AMDGPU::sub0)
4923  .addReg(MidRegHi)
4924  .addImm(AMDGPU::sub1);
4925 
4926  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4927  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4928  return;
4929  }
4930 
4931  MachineOperand &Src = Inst.getOperand(1);
4932  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4933  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4934 
4935  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4936  .addImm(31)
4937  .addReg(Src.getReg(), 0, AMDGPU::sub0);
4938 
4939  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4940  .addReg(Src.getReg(), 0, AMDGPU::sub0)
4941  .addImm(AMDGPU::sub0)
4942  .addReg(TmpReg)
4943  .addImm(AMDGPU::sub1);
4944 
4945  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4946  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4947 }
4948 
4949 void SIInstrInfo::addUsersToMoveToVALUWorklist(
4950  unsigned DstReg,
4951  MachineRegisterInfo &MRI,
4952  SetVectorType &Worklist) const {
4953  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4954  E = MRI.use_end(); I != E;) {
4955  MachineInstr &UseMI = *I->getParent();
4956  if (!canReadVGPR(UseMI, I.getOperandNo())) {
4957  Worklist.insert(&UseMI);
4958 
4959  do {
4960  ++I;
4961  } while (I != E && I->getParent() == &UseMI);
4962  } else {
4963  ++I;
4964  }
4965  }
4966 }
4967 
4968 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4969  MachineRegisterInfo &MRI,
4970  MachineInstr &Inst) const {
4971  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4972  MachineBasicBlock *MBB = Inst.getParent();
4973  MachineOperand &Src0 = Inst.getOperand(1);
4974  MachineOperand &Src1 = Inst.getOperand(2);
4975  const DebugLoc &DL = Inst.getDebugLoc();
4976 
4977  switch (Inst.getOpcode()) {
4978  case AMDGPU::S_PACK_LL_B32_B16: {
4979  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4980  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4981 
4982  // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4983  // 0.
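 // Sketch of the expansion below for s_pack_ll_b32_b16 (values invented):
 //   src0 = 0xAAAA1234, src1 = 0xBBBB5678
 //   tmp    = src0 & 0xffff       = 0x00001234
 //   result = (src1 << 16) | tmp  = 0x56781234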
4984  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4985  .addImm(0xffff);
4986 
4987  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4988  .addReg(ImmReg, RegState::Kill)
4989  .add(Src0);
4990 
4991  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
4992  .add(Src1)
4993  .addImm(16)
4994  .addReg(TmpReg, RegState::Kill);
4995  break;
4996  }
4997  case AMDGPU::S_PACK_LH_B32_B16: {
4998  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4999  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5000  .addImm(0xffff);
5001  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
5002  .addReg(ImmReg, RegState::Kill)
5003  .add(Src0)
5004  .add(Src1);
5005  break;
5006  }
5007  case AMDGPU::S_PACK_HH_B32_B16: {
5008  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5009  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5010  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
5011  .addImm(16)
5012  .add(Src0);
5013  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5014  .addImm(0xffff0000);
5015  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
5016  .add(Src1)
5017  .addReg(ImmReg, RegState::Kill)
5018  .addReg(TmpReg, RegState::Kill);
5019  break;
5020  }
5021  default:
5022  llvm_unreachable("unhandled s_pack_* instruction");
5023  }
5024 
5025  MachineOperand &Dest = Inst.getOperand(0);
5026  MRI.replaceRegWith(Dest.getReg(), ResultReg);
5027  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5028 }
5029 
5030 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
5031  MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
5032  // This assumes that all the users of SCC are in the same block
5033  // as the SCC def.
5034  for (MachineInstr &MI :
5036  SCCDefInst.getParent()->end())) {
5037  // Exit if we find another SCC def.
5038  if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
5039  return;
5040 
5041  if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
5042  Worklist.insert(&MI);
5043  }
5044 }
5045 
5046 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
5047  const MachineInstr &Inst) const {
5048  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
5049 
5050  switch (Inst.getOpcode()) {
5051  // For target instructions, getOpRegClass just returns the virtual register
5052  // class associated with the operand, so we need to find an equivalent VGPR
5053  // register class in order to move the instruction to the VALU.
5054  case AMDGPU::COPY:
5055  case AMDGPU::PHI:
5056  case AMDGPU::REG_SEQUENCE:
5057  case AMDGPU::INSERT_SUBREG:
5058  case AMDGPU::WQM:
5059  case AMDGPU::WWM:
5060  if (RI.hasVGPRs(NewDstRC))
5061  return nullptr;
5062 
5063  NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
5064  if (!NewDstRC)
5065  return nullptr;
5066  return NewDstRC;
5067  default:
5068  return NewDstRC;
5069  }
5070 }
5071 
5072 // Find the one SGPR operand we are allowed to use.
5073 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
5074  int OpIndices[3]) const {
5075  const MCInstrDesc &Desc = MI.getDesc();
5076 
5077  // Find the one SGPR operand we are allowed to use.
5078  //
5079  // First we need to consider the instruction's operand requirements before
5080  // legalizing. Some operands are required to be SGPRs, such as implicit uses
5081  // of VCC, but we are still bound by the constant bus requirement to only use
5082  // one.
5083  //
5084  // If the operand's class is an SGPR, we can never move it.
5085 
5086  unsigned SGPRReg = findImplicitSGPRRead(MI);
5087  if (SGPRReg != AMDGPU::NoRegister)
5088  return SGPRReg;
5089 
5090  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
5091  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5092 
5093  for (unsigned i = 0; i < 3; ++i) {
5094  int Idx = OpIndices[i];
5095  if (Idx == -1)
5096  break;
5097 
5098  const MachineOperand &MO = MI.getOperand(Idx);
5099  if (!MO.isReg())
5100  continue;
5101 
5102  // Is this operand statically required to be an SGPR based on the operand
5103  // constraints?
5104  const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
5105  bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
5106  if (IsRequiredSGPR)
5107  return MO.getReg();
5108 
5109  // If this could be a VGPR or an SGPR, check the dynamic register class.
5110  unsigned Reg = MO.getReg();
5111  const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
5112  if (RI.isSGPRClass(RegRC))
5113  UsedSGPRs[i] = Reg;
5114  }
5115 
5116  // We don't have a required SGPR operand, so we have a bit more freedom in
5117  // selecting operands to move.
5118 
5119  // Try to select the most used SGPR. If an SGPR is equal to one of the
5120  // others, we choose that.
5121  //
5122  // e.g.
5123  // V_FMA_F32 v0, s0, s0, s0 -> No moves
5124  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
5125 
5126  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
5127  // prefer those.
5128 
5129  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
5130  if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
5131  SGPRReg = UsedSGPRs[0];
5132  }
5133 
5134  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
5135  if (UsedSGPRs[1] == UsedSGPRs[2])
5136  SGPRReg = UsedSGPRs[1];
5137  }
5138 
5139  return SGPRReg;
5140 }
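// --- Editorial sketch, not part of SIInstrInfo.cpp ---
// The tie-break above keeps the SGPR that is repeated among the sources,
// because a value read twice still occupies only one constant-bus slot
// (V_FMA_F32 v0, s0, s1, s0 only needs s1 moved to a VGPR). Here 0 stands in
// for AMDGPU::NoRegister and pickSGPRToKeep is a hypothetical name.
namespace sketch {
// Given the SGPRs used by up to three operands (0 = no SGPR), return the one
// worth keeping in an SGPR, or 0 if no operand is duplicated.
unsigned pickSGPRToKeep(const unsigned UsedSGPRs[3]) {
  if (UsedSGPRs[0] != 0 &&
      (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]))
    return UsedSGPRs[0];
  if (UsedSGPRs[1] != 0 && UsedSGPRs[1] == UsedSGPRs[2])
    return UsedSGPRs[1];
  return 0;
}
} // namespace sketch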
5141 
5142 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
5143  unsigned OperandName) const {
5144  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
5145  if (Idx == -1)
5146  return nullptr;
5147 
5148  return &MI.getOperand(Idx);
5149 }
5150 
5151 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
5152  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
5153  if (ST.isAmdHsaOS()) {
5154  // Set ATC = 1. GFX9 doesn't have this bit.
5155  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5156  RsrcDataFormat |= (1ULL << 56);
5157 
5158  // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
5159  // Note that this disables TC L2 and therefore decreases performance.
5160  if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
5161  RsrcDataFormat |= (2ULL << 59);
5162  }
5163 
5164  return RsrcDataFormat;
5165 }
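// --- Editorial sketch, not part of SIInstrInfo.cpp ---
// The two HSA-only adjustments above are plain bit inserts into the 64-bit
// resource word: ATC is a single bit at position 56 and MTYPE is a small
// field starting at bit 59 that is set to 2 (uncached) on VI. The function
// name below is hypothetical; only the shift amounts come from the code above.
#include <cstdint>

namespace sketch {
uint64_t hsaRsrcDataFormatBits(bool SetAtc, bool SetUncachedMType) {
  uint64_t Bits = 0;
  if (SetAtc)
    Bits |= 1ULL << 56; // ATC = 1 (bit does not exist on GFX9)
  if (SetUncachedMType)
    Bits |= 2ULL << 59; // MTYPE = 2, MTYPE_UC (VI only)
  return Bits;
}
} // namespace sketch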
5166 
5168  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
5170  0xffffffff; // Size;
5171 
5172  // GFX9 doesn't have ELEMENT_SIZE.
5173  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5174  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
5175  Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
5176  }
5177 
5178  // IndexStride = 64.
5179  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
5180 
5181  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
5182  // Clear them unless we want a huge stride.
5183  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5184  Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
5185 
5186  return Rsrc23;
5187 }
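// --- Editorial sketch, not part of SIInstrInfo.cpp ---
// ELEMENT_SIZE stores log2(bytes) - 1, so a 4-byte private element encodes as
// field value 1 and a 16-byte element as 3, while INDEX_STRIDE = 3 selects a
// stride of 64. The shift constants and function name below are illustrative
// placeholders, not the real AMDGPU::RSRC_* definitions.
#include <cstdint>

namespace sketch {
constexpr unsigned ElementSizeShift = 19; // placeholder shift value
constexpr unsigned IndexStrideShift = 21; // placeholder shift value

constexpr uint64_t log2u(uint64_t V) {
  uint64_t L = 0;
  while (V > 1) { V >>= 1; ++L; }
  return L;
}

constexpr uint64_t scratchRsrcFieldBits(uint64_t MaxPrivateElementSize) {
  uint64_t Bits = (log2u(MaxPrivateElementSize) - 1) << ElementSizeShift;
  Bits |= UINT64_C(3) << IndexStrideShift; // IndexStride = 64
  return Bits;
}

// 4-byte elements -> ELEMENT_SIZE field value 1, as described above.
static_assert(scratchRsrcFieldBits(4) ==
                  ((UINT64_C(1) << ElementSizeShift) |
                   (UINT64_C(3) << IndexStrideShift)),
              "4-byte private elements encode as field value 1");
} // namespace sketch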
5188 
5189 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
5190  unsigned Opc = MI.getOpcode();
5191 
5192  return isSMRD(Opc);
5193 }
5194 
5195 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
5196  unsigned Opc = MI.getOpcode();
5197 
5198  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
5199 }
5200 
5201 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
5202  int &FrameIndex) const {
5203  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5204  if (!Addr || !Addr->isFI())
5205  return AMDGPU::NoRegister;
5206 
5207  assert(!MI.memoperands_empty() &&
5208  "Frame index access doesn't have frame info");
5209 
5210  FrameIndex = Addr->getIndex();
5211  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
5212 }
5213 
5214 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
5215  int &FrameIndex) const {
5216  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
5217  assert(Addr && Addr->isFI());
5218  FrameIndex = Addr->getIndex();
5219  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
5220 }
5221 
5222 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
5223  int &FrameIndex) const {
5224  if (!MI.mayLoad())
5225  return AMDGPU::NoRegister;
5226 
5227  if (isMUBUF(MI) || isVGPRSpill(MI))
5228  return isStackAccess(MI, FrameIndex);
5229 
5230  if (isSGPRSpill(MI))
5231  return isSGPRStackAccess(MI, FrameIndex);
5232 
5233  return AMDGPU::NoRegister;
5234 }
5235 
5236 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
5237  int &FrameIndex) const {
5238  if (!MI.mayStore())
5239  return AMDGPU::NoRegister;
5240 
5241  if (isMUBUF(MI) || isVGPRSpill(MI))
5242  return isStackAccess(MI, FrameIndex);
5243 
5244  if (isSGPRSpill(MI))
5245  return isSGPRStackAccess(MI, FrameIndex);
5246 
5247  return AMDGPU::NoRegister;
5248 }
5249 
5250 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
5251  unsigned Size = 0;
5252  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
5253  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
5254  while (++I != E && I->isInsideBundle()) {
5255  assert(!I->isBundle() && "No nested bundle!");
5256  Size += getInstSizeInBytes(*I);
5257  }
5258 
5259  return Size;
5260 }
5261 
5262 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
5263  unsigned Opc = MI.getOpcode();
5264  const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
5265  unsigned DescSize = Desc.getSize();
5266 
5267  // If we have a definitive size, we can use it. Otherwise we need to inspect
5268  // the operands to know the size.
5269  if (isFixedSize(MI))
5270  return DescSize;
5271 
5272  // 4-byte instructions may have a 32-bit literal encoded after them. Check
5273  // operands that could ever be literals.
5274  if (isVALU(MI) || isSALU(MI)) {
5275  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5276  if (Src0Idx == -1)
5277  return DescSize; // No operands.
5278 
5279  if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
5280  return DescSize + 4;
5281 
5282  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5283  if (Src1Idx == -1)
5284  return DescSize;
5285 
5286  if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
5287  return DescSize + 4;