SIInstrInfo.cpp (LLVM 9.0.0svn)
1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "GCNHazardRecognizer.h"
18 #include "SIDefines.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringRef.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/InlineAsm.h"
50 #include "llvm/IR/LLVMContext.h"
51 #include "llvm/MC/MCInstrDesc.h"
52 #include "llvm/Support/Casting.h"
54 #include "llvm/Support/Compiler.h"
59 #include <cassert>
60 #include <cstdint>
61 #include <iterator>
62 #include <utility>
63 
64 using namespace llvm;
65 
66 #define GET_INSTRINFO_CTOR_DTOR
67 #include "AMDGPUGenInstrInfo.inc"
68 
69 namespace llvm {
70 namespace AMDGPU {
71 #define GET_D16ImageDimIntrinsics_IMPL
72 #define GET_ImageDimIntrinsicTable_IMPL
73 #define GET_RsrcIntrinsics_IMPL
74 #include "AMDGPUGenSearchableTables.inc"
75 }
76 }
77 
78 
79 // Must be at least 4 to be able to branch over the minimum unconditional branch
80 // code. This is only for making it possible to write reasonably small tests for
81 // long branches.
82 static cl::opt<unsigned>
83 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
84  cl::desc("Restrict range of branch instructions (DEBUG)"));
85 
86 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
87   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
88  RI(ST), ST(ST) {}
89 
90 //===----------------------------------------------------------------------===//
91 // TargetInstrInfo callbacks
92 //===----------------------------------------------------------------------===//
93 
94 static unsigned getNumOperandsNoGlue(SDNode *Node) {
95  unsigned N = Node->getNumOperands();
96  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
97  --N;
98  return N;
99 }
100 
101 /// Returns true if both nodes have the same value for the given
102 /// operand \p Op, or if both nodes do not have this operand.
103 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
104  unsigned Opc0 = N0->getMachineOpcode();
105  unsigned Opc1 = N1->getMachineOpcode();
106 
107  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
108  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
109 
110  if (Op0Idx == -1 && Op1Idx == -1)
111  return true;
112 
113 
114  if ((Op0Idx == -1 && Op1Idx != -1) ||
115  (Op1Idx == -1 && Op0Idx != -1))
116  return false;
117 
118  // getNamedOperandIdx returns the index for the MachineInstr's operands,
119  // which includes the result as the first operand. We are indexing into the
120  // MachineSDNode's operands, so we need to skip the result operand to get
121  // the real index.
122  --Op0Idx;
123  --Op1Idx;
124 
125  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
126 }
127 
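The off-by-one adjustment described in nodesHaveSameOperandValue above can be checked in isolation. A minimal standalone sketch (plain C++, not LLVM code; the operand names are made up for illustration):

#include <cassert>
#include <string>
#include <vector>

// An index computed against a MachineInstr-style operand list, whose first
// operand is the result, must be shifted down by one before indexing the
// SDNode-style list, which carries no result operand.
int main() {
  std::vector<std::string> MIOperands = {"vdst", "vaddr", "offset"};
  std::vector<std::string> SDNodeOperands = {"vaddr", "offset"};
  int MIIdx = 2;          // what a getNamedOperandIdx-style lookup would return
  int SDIdx = MIIdx - 1;  // skip the result operand
  assert(MIOperands[MIIdx] == SDNodeOperands[SDIdx]);
  return 0;
}
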
128 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
129                                                     AliasAnalysis *AA) const {
130  // TODO: The generic check fails for VALU instructions that should be
131  // rematerializable due to implicit reads of exec. We really want all of the
132  // generic logic for this except for the implicit exec check.
133  switch (MI.getOpcode()) {
134  case AMDGPU::V_MOV_B32_e32:
135  case AMDGPU::V_MOV_B32_e64:
136  case AMDGPU::V_MOV_B64_PSEUDO:
137  // No implicit operands.
138  return MI.getNumOperands() == MI.getDesc().getNumOperands();
139  default:
140  return false;
141  }
142 }
143 
144 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
145                                           int64_t &Offset0,
146  int64_t &Offset1) const {
147  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
148  return false;
149 
150  unsigned Opc0 = Load0->getMachineOpcode();
151  unsigned Opc1 = Load1->getMachineOpcode();
152 
153  // Make sure both are actually loads.
154  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
155  return false;
156 
157  if (isDS(Opc0) && isDS(Opc1)) {
158 
159  // FIXME: Handle this case:
160  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
161  return false;
162 
163  // Check base reg.
164  if (Load0->getOperand(0) != Load1->getOperand(0))
165  return false;
166 
167  // Skip read2 / write2 variants for simplicity.
168  // TODO: We should report true if the used offsets are adjacent (excluding
169  // st64 versions).
170  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
171  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
172  return false;
173 
174  Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue();
175  Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue();
176  return true;
177  }
178 
179  if (isSMRD(Opc0) && isSMRD(Opc1)) {
180  // Skip time and cache invalidation instructions.
181  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
182  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
183  return false;
184 
186 
187  // Check base reg.
188  if (Load0->getOperand(0) != Load1->getOperand(0))
189  return false;
190 
191  const ConstantSDNode *Load0Offset =
192  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
193  const ConstantSDNode *Load1Offset =
194  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
195 
196  if (!Load0Offset || !Load1Offset)
197  return false;
198 
199  Offset0 = Load0Offset->getZExtValue();
200  Offset1 = Load1Offset->getZExtValue();
201  return true;
202  }
203 
204  // MUBUF and MTBUF can access the same addresses.
205  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
206 
207  // MUBUF and MTBUF have vaddr at different indices.
208  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
209  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
210  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
211  return false;
212 
213  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
214  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
215 
216  if (OffIdx0 == -1 || OffIdx1 == -1)
217  return false;
218 
219  // getNamedOperandIdx returns the index for MachineInstrs. Since they
220  // include the output in the operand list, but SDNodes don't, we need to
221  // subtract one from the index.
222  --OffIdx0;
223  --OffIdx1;
224 
225  SDValue Off0 = Load0->getOperand(OffIdx0);
226  SDValue Off1 = Load1->getOperand(OffIdx1);
227 
228  // The offset might be a FrameIndexSDNode.
229  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
230  return false;
231 
232  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
233  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
234  return true;
235  }
236 
237  return false;
238 }
239 
240 static bool isStride64(unsigned Opc) {
241  switch (Opc) {
242  case AMDGPU::DS_READ2ST64_B32:
243  case AMDGPU::DS_READ2ST64_B64:
244  case AMDGPU::DS_WRITE2ST64_B32:
245  case AMDGPU::DS_WRITE2ST64_B64:
246  return true;
247  default:
248  return false;
249  }
250 }
251 
252 bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
253                                           MachineOperand *&BaseOp,
254  int64_t &Offset,
255  const TargetRegisterInfo *TRI) const {
256  unsigned Opc = LdSt.getOpcode();
257 
258  if (isDS(LdSt)) {
259  const MachineOperand *OffsetImm =
260  getNamedOperand(LdSt, AMDGPU::OpName::offset);
261  if (OffsetImm) {
262  // Normal, single offset LDS instruction.
263  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
264  // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
265  // report that here?
266  if (!BaseOp)
267  return false;
268 
269  Offset = OffsetImm->getImm();
270  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
271  "operands of type register.");
272  return true;
273  }
274 
275  // The 2 offset instructions use offset0 and offset1 instead. We can treat
276  // these as a load with a single offset if the 2 offsets are consecutive. We
277  // will use this for some partially aligned loads.
278  const MachineOperand *Offset0Imm =
279  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
280  const MachineOperand *Offset1Imm =
281  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
282 
283  uint8_t Offset0 = Offset0Imm->getImm();
284  uint8_t Offset1 = Offset1Imm->getImm();
285 
286  if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
287  // Each of these offsets is in element sized units, so we need to convert
288  // to bytes of the individual reads.
289 
290  unsigned EltSize;
291  if (LdSt.mayLoad())
292  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
293  else {
294  assert(LdSt.mayStore());
295  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
296  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
297  }
298 
299  if (isStride64(Opc))
300  EltSize *= 64;
301 
302  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
303  Offset = EltSize * Offset0;
304  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
305  "operands of type register.");
306  return true;
307  }
308 
309  return false;
310  }
311 
312  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
313  const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
314  if (SOffset && SOffset->isReg())
315  return false;
316 
317  MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
318  if (!AddrReg)
319  return false;
320 
321  const MachineOperand *OffsetImm =
322  getNamedOperand(LdSt, AMDGPU::OpName::offset);
323  BaseOp = AddrReg;
324  Offset = OffsetImm->getImm();
325 
326  if (SOffset) // soffset can be an inline immediate.
327  Offset += SOffset->getImm();
328 
329  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
330  "operands of type register.");
331  return true;
332  }
333 
334  if (isSMRD(LdSt)) {
335  const MachineOperand *OffsetImm =
336  getNamedOperand(LdSt, AMDGPU::OpName::offset);
337  if (!OffsetImm)
338  return false;
339 
340  MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
341  BaseOp = SBaseReg;
342  Offset = OffsetImm->getImm();
343  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
344  "operands of type register.");
345  return true;
346  }
347 
348  if (isFLAT(LdSt)) {
349  MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
350  if (VAddr) {
351  // Can't analyze 2 offsets.
352  if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
353  return false;
354 
355  BaseOp = VAddr;
356  } else {
357  // scratch instructions have either vaddr or saddr.
358  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
359  }
360 
361  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
362  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
363  "operands of type register.");
364  return true;
365  }
366 
367  return false;
368 }
369 
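The element-size arithmetic in the DS (read2/write2) case above can be worked through with concrete numbers. A standalone sketch (not LLVM code; the instruction widths are assumptions chosen for illustration):

#include <cassert>

// read2/write2 offsets are in element units; getMemOperandWithOffset scales
// them to bytes, with an extra factor of 64 for the st64 variants.
int main() {
  // ds_read2_b32: the 64-bit destination register holds two 32-bit elements,
  // so the per-element size is RegSizeInBits / 16 = 4 bytes.
  unsigned EltSize = 64 / 16;
  unsigned Offset0 = 4;             // element-unit offset0 field
  assert(EltSize * Offset0 == 16);  // byte offset reported to callers

  // ds_read2st64_b64: 128-bit destination and a 64-element stride.
  EltSize = (128 / 16) * 64;        // 8 bytes * 64 = 512
  assert(EltSize * 1 == 512);
  return 0;
}
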
370 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
371  const MachineOperand &BaseOp1,
372  const MachineInstr &MI2,
373  const MachineOperand &BaseOp2) {
374  // Support only base operands with base registers.
375  // Note: this could be extended to support FI operands.
376  if (!BaseOp1.isReg() || !BaseOp2.isReg())
377  return false;
378 
379  if (BaseOp1.isIdenticalTo(BaseOp2))
380  return true;
381 
382  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
383  return false;
384 
385  auto MO1 = *MI1.memoperands_begin();
386  auto MO2 = *MI2.memoperands_begin();
387  if (MO1->getAddrSpace() != MO2->getAddrSpace())
388  return false;
389 
390  auto Base1 = MO1->getValue();
391  auto Base2 = MO2->getValue();
392  if (!Base1 || !Base2)
393  return false;
394  const MachineFunction &MF = *MI1.getParent()->getParent();
395  const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
396  Base1 = GetUnderlyingObject(Base1, DL);
397  Base2 = GetUnderlyingObject(Base2, DL);
398 
399  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
400  return false;
401 
402  return Base1 == Base2;
403 }
404 
405 bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
406                                       MachineOperand &BaseOp2,
407  unsigned NumLoads) const {
408  MachineInstr &FirstLdSt = *BaseOp1.getParent();
409  MachineInstr &SecondLdSt = *BaseOp2.getParent();
410 
411  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
412  return false;
413 
414  const MachineOperand *FirstDst = nullptr;
415  const MachineOperand *SecondDst = nullptr;
416 
417  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
418  (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
419  (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
420  const unsigned MaxGlobalLoadCluster = 6;
421  if (NumLoads > MaxGlobalLoadCluster)
422  return false;
423 
424  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
425  if (!FirstDst)
426  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
427  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
428  if (!SecondDst)
429  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
430  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
431  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
432  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
433  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
434  FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
435  SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
436  }
437 
438  if (!FirstDst || !SecondDst)
439  return false;
440 
441  // Try to limit clustering based on the total number of bytes loaded
442  // rather than the number of instructions. This is done to help reduce
443  // register pressure. The method used is somewhat inexact, though,
444  // because it assumes that all loads in the cluster will load the
445  // same number of bytes as FirstLdSt.
446 
447  // The unit of this value is bytes.
448  // FIXME: This needs finer tuning.
449  unsigned LoadClusterThreshold = 16;
450 
451  const MachineRegisterInfo &MRI =
452  FirstLdSt.getParent()->getParent()->getRegInfo();
453  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
454 
455  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
456 }
457 
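To make the byte-based clustering cap above concrete, here is a standalone sketch of the same arithmetic (the register widths and load counts are illustrative, not taken from this file):

#include <cassert>

// shouldClusterMemOps limits clustering by the estimated number of bytes
// loaded, assuming every load in the cluster is as wide as the first one.
int main() {
  const unsigned LoadClusterThreshold = 16; // bytes
  unsigned DstRegBits = 64;                 // e.g. a two-dword destination
  unsigned NumLoads = 2;
  assert(NumLoads * (DstRegBits / 8) <= LoadClusterThreshold);    // 16 bytes: cluster
  NumLoads = 3;
  assert(!(NumLoads * (DstRegBits / 8) <= LoadClusterThreshold)); // 24 bytes: too much pressure
  return 0;
}
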
458 // FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
459 // the first 16 loads will be interleaved with the stores, and the next 16 will
460 // be clustered as expected. It should really split into two batches of 16.
461 //
462 // Loads are clustered until this returns false, rather than trying to schedule
463 // groups of stores. This also means we have to deal with saying different
464 // address space loads should be clustered, and ones which might cause bank
465 // conflicts.
466 //
467 // This might be deprecated so it might not be worth that much effort to fix.
468 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
469                                           int64_t Offset0, int64_t Offset1,
470  unsigned NumLoads) const {
471  assert(Offset1 > Offset0 &&
472  "Second offset should be larger than first offset!");
473  // If we have less than 16 loads in a row, and the offsets are within 64
474  // bytes, then schedule together.
475 
476  // A cacheline is 64 bytes (for global memory).
477  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
478 }
479 
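The cacheline heuristic in shouldScheduleLoadsNear above, exercised in isolation (a sketch; the offsets are made up):

#include <cassert>

// Cluster while at most 16 loads are in a row and both offsets fall within a
// 64-byte (cacheline-sized) window.
static bool scheduleNear(long Offset0, long Offset1, unsigned NumLoads) {
  return NumLoads <= 16 && (Offset1 - Offset0) < 64;
}

int main() {
  assert(scheduleNear(0, 48, 4));    // same cacheline, few loads
  assert(!scheduleNear(0, 128, 4));  // more than a cacheline apart
  assert(!scheduleNear(0, 8, 17));   // too many loads in a row
  return 0;
}
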
480 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
481                               MachineBasicBlock::iterator MI,
482                               const DebugLoc &DL, unsigned DestReg,
483  unsigned SrcReg, bool KillSrc) {
484  MachineFunction *MF = MBB.getParent();
485  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
486  "illegal SGPR to VGPR copy",
487  DL, DS_Error);
488  LLVMContext &C = MF->getFunction().getContext();
489  C.diagnose(IllegalCopy);
490 
491  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
492  .addReg(SrcReg, getKillRegState(KillSrc));
493 }
494 
495 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
496                               MachineBasicBlock::iterator MI,
497                               const DebugLoc &DL, unsigned DestReg,
498  unsigned SrcReg, bool KillSrc) const {
499  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
500 
501  if (RC == &AMDGPU::VGPR_32RegClass) {
502  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
503  AMDGPU::SReg_32RegClass.contains(SrcReg));
504  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
505  .addReg(SrcReg, getKillRegState(KillSrc));
506  return;
507  }
508 
509  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
510  RC == &AMDGPU::SReg_32RegClass) {
511  if (SrcReg == AMDGPU::SCC) {
512  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
513  .addImm(-1)
514  .addImm(0);
515  return;
516  }
517 
518  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
519  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
520  return;
521  }
522 
523  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
524  .addReg(SrcReg, getKillRegState(KillSrc));
525  return;
526  }
527 
528  if (RC == &AMDGPU::SReg_64RegClass) {
529  if (DestReg == AMDGPU::VCC) {
530  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
531  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
532  .addReg(SrcReg, getKillRegState(KillSrc));
533  } else {
534  // FIXME: Hack until VReg_1 removed.
535  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
536  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
537  .addImm(0)
538  .addReg(SrcReg, getKillRegState(KillSrc));
539  }
540 
541  return;
542  }
543 
544  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
545  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
546  return;
547  }
548 
549  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
550  .addReg(SrcReg, getKillRegState(KillSrc));
551  return;
552  }
553 
554  if (DestReg == AMDGPU::SCC) {
555  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
556  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
557  .addReg(SrcReg, getKillRegState(KillSrc))
558  .addImm(0);
559  return;
560  }
561 
562  unsigned EltSize = 4;
563  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
564  if (RI.isSGPRClass(RC)) {
565  // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
566  if (!(RI.getRegSizeInBits(*RC) % 64)) {
567  Opcode = AMDGPU::S_MOV_B64;
568  EltSize = 8;
569  } else {
570  Opcode = AMDGPU::S_MOV_B32;
571  EltSize = 4;
572  }
573 
574  if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
575  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
576  return;
577  }
578  }
579 
580  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
581  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
582 
583  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
584  unsigned SubIdx;
585  if (Forward)
586  SubIdx = SubIndices[Idx];
587  else
588  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
589 
590  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
591  get(Opcode), RI.getSubReg(DestReg, SubIdx));
592 
593  Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
594 
595  if (Idx == 0)
596  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
597 
598  bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
599  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
600  }
601 }
602 
603 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
604  int NewOpc;
605 
606  // Try to map original to commuted opcode
607  NewOpc = AMDGPU::getCommuteRev(Opcode);
608  if (NewOpc != -1)
609  // Check if the commuted (REV) opcode exists on the target.
610  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
611 
612  // Try to map commuted to original opcode
613  NewOpc = AMDGPU::getCommuteOrig(Opcode);
614  if (NewOpc != -1)
615  // Check if the original (non-REV) opcode exists on the target.
616  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
617 
618  return Opcode;
619 }
620 
621 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
622                                        MachineBasicBlock::iterator MI,
623                                        const DebugLoc &DL, unsigned DestReg,
624  int64_t Value) const {
625   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
626   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
627  if (RegClass == &AMDGPU::SReg_32RegClass ||
628  RegClass == &AMDGPU::SGPR_32RegClass ||
629  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
630  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
631  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
632  .addImm(Value);
633  return;
634  }
635 
636  if (RegClass == &AMDGPU::SReg_64RegClass ||
637  RegClass == &AMDGPU::SGPR_64RegClass ||
638  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
639  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
640  .addImm(Value);
641  return;
642  }
643 
644  if (RegClass == &AMDGPU::VGPR_32RegClass) {
645  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
646  .addImm(Value);
647  return;
648  }
649  if (RegClass == &AMDGPU::VReg_64RegClass) {
650  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
651  .addImm(Value);
652  return;
653  }
654 
655  unsigned EltSize = 4;
656  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
657  if (RI.isSGPRClass(RegClass)) {
658  if (RI.getRegSizeInBits(*RegClass) > 32) {
659  Opcode = AMDGPU::S_MOV_B64;
660  EltSize = 8;
661  } else {
662  Opcode = AMDGPU::S_MOV_B32;
663  EltSize = 4;
664  }
665  }
666 
667  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
668  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
669  int64_t IdxValue = Idx == 0 ? Value : 0;
670 
671  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
672  get(Opcode), RI.getSubReg(DestReg, Idx));
673  Builder.addImm(IdxValue);
674  }
675 }
676 
677 const TargetRegisterClass *
678 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
679   return &AMDGPU::VGPR_32RegClass;
680 }
681 
682 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
683                                      MachineBasicBlock::iterator I,
684                                      const DebugLoc &DL, unsigned DstReg,
685                                      ArrayRef<MachineOperand> Cond,
686                                      unsigned TrueReg,
687                                      unsigned FalseReg) const {
688   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
689   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
690  "Not a VGPR32 reg");
691 
692  if (Cond.size() == 1) {
693  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
694  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
695  .add(Cond[0]);
696  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
697  .addImm(0)
698  .addReg(FalseReg)
699  .addImm(0)
700  .addReg(TrueReg)
701  .addReg(SReg);
702  } else if (Cond.size() == 2) {
703  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
704  switch (Cond[0].getImm()) {
705  case SIInstrInfo::SCC_TRUE: {
706  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
707  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
708  .addImm(-1)
709  .addImm(0);
710  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
711  .addImm(0)
712  .addReg(FalseReg)
713  .addImm(0)
714  .addReg(TrueReg)
715  .addReg(SReg);
716  break;
717  }
718  case SIInstrInfo::SCC_FALSE: {
719  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
720  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
721  .addImm(0)
722  .addImm(-1);
723  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
724  .addImm(0)
725  .addReg(FalseReg)
726  .addImm(0)
727  .addReg(TrueReg)
728  .addReg(SReg);
729  break;
730  }
731  case SIInstrInfo::VCCNZ: {
732  MachineOperand RegOp = Cond[1];
733  RegOp.setImplicit(false);
734  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
735  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
736  .add(RegOp);
737  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
738  .addImm(0)
739  .addReg(FalseReg)
740  .addImm(0)
741  .addReg(TrueReg)
742  .addReg(SReg);
743  break;
744  }
745  case SIInstrInfo::VCCZ: {
746  MachineOperand RegOp = Cond[1];
747  RegOp.setImplicit(false);
748  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
749  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
750  .add(RegOp);
751  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
752  .addImm(0)
753  .addReg(TrueReg)
754  .addImm(0)
755  .addReg(FalseReg)
756  .addReg(SReg);
757  break;
758  }
759  case SIInstrInfo::EXECNZ: {
760  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
761  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
762  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
763  .addImm(0);
764  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
765  .addImm(-1)
766  .addImm(0);
767  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
768  .addImm(0)
769  .addReg(FalseReg)
770  .addImm(0)
771  .addReg(TrueReg)
772  .addReg(SReg);
773  break;
774  }
775  case SIInstrInfo::EXECZ: {
776  unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
777  unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
778  BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
779  .addImm(0);
780  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
781  .addImm(0)
782  .addImm(-1);
783  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
784  .addImm(0)
785  .addReg(FalseReg)
786  .addImm(0)
787  .addReg(TrueReg)
788  .addReg(SReg);
789  llvm_unreachable("Unhandled branch predicate EXECZ");
790  break;
791  }
792  default:
793  llvm_unreachable("invalid branch predicate");
794  }
795  } else {
796  llvm_unreachable("Can only handle Cond size 1 or 2");
797  }
798 }
799 
800 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
801                                MachineBasicBlock::iterator I,
802                                const DebugLoc &DL,
803                                unsigned SrcReg, int Value) const {
804   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
805   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
806  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
807  .addImm(Value)
808  .addReg(SrcReg);
809 
810  return Reg;
811 }
812 
813 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
814                                MachineBasicBlock::iterator I,
815                                const DebugLoc &DL,
816                                unsigned SrcReg, int Value) const {
817   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
818   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
819  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
820  .addImm(Value)
821  .addReg(SrcReg);
822 
823  return Reg;
824 }
825 
826 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
827 
828  if (RI.getRegSizeInBits(*DstRC) == 32) {
829  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
830  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
831  return AMDGPU::S_MOV_B64;
832  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
833  return AMDGPU::V_MOV_B64_PSEUDO;
834  }
835  return AMDGPU::COPY;
836 }
837 
838 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
839  switch (Size) {
840  case 4:
841  return AMDGPU::SI_SPILL_S32_SAVE;
842  case 8:
843  return AMDGPU::SI_SPILL_S64_SAVE;
844  case 12:
845  return AMDGPU::SI_SPILL_S96_SAVE;
846  case 16:
847  return AMDGPU::SI_SPILL_S128_SAVE;
848  case 20:
849  return AMDGPU::SI_SPILL_S160_SAVE;
850  case 32:
851  return AMDGPU::SI_SPILL_S256_SAVE;
852  case 64:
853  return AMDGPU::SI_SPILL_S512_SAVE;
854  default:
855  llvm_unreachable("unknown register size");
856  }
857 }
858 
859 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
860  switch (Size) {
861  case 4:
862  return AMDGPU::SI_SPILL_V32_SAVE;
863  case 8:
864  return AMDGPU::SI_SPILL_V64_SAVE;
865  case 12:
866  return AMDGPU::SI_SPILL_V96_SAVE;
867  case 16:
868  return AMDGPU::SI_SPILL_V128_SAVE;
869  case 20:
870  return AMDGPU::SI_SPILL_V160_SAVE;
871  case 32:
872  return AMDGPU::SI_SPILL_V256_SAVE;
873  case 64:
874  return AMDGPU::SI_SPILL_V512_SAVE;
875  default:
876  llvm_unreachable("unknown register size");
877  }
878 }
879 
880 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
881                                       MachineBasicBlock::iterator MI,
882                                       unsigned SrcReg, bool isKill,
883  int FrameIndex,
884  const TargetRegisterClass *RC,
885  const TargetRegisterInfo *TRI) const {
886  MachineFunction *MF = MBB.getParent();
887   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
888   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
889  const DebugLoc &DL = MBB.findDebugLoc(MI);
890 
891  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
892  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
893  MachinePointerInfo PtrInfo
894  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
895  MachineMemOperand *MMO
896     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
897                                Size, Align);
898  unsigned SpillSize = TRI->getSpillSize(*RC);
899 
900  if (RI.isSGPRClass(RC)) {
901  MFI->setHasSpilledSGPRs();
902 
903  // We are only allowed to create one new instruction when spilling
904  // registers, so we need to use a pseudo instruction for spilling SGPRs.
905  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
906 
907  // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
908  // to make sure we are using the correct register class.
909  if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
910   MachineRegisterInfo &MRI = MF->getRegInfo();
911   MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
912  }
913 
914  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
915  .addReg(SrcReg, getKillRegState(isKill)) // data
916  .addFrameIndex(FrameIndex) // addr
917  .addMemOperand(MMO)
919  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
920  // Add the scratch resource registers as implicit uses because we may end up
921  // needing them, and need to ensure that the reserved registers are
922  // correctly handled.
923 
924  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
925  if (ST.hasScalarStores()) {
926  // m0 is used for offset to scalar stores if used to spill.
927  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
928  }
929 
930  return;
931  }
932 
933  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
934 
935  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
936  MFI->setHasSpilledVGPRs();
937  BuildMI(MBB, MI, DL, get(Opcode))
938  .addReg(SrcReg, getKillRegState(isKill)) // data
939  .addFrameIndex(FrameIndex) // addr
940  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
941  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
942  .addImm(0) // offset
943  .addMemOperand(MMO);
944 }
945 
946 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
947  switch (Size) {
948  case 4:
949  return AMDGPU::SI_SPILL_S32_RESTORE;
950  case 8:
951  return AMDGPU::SI_SPILL_S64_RESTORE;
952  case 12:
953  return AMDGPU::SI_SPILL_S96_RESTORE;
954  case 16:
955  return AMDGPU::SI_SPILL_S128_RESTORE;
956  case 20:
957  return AMDGPU::SI_SPILL_S160_RESTORE;
958  case 32:
959  return AMDGPU::SI_SPILL_S256_RESTORE;
960  case 64:
961  return AMDGPU::SI_SPILL_S512_RESTORE;
962  default:
963  llvm_unreachable("unknown register size");
964  }
965 }
966 
967 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
968  switch (Size) {
969  case 4:
970  return AMDGPU::SI_SPILL_V32_RESTORE;
971  case 8:
972  return AMDGPU::SI_SPILL_V64_RESTORE;
973  case 12:
974  return AMDGPU::SI_SPILL_V96_RESTORE;
975  case 16:
976  return AMDGPU::SI_SPILL_V128_RESTORE;
977  case 20:
978  return AMDGPU::SI_SPILL_V160_RESTORE;
979  case 32:
980  return AMDGPU::SI_SPILL_V256_RESTORE;
981  case 64:
982  return AMDGPU::SI_SPILL_V512_RESTORE;
983  default:
984  llvm_unreachable("unknown register size");
985  }
986 }
987 
988 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
989                                        MachineBasicBlock::iterator MI,
990                                        unsigned DestReg, int FrameIndex,
991  const TargetRegisterClass *RC,
992  const TargetRegisterInfo *TRI) const {
993  MachineFunction *MF = MBB.getParent();
994   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
995   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
996  const DebugLoc &DL = MBB.findDebugLoc(MI);
997  unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
998  unsigned Size = FrameInfo.getObjectSize(FrameIndex);
999  unsigned SpillSize = TRI->getSpillSize(*RC);
1000 
1001  MachinePointerInfo PtrInfo
1002  = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1003 
1004   MachineMemOperand *MMO = MF->getMachineMemOperand(
1005       PtrInfo, MachineMemOperand::MOLoad, Size, Align);
1006 
1007  if (RI.isSGPRClass(RC)) {
1008  MFI->setHasSpilledSGPRs();
1009 
1010  // FIXME: Maybe this should not include a memoperand because it will be
1011  // lowered to non-memory instructions.
1012  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1013  if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
1014   MachineRegisterInfo &MRI = MF->getRegInfo();
1015   MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
1016  }
1017 
1018  FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
1019  MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
1020  .addFrameIndex(FrameIndex) // addr
1021  .addMemOperand(MMO)
1023  .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1024 
1025  if (ST.hasScalarStores()) {
1026  // m0 is used for offset to scalar stores if used to spill.
1027  Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1028  }
1029 
1030  return;
1031  }
1032 
1033  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1034 
1035  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1036  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1037  .addFrameIndex(FrameIndex) // vaddr
1038  .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1039  .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1040  .addImm(0) // offset
1041  .addMemOperand(MMO);
1042 }
1043 
1044 /// \param @Offset Offset in bytes of the FrameIndex being spilled
1045 unsigned SIInstrInfo::calculateLDSSpillAddress(
1046     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1047  unsigned FrameOffset, unsigned Size) const {
1048  MachineFunction *MF = MBB.getParent();
1049   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1050   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1051  const DebugLoc &DL = MBB.findDebugLoc(MI);
1052  unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1053  unsigned WavefrontSize = ST.getWavefrontSize();
1054 
1055  unsigned TIDReg = MFI->getTIDReg();
1056  if (!MFI->hasCalculatedTID()) {
1057  MachineBasicBlock &Entry = MBB.getParent()->front();
1058  MachineBasicBlock::iterator Insert = Entry.front();
1059  const DebugLoc &DL = Insert->getDebugLoc();
1060 
1061  TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1062  *MF);
1063  if (TIDReg == AMDGPU::NoRegister)
1064  return TIDReg;
1065 
1067  WorkGroupSize > WavefrontSize) {
1068  unsigned TIDIGXReg
1070  unsigned TIDIGYReg
1072  unsigned TIDIGZReg
1074  unsigned InputPtrReg =
1076  for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1077  if (!Entry.isLiveIn(Reg))
1078  Entry.addLiveIn(Reg);
1079  }
1080 
1081  RS->enterBasicBlock(Entry);
1082  // FIXME: Can we scavenge an SReg_64 and access the subregs?
1083  unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1084  unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1085  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1086  .addReg(InputPtrReg)
1088  BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1089  .addReg(InputPtrReg)
1091 
1092  // NGROUPS.X * NGROUPS.Y
1093  BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1094  .addReg(STmp1)
1095  .addReg(STmp0);
1096  // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1097  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1098  .addReg(STmp1)
1099  .addReg(TIDIGXReg);
1100  // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1101  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1102  .addReg(STmp0)
1103  .addReg(TIDIGYReg)
1104  .addReg(TIDReg);
1105  // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1106  getAddNoCarry(Entry, Insert, DL, TIDReg)
1107  .addReg(TIDReg)
1108  .addReg(TIDIGZReg)
1109  .addImm(0); // clamp bit
1110  } else {
1111  // Get the wave id
1112  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1113  TIDReg)
1114  .addImm(-1)
1115  .addImm(0);
1116 
1117  BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1118  TIDReg)
1119  .addImm(-1)
1120  .addReg(TIDReg);
1121  }
1122 
1123  BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1124  TIDReg)
1125  .addImm(2)
1126  .addReg(TIDReg);
1127  MFI->setTIDReg(TIDReg);
1128  }
1129 
1130  // Add FrameIndex to LDS offset
1131  unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1132  getAddNoCarry(MBB, MI, DL, TmpReg)
1133  .addImm(LDSOffset)
1134  .addReg(TIDReg)
1135  .addImm(0); // clamp bit
1136 
1137  return TmpReg;
1138 }
1139 
1140 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1141                                    MachineBasicBlock::iterator MI,
1142                                    int Count) const {
1143  DebugLoc DL = MBB.findDebugLoc(MI);
1144  while (Count > 0) {
1145  int Arg;
1146  if (Count >= 8)
1147  Arg = 7;
1148  else
1149  Arg = Count - 1;
1150  Count -= 8;
1151  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1152  .addImm(Arg);
1153  }
1154 }
1155 
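Because a single s_nop only encodes 1 to 8 wait states (immediate values 0 to 7), larger counts become a chain of nops. A standalone sketch of the loop in insertWaitStates above (not LLVM code):

#include <cassert>
#include <vector>

// Returns the s_nop immediates that the loop above would emit: each nop
// covers Imm + 1 wait states, so 10 wait states become "s_nop 7, s_nop 1".
static std::vector<int> nopImmediates(int Count) {
  std::vector<int> Imms;
  while (Count > 0) {
    Imms.push_back(Count >= 8 ? 7 : Count - 1);
    Count -= 8;
  }
  return Imms;
}

int main() {
  assert((nopImmediates(10) == std::vector<int>{7, 1}));
  assert((nopImmediates(3) == std::vector<int>{2}));
  return 0;
}
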
1156 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1157                              MachineBasicBlock::iterator MI) const {
1158   insertWaitStates(MBB, MI, 1);
1159 }
1160 
1161 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1162   auto MF = MBB.getParent();
1163   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1164 
1165  assert(Info->isEntryFunction());
1166 
1167  if (MBB.succ_empty()) {
1168  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1169  if (HasNoTerminator) {
1170  if (Info->returnsVoid()) {
1171  BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1172  } else {
1173  BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1174  }
1175  }
1176  }
1177 }
1178 
1179 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1180   switch (MI.getOpcode()) {
1181  default: return 1; // FIXME: Do wait states equal cycles?
1182 
1183  case AMDGPU::S_NOP:
1184  return MI.getOperand(0).getImm() + 1;
1185  }
1186 }
1187 
1188 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1189   MachineBasicBlock &MBB = *MI.getParent();
1190  DebugLoc DL = MBB.findDebugLoc(MI);
1191  switch (MI.getOpcode()) {
1192  default: return TargetInstrInfo::expandPostRAPseudo(MI);
1193  case AMDGPU::S_MOV_B64_term:
1194  // This is only a terminator to get the correct spill code placement during
1195  // register allocation.
1196  MI.setDesc(get(AMDGPU::S_MOV_B64));
1197  break;
1198 
1199  case AMDGPU::S_XOR_B64_term:
1200  // This is only a terminator to get the correct spill code placement during
1201  // register allocation.
1202  MI.setDesc(get(AMDGPU::S_XOR_B64));
1203  break;
1204 
1205  case AMDGPU::S_ANDN2_B64_term:
1206  // This is only a terminator to get the correct spill code placement during
1207  // register allocation.
1208  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1209  break;
1210 
1211  case AMDGPU::V_MOV_B64_PSEUDO: {
1212  unsigned Dst = MI.getOperand(0).getReg();
1213  unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1214  unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1215 
1216  const MachineOperand &SrcOp = MI.getOperand(1);
1217  // FIXME: Will this work for 64-bit floating point immediates?
1218  assert(!SrcOp.isFPImm());
1219  if (SrcOp.isImm()) {
1220  APInt Imm(64, SrcOp.getImm());
1221  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1222  .addImm(Imm.getLoBits(32).getZExtValue())
1223  .addReg(Dst, RegState::Implicit | RegState::Define);
1224  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1225  .addImm(Imm.getHiBits(32).getZExtValue())
1226  .addReg(Dst, RegState::Implicit | RegState::Define);
1227  } else {
1228  assert(SrcOp.isReg());
1229  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1230   .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1231   .addReg(Dst, RegState::Implicit | RegState::Define);
1232   BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1233   .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1234   .addReg(Dst, RegState::Implicit | RegState::Define);
1235   }
1236  MI.eraseFromParent();
1237  break;
1238  }
1239  case AMDGPU::V_SET_INACTIVE_B32: {
1240  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1241  .addReg(AMDGPU::EXEC);
1242  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1243  .add(MI.getOperand(2));
1244  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1245  .addReg(AMDGPU::EXEC);
1246  MI.eraseFromParent();
1247  break;
1248  }
1249  case AMDGPU::V_SET_INACTIVE_B64: {
1250  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1251  .addReg(AMDGPU::EXEC);
1252  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1253  MI.getOperand(0).getReg())
1254  .add(MI.getOperand(2));
1255  expandPostRAPseudo(*Copy);
1256  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1257  .addReg(AMDGPU::EXEC);
1258  MI.eraseFromParent();
1259  break;
1260  }
1261  case AMDGPU::V_MOVRELD_B32_V1:
1262  case AMDGPU::V_MOVRELD_B32_V2:
1263  case AMDGPU::V_MOVRELD_B32_V4:
1264  case AMDGPU::V_MOVRELD_B32_V8:
1265  case AMDGPU::V_MOVRELD_B32_V16: {
1266  const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1267  unsigned VecReg = MI.getOperand(0).getReg();
1268  bool IsUndef = MI.getOperand(1).isUndef();
1269  unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1270  assert(VecReg == MI.getOperand(1).getReg());
1271 
1272  MachineInstr *MovRel =
1273  BuildMI(MBB, MI, DL, MovRelDesc)
1274  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1275  .add(MI.getOperand(2))
1276  .addReg(VecReg, RegState::ImplicitDefine)
1277  .addReg(VecReg,
1278  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1279 
1280  const int ImpDefIdx =
1281  MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1282  const int ImpUseIdx = ImpDefIdx + 1;
1283  MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1284 
1285  MI.eraseFromParent();
1286  break;
1287  }
1288  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1289  MachineFunction &MF = *MBB.getParent();
1290  unsigned Reg = MI.getOperand(0).getReg();
1291  unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1292  unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1293 
1294  // Create a bundle so these instructions won't be re-ordered by the
1295  // post-RA scheduler.
1296  MIBundleBuilder Bundler(MBB, MI);
1297  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1298 
1299  // Add 32-bit offset from this instruction to the start of the
1300  // constant data.
1301  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1302  .addReg(RegLo)
1303  .add(MI.getOperand(1)));
1304 
1305  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1306  .addReg(RegHi);
1307  if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1308  MIB.addImm(0);
1309  else
1310  MIB.add(MI.getOperand(2));
1311 
1312  Bundler.append(MIB);
1313  finalizeBundle(MBB, Bundler.begin());
1314 
1315  MI.eraseFromParent();
1316  break;
1317  }
1318  case AMDGPU::EXIT_WWM: {
1319  // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1320  // is exited.
1321  MI.setDesc(get(AMDGPU::S_MOV_B64));
1322  break;
1323  }
1324  case TargetOpcode::BUNDLE: {
1325  if (!MI.mayLoad())
1326  return false;
1327 
1328  // If it is a load, it must be a memory clause.
1329  for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1330       I->isBundledWithSucc(); ++I) {
1331  I->unbundleFromSucc();
1332  for (MachineOperand &MO : I->operands())
1333  if (MO.isReg())
1334  MO.setIsInternalRead(false);
1335  }
1336 
1337  MI.eraseFromParent();
1338  break;
1339  }
1340  }
1341  return true;
1342 }
1343 
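The V_MOV_B64_PSEUDO case above materializes a 64-bit immediate as two 32-bit moves; the low/high split itself can be checked in isolation (a standalone sketch with an arbitrary constant):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Imm = 0x123456789ABCDEF0ULL;
  uint32_t Lo = static_cast<uint32_t>(Imm);        // value for the sub0 v_mov_b32
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);  // value for the sub1 v_mov_b32
  assert(Lo == 0x9ABCDEF0u && Hi == 0x12345678u);
  assert((static_cast<uint64_t>(Hi) << 32 | Lo) == Imm);
  return 0;
}
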
1344 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1345                                       MachineOperand &Src0,
1346  unsigned Src0OpName,
1347  MachineOperand &Src1,
1348  unsigned Src1OpName) const {
1349  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1350  if (!Src0Mods)
1351  return false;
1352 
1353  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1354  assert(Src1Mods &&
1355  "All commutable instructions have both src0 and src1 modifiers");
1356 
1357  int Src0ModsVal = Src0Mods->getImm();
1358  int Src1ModsVal = Src1Mods->getImm();
1359 
1360  Src1Mods->setImm(Src0ModsVal);
1361  Src0Mods->setImm(Src1ModsVal);
1362  return true;
1363 }
1364 
1365 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1366                                              MachineOperand &RegOp,
1367  MachineOperand &NonRegOp) {
1368  unsigned Reg = RegOp.getReg();
1369  unsigned SubReg = RegOp.getSubReg();
1370  bool IsKill = RegOp.isKill();
1371  bool IsDead = RegOp.isDead();
1372  bool IsUndef = RegOp.isUndef();
1373  bool IsDebug = RegOp.isDebug();
1374 
1375  if (NonRegOp.isImm())
1376  RegOp.ChangeToImmediate(NonRegOp.getImm());
1377  else if (NonRegOp.isFI())
1378  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1379  else
1380  return nullptr;
1381 
1382  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1383  NonRegOp.setSubReg(SubReg);
1384 
1385  return &MI;
1386 }
1387 
1388 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1389                                                   unsigned Src0Idx,
1390  unsigned Src1Idx) const {
1391  assert(!NewMI && "this should never be used");
1392 
1393  unsigned Opc = MI.getOpcode();
1394  int CommutedOpcode = commuteOpcode(Opc);
1395  if (CommutedOpcode == -1)
1396  return nullptr;
1397 
1398  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1399  static_cast<int>(Src0Idx) &&
1400  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1401  static_cast<int>(Src1Idx) &&
1402  "inconsistency with findCommutedOpIndices");
1403 
1404  MachineOperand &Src0 = MI.getOperand(Src0Idx);
1405  MachineOperand &Src1 = MI.getOperand(Src1Idx);
1406 
1407  MachineInstr *CommutedMI = nullptr;
1408  if (Src0.isReg() && Src1.isReg()) {
1409  if (isOperandLegal(MI, Src1Idx, &Src0)) {
1410  // Be sure to copy the source modifiers to the right place.
1411  CommutedMI
1412  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1413  }
1414 
1415  } else if (Src0.isReg() && !Src1.isReg()) {
1416  // src0 should always be able to support any operand type, so no need to
1417  // check operand legality.
1418  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1419  } else if (!Src0.isReg() && Src1.isReg()) {
1420  if (isOperandLegal(MI, Src1Idx, &Src0))
1421  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1422  } else {
1423  // FIXME: Found two non registers to commute. This does happen.
1424  return nullptr;
1425  }
1426 
1427  if (CommutedMI) {
1428  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1429  Src1, AMDGPU::OpName::src1_modifiers);
1430 
1431  CommutedMI->setDesc(get(CommutedOpcode));
1432  }
1433 
1434  return CommutedMI;
1435 }
1436 
1437 // This needs to be implemented because the source modifiers may be inserted
1438 // between the true commutable operands, and the base
1439 // TargetInstrInfo::commuteInstruction uses it.
1440 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1441                                         unsigned &SrcOpIdx1) const {
1442  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
1443 }
1444 
1445 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
1446  unsigned &SrcOpIdx1) const {
1447  if (!Desc.isCommutable())
1448  return false;
1449 
1450  unsigned Opc = Desc.getOpcode();
1451  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1452  if (Src0Idx == -1)
1453  return false;
1454 
1455  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1456  if (Src1Idx == -1)
1457  return false;
1458 
1459  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1460 }
1461 
1462 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1463  int64_t BrOffset) const {
1464  // BranchRelaxation should never have to check s_setpc_b64 because its dest
1465  // block is unanalyzable.
1466  assert(BranchOp != AMDGPU::S_SETPC_B64);
1467 
1468  // Convert to dwords.
1469  BrOffset /= 4;
1470 
1471  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1472  // from the next instruction.
1473  BrOffset -= 1;
1474 
1475  return isIntN(BranchOffsetBits, BrOffset);
1476 }
1477 
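A standalone rendering of the range check in isBranchOffsetInRange above, using the default 16-bit SIMM16 field (the byte offsets are illustrative):

#include <cassert>
#include <cstdint>

// The hardware computes PC += signext(SIMM16) * 4 + 4, so a byte offset is
// converted to dwords and measured from the instruction after the branch
// before checking that it fits in the signed 16-bit field.
static bool branchOffsetInRange(int64_t BrOffsetBytes, unsigned OffsetBits = 16) {
  int64_t Dwords = BrOffsetBytes / 4 - 1;
  int64_t Limit = int64_t(1) << (OffsetBits - 1);
  return Dwords >= -Limit && Dwords < Limit;
}

int main() {
  assert(branchOffsetInRange(128));                // a short forward branch
  assert(!branchOffsetInRange(int64_t(1) << 20));  // ~1 MiB is out of range
  return 0;
}
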
1478 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1479     const MachineInstr &MI) const {
1480  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1481  // This would be a difficult analysis to perform, but can always be legal so
1482  // there's no need to analyze it.
1483  return nullptr;
1484  }
1485 
1486  return MI.getOperand(0).getMBB();
1487 }
1488 
1489 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1490                                            MachineBasicBlock &DestBB,
1491  const DebugLoc &DL,
1492  int64_t BrOffset,
1493  RegScavenger *RS) const {
1494  assert(RS && "RegScavenger required for long branching");
1495  assert(MBB.empty() &&
1496  "new block should be inserted for expanding unconditional branch");
1497  assert(MBB.pred_size() == 1);
1498 
1499  MachineFunction *MF = MBB.getParent();
1500  MachineRegisterInfo &MRI = MF->getRegInfo();
1501 
1502  // FIXME: Virtual register workaround for RegScavenger not working with empty
1503  // blocks.
1504  unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1505 
1506  auto I = MBB.end();
1507 
1508  // We need to compute the offset relative to the instruction immediately after
1509  // s_getpc_b64. Insert pc arithmetic code before last terminator.
1510  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1511 
1512  // TODO: Handle > 32-bit block address.
1513  if (BrOffset >= 0) {
1514  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1515  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1516  .addReg(PCReg, 0, AMDGPU::sub0)
1518  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1519  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1520  .addReg(PCReg, 0, AMDGPU::sub1)
1521  .addImm(0);
1522  } else {
1523  // Backwards branch.
1524  BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1525  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1526  .addReg(PCReg, 0, AMDGPU::sub0)
1528  BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1529  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1530  .addReg(PCReg, 0, AMDGPU::sub1)
1531  .addImm(0);
1532  }
1533 
1534  // Insert the indirect branch after the other terminator.
1535  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1536  .addReg(PCReg);
1537 
1538  // FIXME: If spilling is necessary, this will fail because this scavenger has
1539  // no emergency stack slots. It is non-trivial to spill in this situation,
1540  // because the restore code needs to be specially placed after the
1541  // jump. BranchRelaxation then needs to be made aware of the newly inserted
1542  // block.
1543  //
1544  // If a spill is needed for the pc register pair, we need to insert a spill
1545  // restore block right before the destination block, and insert a short branch
1546  // into the old destination block's fallthrough predecessor.
1547  // e.g.:
1548  //
1549  // s_cbranch_scc0 skip_long_branch:
1550  //
1551  // long_branch_bb:
1552  // spill s[8:9]
1553  // s_getpc_b64 s[8:9]
1554  // s_add_u32 s8, s8, restore_bb
1555  // s_addc_u32 s9, s9, 0
1556  // s_setpc_b64 s[8:9]
1557  //
1558  // skip_long_branch:
1559  // foo;
1560  //
1561  // .....
1562  //
1563  // dest_bb_fallthrough_predecessor:
1564  // bar;
1565  // s_branch dest_bb
1566  //
1567  // restore_bb:
1568  // restore s[8:9]
1569  // fallthrough dest_bb
1570  ///
1571  // dest_bb:
1572  // buzz;
1573 
1574  RS->enterBasicBlockEnd(MBB);
1575  unsigned Scav = RS->scavengeRegisterBackwards(
1576  AMDGPU::SReg_64RegClass,
1577  MachineBasicBlock::iterator(GetPC), false, 0);
1578  MRI.replaceRegWith(PCReg, Scav);
1579  MRI.clearVirtRegs();
1580  RS->setRegUsed(Scav);
1581 
1582  return 4 + 8 + 4 + 4;
1583 }
1584 
1585 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1586  switch (Cond) {
1587  case SIInstrInfo::SCC_TRUE:
1588  return AMDGPU::S_CBRANCH_SCC1;
1589  case SIInstrInfo::SCC_FALSE:
1590  return AMDGPU::S_CBRANCH_SCC0;
1591  case SIInstrInfo::VCCNZ:
1592  return AMDGPU::S_CBRANCH_VCCNZ;
1593  case SIInstrInfo::VCCZ:
1594  return AMDGPU::S_CBRANCH_VCCZ;
1595  case SIInstrInfo::EXECNZ:
1596  return AMDGPU::S_CBRANCH_EXECNZ;
1597  case SIInstrInfo::EXECZ:
1598  return AMDGPU::S_CBRANCH_EXECZ;
1599  default:
1600  llvm_unreachable("invalid branch predicate");
1601  }
1602 }
1603 
1604 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1605  switch (Opcode) {
1606  case AMDGPU::S_CBRANCH_SCC0:
1607  return SCC_FALSE;
1608  case AMDGPU::S_CBRANCH_SCC1:
1609  return SCC_TRUE;
1610  case AMDGPU::S_CBRANCH_VCCNZ:
1611  return VCCNZ;
1612  case AMDGPU::S_CBRANCH_VCCZ:
1613  return VCCZ;
1614  case AMDGPU::S_CBRANCH_EXECNZ:
1615  return EXECNZ;
1616  case AMDGPU::S_CBRANCH_EXECZ:
1617  return EXECZ;
1618  default:
1619  return INVALID_BR;
1620  }
1621 }
1622 
1623 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1624                                     MachineBasicBlock::iterator I,
1625                                     MachineBasicBlock *&TBB,
1626                                     MachineBasicBlock *&FBB,
1627                                     SmallVectorImpl<MachineOperand> &Cond,
1628                                     bool AllowModify) const {
1629  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1630  // Unconditional Branch
1631  TBB = I->getOperand(0).getMBB();
1632  return false;
1633  }
1634 
1635  MachineBasicBlock *CondBB = nullptr;
1636 
1637  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1638  CondBB = I->getOperand(1).getMBB();
1639  Cond.push_back(I->getOperand(0));
1640  } else {
1641  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1642  if (Pred == INVALID_BR)
1643  return true;
1644 
1645  CondBB = I->getOperand(0).getMBB();
1646  Cond.push_back(MachineOperand::CreateImm(Pred));
1647  Cond.push_back(I->getOperand(1)); // Save the branch register.
1648  }
1649  ++I;
1650 
1651  if (I == MBB.end()) {
1652  // Conditional branch followed by fall-through.
1653  TBB = CondBB;
1654  return false;
1655  }
1656 
1657  if (I->getOpcode() == AMDGPU::S_BRANCH) {
1658  TBB = CondBB;
1659  FBB = I->getOperand(0).getMBB();
1660  return false;
1661  }
1662 
1663  return true;
1664 }
1665 
1666 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1667                                 MachineBasicBlock *&FBB,
1668                                 SmallVectorImpl<MachineOperand> &Cond,
1669                                 bool AllowModify) const {
1670   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1671   auto E = MBB.end();
1672  if (I == E)
1673  return false;
1674 
1675  // Skip over the instructions that are artificially terminators for special
1676  // exec management.
1677  while (I != E && !I->isBranch() && !I->isReturn() &&
1678  I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
1679  switch (I->getOpcode()) {
1680  case AMDGPU::SI_MASK_BRANCH:
1681  case AMDGPU::S_MOV_B64_term:
1682  case AMDGPU::S_XOR_B64_term:
1683  case AMDGPU::S_ANDN2_B64_term:
1684  break;
1685  case AMDGPU::SI_IF:
1686  case AMDGPU::SI_ELSE:
1687  case AMDGPU::SI_KILL_I1_TERMINATOR:
1688  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1689  // FIXME: It's messy that these need to be considered here at all.
1690  return true;
1691  default:
1692  llvm_unreachable("unexpected non-branch terminator inst");
1693  }
1694 
1695  ++I;
1696  }
1697 
1698  if (I == E)
1699  return false;
1700 
1701  if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1702  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1703 
1704  ++I;
1705 
1706  // TODO: Should be able to treat as fallthrough?
1707  if (I == MBB.end())
1708  return true;
1709 
1710  if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1711  return true;
1712 
1713  MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1714 
1715  // Specifically handle the case where the conditional branch is to the same
1716  // destination as the mask branch. e.g.
1717  //
1718  // si_mask_branch BB8
1719  // s_cbranch_execz BB8
1720  // s_cbranch BB9
1721  //
1722  // This is required to understand divergent loops which may need the branches
1723  // to be relaxed.
1724  if (TBB != MaskBrDest || Cond.empty())
1725  return true;
1726 
1727  auto Pred = Cond[0].getImm();
1728  return (Pred != EXECZ && Pred != EXECNZ);
1729 }
1730 
1731 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1732                                    int *BytesRemoved) const {
1733   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1734 
1735  unsigned Count = 0;
1736  unsigned RemovedSize = 0;
1737  while (I != MBB.end()) {
1738  MachineBasicBlock::iterator Next = std::next(I);
1739  if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1740  I = Next;
1741  continue;
1742  }
1743 
1744  RemovedSize += getInstSizeInBytes(*I);
1745  I->eraseFromParent();
1746  ++Count;
1747  I = Next;
1748  }
1749 
1750  if (BytesRemoved)
1751  *BytesRemoved = RemovedSize;
1752 
1753  return Count;
1754 }
1755 
1756 // Copy the flags onto the implicit condition register operand.
1757 static void preserveCondRegFlags(MachineOperand &CondReg,
1758                                  const MachineOperand &OrigCond) {
1759  CondReg.setIsUndef(OrigCond.isUndef());
1760  CondReg.setIsKill(OrigCond.isKill());
1761 }
1762 
1763 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1764                                    MachineBasicBlock *TBB,
1765                                    MachineBasicBlock *FBB,
1766                                    ArrayRef<MachineOperand> Cond,
1767                                    const DebugLoc &DL,
1768  int *BytesAdded) const {
1769  if (!FBB && Cond.empty()) {
1770  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1771  .addMBB(TBB);
1772  if (BytesAdded)
1773  *BytesAdded = 4;
1774  return 1;
1775  }
1776 
1777  if(Cond.size() == 1 && Cond[0].isReg()) {
1778  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
1779  .add(Cond[0])
1780  .addMBB(TBB);
1781  return 1;
1782  }
1783 
1784  assert(TBB && Cond[0].isImm());
1785 
1786  unsigned Opcode
1787  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1788 
1789  if (!FBB) {
1791  MachineInstr *CondBr =
1792  BuildMI(&MBB, DL, get(Opcode))
1793  .addMBB(TBB);
1794 
1795  // Copy the flags onto the implicit condition register operand.
1796  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1797 
1798  if (BytesAdded)
1799  *BytesAdded = 4;
1800  return 1;
1801  }
1802 
1803  assert(TBB && FBB);
1804 
1805  MachineInstr *CondBr =
1806  BuildMI(&MBB, DL, get(Opcode))
1807  .addMBB(TBB);
1808  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1809  .addMBB(FBB);
1810 
1811  MachineOperand &CondReg = CondBr->getOperand(1);
1812  CondReg.setIsUndef(Cond[1].isUndef());
1813  CondReg.setIsKill(Cond[1].isKill());
1814 
1815  if (BytesAdded)
1816  *BytesAdded = 8;
1817 
1818  return 2;
1819 }
1820 
1821 bool SIInstrInfo::reverseBranchCondition(
1822     SmallVectorImpl<MachineOperand> &Cond) const {
1823  if (Cond.size() != 2) {
1824  return true;
1825  }
1826 
1827  if (Cond[0].isImm()) {
1828  Cond[0].setImm(-Cond[0].getImm());
1829  return false;
1830  }
1831 
1832  return true;
1833 }
1834 
1835 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1836                                   ArrayRef<MachineOperand> Cond,
1837                                   unsigned TrueReg, unsigned FalseReg,
1838  int &CondCycles,
1839  int &TrueCycles, int &FalseCycles) const {
1840  switch (Cond[0].getImm()) {
1841  case VCCNZ:
1842  case VCCZ: {
1843  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1844  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1845  assert(MRI.getRegClass(FalseReg) == RC);
1846 
1847  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1848  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1849 
1850  // Limit to equal cost for branch vs. N v_cndmask_b32s.
1851  return !RI.isSGPRClass(RC) && NumInsts <= 6;
1852  }
1853  case SCC_TRUE:
1854  case SCC_FALSE: {
1855  // FIXME: We could insert for VGPRs if we could replace the original compare
1856  // with a vector one.
1857  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1858  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1859  assert(MRI.getRegClass(FalseReg) == RC);
1860 
1861  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1862 
1863  // An even number of 32-bit halves (a multiple of 64 bits) can use s_cselect_b64.
1864  if (NumInsts % 2 == 0)
1865  NumInsts /= 2;
1866 
1867  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1868  return RI.isSGPRClass(RC);
1869  }
1870  default:
1871  return false;
1872  }
1873 }
1874 
1875 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1876  MachineBasicBlock::iterator I, const DebugLoc &DL,
1877  unsigned DstReg, ArrayRef<MachineOperand> Cond,
1878  unsigned TrueReg, unsigned FalseReg) const {
1879  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1880  if (Pred == VCCZ || Pred == SCC_FALSE) {
1881  Pred = static_cast<BranchPredicate>(-Pred);
1882  std::swap(TrueReg, FalseReg);
1883  }
1884 
1885  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1886  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1887  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1888 
1889  if (DstSize == 32) {
1890  unsigned SelOp = Pred == SCC_TRUE ?
1891  AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1892 
1893  // Instruction's operands are backwards from what is expected.
1894  MachineInstr *Select =
1895  BuildMI(MBB, I, DL, get(SelOp), DstReg)
1896  .addReg(FalseReg)
1897  .addReg(TrueReg);
1898 
1899  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1900  return;
1901  }
1902 
1903  if (DstSize == 64 && Pred == SCC_TRUE) {
1904  MachineInstr *Select =
1905  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1906  .addReg(FalseReg)
1907  .addReg(TrueReg);
1908 
1909  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1910  return;
1911  }
1912 
1913  static const int16_t Sub0_15[] = {
1914  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1915  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1916  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1917  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1918  };
1919 
1920  static const int16_t Sub0_15_64[] = {
1921  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1922  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1923  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1924  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1925  };
1926 
1927  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1928  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1929  const int16_t *SubIndices = Sub0_15;
1930  int NElts = DstSize / 32;
1931 
1932  // 64-bit select is only available for SALU.
1933  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
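  // For example, a 128-bit select currently expands to four v_cndmask_b32 for
  // VGPRs, but only two s_cselect_b64 when the condition is SCC.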
1934  if (Pred == SCC_TRUE) {
1935  if (NElts % 2) {
1936  SelOp = AMDGPU::S_CSELECT_B32;
1937  EltRC = &AMDGPU::SGPR_32RegClass;
1938  } else {
1939  SelOp = AMDGPU::S_CSELECT_B64;
1940  EltRC = &AMDGPU::SGPR_64RegClass;
1941  SubIndices = Sub0_15_64;
1942  NElts /= 2;
1943  }
1944  }
1945 
1946  MachineInstrBuilder MIB = BuildMI(
1947  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1948 
1949  I = MIB->getIterator();
1950 
1951  SmallVector<unsigned, 8> Regs;
1952  for (int Idx = 0; Idx != NElts; ++Idx) {
1953  unsigned DstElt = MRI.createVirtualRegister(EltRC);
1954  Regs.push_back(DstElt);
1955 
1956  unsigned SubIdx = SubIndices[Idx];
1957 
1958  MachineInstr *Select =
1959  BuildMI(MBB, I, DL, get(SelOp), DstElt)
1960  .addReg(FalseReg, 0, SubIdx)
1961  .addReg(TrueReg, 0, SubIdx);
1962  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1963 
1964  MIB.addReg(DstElt)
1965  .addImm(SubIdx);
1966  }
1967 }
1968 
1969 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1970  switch (MI.getOpcode()) {
1971  case AMDGPU::V_MOV_B32_e32:
1972  case AMDGPU::V_MOV_B32_e64:
1973  case AMDGPU::V_MOV_B64_PSEUDO: {
1974  // If there are additional implicit register operands, this may be used for
1975  // register indexing so the source register operand isn't simply copied.
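  // For example, a V_MOV_B32 used for indirect indexing carries an extra
  // implicit use (M0 or the GPR index mode), so it is not a plain copy.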
1976  unsigned NumOps = MI.getDesc().getNumOperands() +
1977  MI.getDesc().getNumImplicitUses();
1978 
1979  return MI.getNumOperands() == NumOps;
1980  }
1981  case AMDGPU::S_MOV_B32:
1982  case AMDGPU::S_MOV_B64:
1983  case AMDGPU::COPY:
1984  return true;
1985  default:
1986  return false;
1987  }
1988 }
1989 
1990 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
1991  unsigned Kind) const {
1992  switch(Kind) {
2003  }
2004  return AMDGPUAS::FLAT_ADDRESS;
2005 }
2006 
2007 void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
2008  unsigned Opc = MI.getOpcode();
2009  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2010  AMDGPU::OpName::src0_modifiers);
2011  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2012  AMDGPU::OpName::src1_modifiers);
2013  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2014  AMDGPU::OpName::src2_modifiers);
2015 
2016  MI.RemoveOperand(Src2ModIdx);
2017  MI.RemoveOperand(Src1ModIdx);
2018  MI.RemoveOperand(Src0ModIdx);
2019 }
2020 
2021 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2022  unsigned Reg, MachineRegisterInfo *MRI) const {
2023  if (!MRI->hasOneNonDBGUse(Reg))
2024  return false;
2025 
2026  switch (DefMI.getOpcode()) {
2027  default:
2028  return false;
2029  case AMDGPU::S_MOV_B64:
2030  // TODO: We could fold 64-bit immediates, but this gets complicated
2031  // when there are sub-registers.
2032  return false;
2033 
2034  case AMDGPU::V_MOV_B32_e32:
2035  case AMDGPU::S_MOV_B32:
2036  break;
2037  }
2038 
2039  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2040  assert(ImmOp);
2041  // FIXME: We could handle FrameIndex values here.
2042  if (!ImmOp->isImm())
2043  return false;
2044 
2045  unsigned Opc = UseMI.getOpcode();
2046  if (Opc == AMDGPU::COPY) {
2047  bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
2048  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2049  UseMI.setDesc(get(NewOpc));
2050  UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2051  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2052  return true;
2053  }
2054 
2055  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2056  Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2057  // Don't fold if we are using source or output modifiers. The new VOP2
2058  // instructions don't have them.
2059  if (hasAnyModifiersSet(UseMI))
2060  return false;
2061 
2062  // If this is a free constant, there's no reason to do this.
2063  // TODO: We could fold this here instead of letting SIFoldOperands do it
2064  // later.
2065  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2066 
2067  // Any src operand can be used for the legality check.
2068  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2069  return false;
2070 
2071  bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2072  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2073  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2074 
2075  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2076  // We should only expect these to be on src0 due to canonicalizations.
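  // Illustrative example, with K the immediate defined by DefMI:
  //   %k = v_mov_b32 K
  //   v_mad_f32 %d, %k, %a, %b      ; %d = K * %a + %b
  // becomes
  //   v_madmk_f32 %d, %a, K, %b     ; %d = %a * K + %b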
2077  if (Src0->isReg() && Src0->getReg() == Reg) {
2078  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2079  return false;
2080 
2081  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2082  return false;
2083 
2084  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2085 
2086  const int64_t Imm = ImmOp->getImm();
2087 
2088  // FIXME: This would be a lot easier if we could return a new instruction
2089  // instead of having to modify in place.
2090 
2091  // Remove these first since they are at the end.
2092  UseMI.RemoveOperand(
2093  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2094  UseMI.RemoveOperand(
2095  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2096 
2097  unsigned Src1Reg = Src1->getReg();
2098  unsigned Src1SubReg = Src1->getSubReg();
2099  Src0->setReg(Src1Reg);
2100  Src0->setSubReg(Src1SubReg);
2101  Src0->setIsKill(Src1->isKill());
2102 
2103  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2104  Opc == AMDGPU::V_MAC_F16_e64)
2105  UseMI.untieRegOperand(
2106  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2107 
2108  Src1->ChangeToImmediate(Imm);
2109 
2110  removeModOperands(UseMI);
2111  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2112 
2113  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2114  if (DeleteDef)
2115  DefMI.eraseFromParent();
2116 
2117  return true;
2118  }
2119 
2120  // Added part is the constant: Use v_madak_{f16, f32}.
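  // Illustrative example, with K the immediate defined by DefMI:
  //   %k = v_mov_b32 K
  //   v_mad_f32 %d, %a, %b, %k      ; %d = %a * %b + K
  // becomes
  //   v_madak_f32 %d, %a, %b, K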
2121  if (Src2->isReg() && Src2->getReg() == Reg) {
2122  // Not allowed to use constant bus for another operand.
2123  // We can however allow an inline immediate as src0.
2124  bool Src0Inlined = false;
2125  if (Src0->isReg()) {
2126  // Try to inline constant if possible.
2127  // If the Def moves immediate and the use is single
2128  // We are saving VGPR here.
2129  MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2130  if (Def && Def->isMoveImmediate() &&
2131  isInlineConstant(Def->getOperand(1)) &&
2132  MRI->hasOneUse(Src0->getReg())) {
2133  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2134  Src0Inlined = true;
2135  } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
2136  RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
2137  (RI.isVirtualRegister(Src0->getReg()) &&
2138  RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
2139  return false;
2140  // VGPR is okay as Src0 - fallthrough
2141  }
2142 
2143  if (Src1->isReg() && !Src0Inlined ) {
2144  // We have one slot for inlinable constant so far - try to fill it
2145  MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2146  if (Def && Def->isMoveImmediate() &&
2147  isInlineConstant(Def->getOperand(1)) &&
2148  MRI->hasOneUse(Src1->getReg()) &&
2149  commuteInstruction(UseMI)) {
2150  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2151  } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
2152  RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2153  (RI.isVirtualRegister(Src1->getReg()) &&
2154  RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2155  return false;
2156  // VGPR is okay as Src1 - fallthrough
2157  }
2158 
2159  const int64_t Imm = ImmOp->getImm();
2160 
2161  // FIXME: This would be a lot easier if we could return a new instruction
2162  // instead of having to modify in place.
2163 
2164  // Remove these first since they are at the end.
2165  UseMI.RemoveOperand(
2166  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2167  UseMI.RemoveOperand(
2168  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2169 
2170  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2171  Opc == AMDGPU::V_MAC_F16_e64)
2172  UseMI.untieRegOperand(
2173  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2174 
2175  // ChangingToImmediate adds Src2 back to the instruction.
2176  Src2->ChangeToImmediate(Imm);
2177 
2178  // These come before src2.
2179  removeModOperands(UseMI);
2180  UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2181 
2182  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2183  if (DeleteDef)
2184  DefMI.eraseFromParent();
2185 
2186  return true;
2187  }
2188  }
2189 
2190  return false;
2191 }
2192 
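// Returns true if [OffsetA, OffsetA + WidthA) and [OffsetB, OffsetB + WidthB)
// are disjoint. For example, a 4-byte access at offset 0 does not overlap an
// 8-byte access at offset 4, but an 8-byte access at offset 0 does overlap a
// 4-byte access at offset 4.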
2193 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2194  int WidthB, int OffsetB) {
2195  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2196  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2197  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2198  return LowOffset + LowWidth <= HighOffset;
2199 }
2200 
2201 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2202  MachineInstr &MIb) const {
2203  MachineOperand *BaseOp0, *BaseOp1;
2204  int64_t Offset0, Offset1;
2205 
2206  if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
2207  getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
2208  if (!BaseOp0->isIdenticalTo(*BaseOp1))
2209  return false;
2210 
2211  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2212  // FIXME: Handle ds_read2 / ds_write2.
2213  return false;
2214  }
2215  unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2216  unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2217  if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2218  return true;
2219  }
2220  }
2221 
2222  return false;
2223 }
2224 
2225 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2226  MachineInstr &MIb,
2227  AliasAnalysis *AA) const {
2228  assert((MIa.mayLoad() || MIa.mayStore()) &&
2229  "MIa must load from or modify a memory location");
2230  assert((MIb.mayLoad() || MIb.mayStore()) &&
2231  "MIb must load from or modify a memory location");
2232 
2232 
2233  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2234  return false;
2235 
2236  // XXX - Can we relax this between address spaces?
2237  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2238  return false;
2239 
2240  // TODO: Should we check the address space from the MachineMemOperand? That
2241  // would allow us to distinguish objects we know don't alias based on the
2242  // underlying address space, even if it was lowered to a different one,
2243  // e.g. private accesses lowered to use MUBUF instructions on a scratch
2244  // buffer.
2245  if (isDS(MIa)) {
2246  if (isDS(MIb))
2247  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2248 
2249  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2250  }
2251 
2252  if (isMUBUF(MIa) || isMTBUF(MIa)) {
2253  if (isMUBUF(MIb) || isMTBUF(MIb))
2254  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2255 
2256  return !isFLAT(MIb) && !isSMRD(MIb);
2257  }
2258 
2259  if (isSMRD(MIa)) {
2260  if (isSMRD(MIb))
2261  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2262 
2263  return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
2264  }
2265 
2266  if (isFLAT(MIa)) {
2267  if (isFLAT(MIb))
2268  return checkInstOffsetsDoNotOverlap(MIa, MIb);
2269 
2270  return false;
2271  }
2272 
2273  return false;
2274 }
2275 
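// If \p MO is a register operand whose unique definition is a v_mov_b32 of an
// immediate, return that immediate; otherwise return 0, which callers treat as
// "no foldable immediate".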
2276 static int64_t getFoldableImm(const MachineOperand* MO) {
2277  if (!MO->isReg())
2278  return false;
2279  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2280  const MachineRegisterInfo &MRI = MF->getRegInfo();
2281  auto Def = MRI.getUniqueVRegDef(MO->getReg());
2282  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2283  Def->getOperand(1).isImm())
2284  return Def->getOperand(1).getImm();
2285  return AMDGPU::NoRegister;
2286 }
2287 
2288 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2289  MachineInstr &MI,
2290  LiveVariables *LV) const {
2291  unsigned Opc = MI.getOpcode();
2292  bool IsF16 = false;
2293  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2294 
2295  switch (Opc) {
2296  default:
2297  return nullptr;
2298  case AMDGPU::V_MAC_F16_e64:
2299  IsF16 = true;
2300  LLVM_FALLTHROUGH;
2301  case AMDGPU::V_MAC_F32_e64:
2302  case AMDGPU::V_FMAC_F32_e64:
2303  break;
2304  case AMDGPU::V_MAC_F16_e32:
2305  IsF16 = true;
2306  LLVM_FALLTHROUGH;
2307  case AMDGPU::V_MAC_F32_e32:
2308  case AMDGPU::V_FMAC_F32_e32: {
2309  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2310  AMDGPU::OpName::src0);
2311  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2312  if (!Src0->isReg() && !Src0->isImm())
2313  return nullptr;
2314 
2315  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2316  return nullptr;
2317 
2318  break;
2319  }
2320  }
2321 
2322  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2323  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2324  const MachineOperand *Src0Mods =
2325  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2326  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2327  const MachineOperand *Src1Mods =
2328  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2329  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2330  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2331  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2332 
2333  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2334  // If we have an SGPR input, we will violate the constant bus restriction.
2335  (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2336  if (auto Imm = getFoldableImm(Src2)) {
2337  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2338  get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2339  .add(*Dst)
2340  .add(*Src0)
2341  .add(*Src1)
2342  .addImm(Imm);
2343  }
2344  if (auto Imm = getFoldableImm(Src1)) {
2345  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2346  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2347  .add(*Dst)
2348  .add(*Src0)
2349  .addImm(Imm)
2350  .add(*Src2);
2351  }
2352  if (auto Imm = getFoldableImm(Src0)) {
2353  if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2354  AMDGPU::OpName::src0), Src1))
2355  return BuildMI(*MBB, MI, MI.getDebugLoc(),
2356  get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2357  .add(*Dst)
2358  .add(*Src1)
2359  .addImm(Imm)
2360  .add(*Src2);
2361  }
2362  }
2363 
2364  assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2365  unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2366  (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2367  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2368  .add(*Dst)
2369  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2370  .add(*Src0)
2371  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2372  .add(*Src1)
2373  .addImm(0) // Src mods
2374  .add(*Src2)
2375  .addImm(Clamp ? Clamp->getImm() : 0)
2376  .addImm(Omod ? Omod->getImm() : 0);
2377 }
2378 
2379 // It's not generally safe to move VALU instructions across these since it will
2380 // start using the register as a base index rather than directly.
2381 // XXX - Why isn't hasSideEffects sufficient for these?
2382 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2383  switch (MI.getOpcode()) {
2384  case AMDGPU::S_SET_GPR_IDX_ON:
2385  case AMDGPU::S_SET_GPR_IDX_MODE:
2386  case AMDGPU::S_SET_GPR_IDX_OFF:
2387  return true;
2388  default:
2389  return false;
2390  }
2391 }
2392 
2393 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2394  const MachineBasicBlock *MBB,
2395  const MachineFunction &MF) const {
2396  // XXX - Do we want the SP check in the base implementation?
2397 
2398  // Target-independent instructions do not have an implicit-use of EXEC, even
2399  // when they operate on VGPRs. Treating EXEC modifications as scheduling
2400  // boundaries prevents incorrect movements of such instructions.
2401  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2402  MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2403  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2404  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2405  changesVGPRIndexingMode(MI);
2406 }
2407 
2408 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
2409  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
2410  Opcode == AMDGPU::DS_GWS_INIT ||
2411  Opcode == AMDGPU::DS_GWS_SEMA_V ||
2412  Opcode == AMDGPU::DS_GWS_SEMA_BR ||
2413  Opcode == AMDGPU::DS_GWS_SEMA_P ||
2414  Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
2415  Opcode == AMDGPU::DS_GWS_BARRIER;
2416 }
2417 
2418 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2419  unsigned Opcode = MI.getOpcode();
2420 
2421  if (MI.mayStore() && isSMRD(MI))
2422  return true; // scalar store or atomic
2423 
2424  // These instructions cause shader I/O that may cause hardware lockups
2425  // when executed with an empty EXEC mask.
2426  //
2427  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2428  // EXEC = 0, but checking for that case here seems not worth it
2429  // given the typical code patterns.
2430  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2431  Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2432  Opcode == AMDGPU::DS_ORDERED_COUNT)
2433  return true;
2434 
2435  if (MI.isInlineAsm())
2436  return true; // conservative assumption
2437 
2438  // These are like SALU instructions in terms of effects, so it's questionable
2439  // whether we should return true for those.
2440  //
2441  // However, executing them with EXEC = 0 causes them to operate on undefined
2442  // data, which we avoid by returning true here.
2443  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2444  return true;
2445 
2446  return false;
2447 }
2448 
2449 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2450  switch (Imm.getBitWidth()) {
2451  case 32:
2452  return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2453  ST.hasInv2PiInlineImm());
2454  case 64:
2455  return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2456  ST.hasInv2PiInlineImm());
2457  case 16:
2458  return ST.has16BitInsts() &&
2459  AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2460  ST.hasInv2PiInlineImm());
2461  default:
2462  llvm_unreachable("invalid bitwidth");
2463  }
2464 }
2465 
2466 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2467  uint8_t OperandType) const {
2468  if (!MO.isImm() ||
2469  OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2470  OperandType > AMDGPU::OPERAND_SRC_LAST)
2471  return false;
2472 
2473  // MachineOperand provides no way to tell the true operand size, since it only
2474  // records a 64-bit value. We need to know the size to determine if a 32-bit
2475  // floating point immediate bit pattern is legal for an integer immediate. It
2476  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
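  // For example, the bit pattern of 1.0f (0x3f800000) is an inline constant for
  // a 32-bit float operand, but interpreted as a 64-bit integer operand the same
  // bits would have to be encoded as a literal.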
2477 
2478  int64_t Imm = MO.getImm();
2479  switch (OperandType) {
2484  int32_t Trunc = static_cast<int32_t>(Imm);
2486  }
2492  ST.hasInv2PiInlineImm());
2497  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2498  // A few special case instructions have 16-bit operands on subtargets
2499  // where 16-bit instructions are not legal.
2500  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2501  // constants in these cases
2502  int16_t Trunc = static_cast<int16_t>(Imm);
2503  return ST.has16BitInsts() &&
2505  }
2506 
2507  return false;
2508  }
2511  if (isUInt<16>(Imm)) {
2512  int16_t Trunc = static_cast<int16_t>(Imm);
2513  return ST.has16BitInsts() &&
2515  }
2516  if (!(Imm & 0xffff)) {
2517  return ST.has16BitInsts() &&
2519  }
2520  uint32_t Trunc = static_cast<uint32_t>(Imm);
2522  }
2523  default:
2524  llvm_unreachable("invalid bitwidth");
2525  }
2526 }
2527 
2528 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2529  const MCOperandInfo &OpInfo) const {
2530  switch (MO.getType()) {
2531  case MachineOperand::MO_Register:
2532  return false;
2533  case MachineOperand::MO_Immediate:
2534  return !isInlineConstant(MO, OpInfo);
2540  return true;
2541  default:
2542  llvm_unreachable("unexpected operand type");
2543  }
2544 }
2545 
2546 static bool compareMachineOp(const MachineOperand &Op0,
2547  const MachineOperand &Op1) {
2548  if (Op0.getType() != Op1.getType())
2549  return false;
2550 
2551  switch (Op0.getType()) {
2553  return Op0.getReg() == Op1.getReg();
2555  return Op0.getImm() == Op1.getImm();
2556  default:
2557  llvm_unreachable("Didn't expect to be comparing these operand types");
2558  }
2559 }
2560 
2561 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2562  const MachineOperand &MO) const {
2563  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2564 
2565  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2566 
2567  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2568  return true;
2569 
2570  if (OpInfo.RegClass < 0)
2571  return false;
2572 
2573  if (MO.isImm() && isInlineConstant(MO, OpInfo))
2574  return RI.opCanUseInlineConstant(OpInfo.OperandType);
2575 
2576  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2577 }
2578 
2579 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2580  int Op32 = AMDGPU::getVOPe32(Opcode);
2581  if (Op32 == -1)
2582  return false;
2583 
2584  return pseudoToMCOpcode(Op32) != -1;
2585 }
2586 
2587 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
2588  // The src0_modifier operand is present on all instructions
2589  // that have modifiers.
2590 
2591  return AMDGPU::getNamedOperandIdx(Opcode,
2592  AMDGPU::OpName::src0_modifiers) != -1;
2593 }
2594 
2595 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2596  unsigned OpName) const {
2597  const MachineOperand *Mods = getNamedOperand(MI, OpName);
2598  return Mods && Mods->getImm();
2599 }
2600 
2601 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2602  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2603  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2604  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2605  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2606  hasModifiersSet(MI, AMDGPU::OpName::omod);
2607 }
2608 
2609 bool SIInstrInfo::canShrink(const MachineInstr &MI,
2610  const MachineRegisterInfo &MRI) const {
2611  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2612  // Can't shrink instruction with three operands.
2613  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
2614  // a special case for it. It can only be shrunk if the third operand
2615  // is vcc, and src0_modifiers and src1_modifiers are not set.
2616  // We should handle this the same way we handle vopc, by adding
2617  // a register allocation hint pre-regalloc and then do the shrinking
2618  // post-regalloc.
2619  if (Src2) {
2620  switch (MI.getOpcode()) {
2621  default: return false;
2622 
2623  case AMDGPU::V_ADDC_U32_e64:
2624  case AMDGPU::V_SUBB_U32_e64:
2625  case AMDGPU::V_SUBBREV_U32_e64: {
2626  const MachineOperand *Src1
2627  = getNamedOperand(MI, AMDGPU::OpName::src1);
2628  if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2629  return false;
2630  // Additional verification is needed for sdst/src2.
2631  return true;
2632  }
2633  case AMDGPU::V_MAC_F32_e64:
2634  case AMDGPU::V_MAC_F16_e64:
2635  case AMDGPU::V_FMAC_F32_e64:
2636  if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2637  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2638  return false;
2639  break;
2640 
2641  case AMDGPU::V_CNDMASK_B32_e64:
2642  break;
2643  }
2644  }
2645 
2646  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2647  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2648  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2649  return false;
2650 
2651  // We don't need to check src0, all input types are legal, so just make sure
2652  // src0 isn't using any modifiers.
2653  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2654  return false;
2655 
2656  // Can it be shrunk to a valid 32 bit opcode?
2657  if (!hasVALU32BitEncoding(MI.getOpcode()))
2658  return false;
2659 
2660  // Check output modifiers
2661  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2662  !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2663 }
2664 
2665 // Set VCC operand with all flags from \p Orig, except for setting it as
2666 // implicit.
2667 static void copyFlagsToImplicitVCC(MachineInstr &MI,
2668  const MachineOperand &Orig) {
2669 
2670  for (MachineOperand &Use : MI.implicit_operands()) {
2671  if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2672  Use.setIsUndef(Orig.isUndef());
2673  Use.setIsKill(Orig.isKill());
2674  return;
2675  }
2676  }
2677 }
2678 
2679 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
2680  unsigned Op32) const {
2681  MachineBasicBlock *MBB = MI.getParent();
2682  MachineInstrBuilder Inst32 =
2683  BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2684 
2685  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2686  // For VOPC instructions, this is replaced by an implicit def of vcc.
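  // e.g. the VOP3 form v_cmp_lt_f32_e64 writes an explicit SGPR pair, while the
  // shrunk v_cmp_lt_f32_e32 form implicitly defines vcc instead.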
2687  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
2688  if (Op32DstIdx != -1) {
2689  // dst
2690  Inst32.add(MI.getOperand(0));
2691  } else {
2692  assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2693  "Unexpected case");
2694  }
2695 
2696  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
2697 
2698  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2699  if (Src1)
2700  Inst32.add(*Src1);
2701 
2702  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2703 
2704  if (Src2) {
2705  int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
2706  if (Op32Src2Idx != -1) {
2707  Inst32.add(*Src2);
2708  } else {
2709  // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
2710  // replaced with an implicit read of vcc. This was already added
2711  // during the initial BuildMI, so find it to preserve the flags.
2712  copyFlagsToImplicitVCC(*Inst32, *Src2);
2713  }
2714  }
2715 
2716  return Inst32;
2717 }
2718 
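// Returns true if reading \p MO would occupy the single constant bus read slot
// of a VALU instruction: SGPRs, M0, VCC, EXEC and literal constants all count,
// while VGPRs and inline constants do not.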
2719 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2720  const MachineOperand &MO,
2721  const MCOperandInfo &OpInfo) const {
2722  // Literal constants use the constant bus.
2723  //if (isLiteralConstantLike(MO, OpInfo))
2724  // return true;
2725  if (MO.isImm())
2726  return !isInlineConstant(MO, OpInfo);
2727 
2728  if (!MO.isReg())
2729  return true; // Misc other operands like FrameIndex
2730 
2731  if (!MO.isUse())
2732  return false;
2733 
2733 
2734  if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2735  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2736 
2737  // FLAT_SCR is just an SGPR pair.
2738  if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2739  return true;
2740 
2741  // EXEC register uses the constant bus.
2742  if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2743  return true;
2744 
2745  // SGPRs use the constant bus
2746  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2747  (!MO.isImplicit() &&
2748  (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2749  AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2750 }
2751 
2752 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2753  for (const MachineOperand &MO : MI.implicit_operands()) {
2754  // We only care about reads.
2755  if (MO.isDef())
2756  continue;
2757 
2758  switch (MO.getReg()) {
2759  case AMDGPU::VCC:
2760  case AMDGPU::M0:
2761  case AMDGPU::FLAT_SCR:
2762  return MO.getReg();
2763 
2764  default:
2765  break;
2766  }
2767  }
2768 
2769  return AMDGPU::NoRegister;
2770 }
2771 
2772 static bool shouldReadExec(const MachineInstr &MI) {
2773  if (SIInstrInfo::isVALU(MI)) {
2774  switch (MI.getOpcode()) {
2775  case AMDGPU::V_READLANE_B32:
2776  case AMDGPU::V_READLANE_B32_si:
2777  case AMDGPU::V_READLANE_B32_vi:
2778  case AMDGPU::V_WRITELANE_B32:
2779  case AMDGPU::V_WRITELANE_B32_si:
2780  case AMDGPU::V_WRITELANE_B32_vi:
2781  return false;
2782  }
2783 
2784  return true;
2785  }
2786 
2787  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2788  SIInstrInfo::isSALU(MI) ||
2789  SIInstrInfo::isSMRD(MI))
2790  return false;
2791 
2792  return true;
2793 }
2794 
2795 static bool isSubRegOf(const SIRegisterInfo &TRI,
2796  const MachineOperand &SuperVec,
2797  const MachineOperand &SubReg) {
2798  if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2799  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2800 
2801  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2802  SubReg.getReg() == SuperVec.getReg();
2803 }
2804 
2805 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2806  StringRef &ErrInfo) const {
2807  uint16_t Opcode = MI.getOpcode();
2808  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2809  return true;
2810 
2811  const MachineFunction *MF = MI.getParent()->getParent();
2812  const MachineRegisterInfo &MRI = MF->getRegInfo();
2813 
2814  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2815  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2816  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2817 
2818  // Make sure the number of operands is correct.
2819  const MCInstrDesc &Desc = get(Opcode);
2820  if (!Desc.isVariadic() &&
2821  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2822  ErrInfo = "Instruction has wrong number of operands.";
2823  return false;
2824  }
2825 
2826  if (MI.isInlineAsm()) {
2827  // Verify register classes for inlineasm constraints.
2828  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2829  I != E; ++I) {
2830  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2831  if (!RC)
2832  continue;
2833 
2834  const MachineOperand &Op = MI.getOperand(I);
2835  if (!Op.isReg())
2836  continue;
2837 
2838  unsigned Reg = Op.getReg();
2839  if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2840  ErrInfo = "inlineasm operand has incorrect register class.";
2841  return false;
2842  }
2843  }
2844 
2845  return true;
2846  }
2847 
2848  // Make sure the register classes are correct.
2849  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2850  if (MI.getOperand(i).isFPImm()) {
2851  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2852  "all fp values to integers.";
2853  return false;
2854  }
2855 
2856  int RegClass = Desc.OpInfo[i].RegClass;
2857 
2858  switch (Desc.OpInfo[i].OperandType) {
2859  case MCOI::OPERAND_REGISTER:
2860  if (MI.getOperand(i).isImm()) {
2861  ErrInfo = "Illegal immediate value for operand.";
2862  return false;
2863  }
2864  break;
2867  break;
2874  const MachineOperand &MO = MI.getOperand(i);
2875  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2876  ErrInfo = "Illegal immediate value for operand.";
2877  return false;
2878  }
2879  break;
2880  }
2883  // Check if this operand is an immediate.
2884  // FrameIndex operands will be replaced by immediates, so they are
2885  // allowed.
2886  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2887  ErrInfo = "Expected immediate, but got non-immediate";
2888  return false;
2889  }
2891  default:
2892  continue;
2893  }
2894 
2895  if (!MI.getOperand(i).isReg())
2896  continue;
2897 
2898  if (RegClass != -1) {
2899  unsigned Reg = MI.getOperand(i).getReg();
2900  if (Reg == AMDGPU::NoRegister ||
2901  TargetRegisterInfo::isVirtualRegister(Reg))
2902  continue;
2903 
2904  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2905  if (!RC->contains(Reg)) {
2906  ErrInfo = "Operand has incorrect register class.";
2907  return false;
2908  }
2909  }
2910  }
2911 
2912  // Verify SDWA
2913  if (isSDWA(MI)) {
2914  if (!ST.hasSDWA()) {
2915  ErrInfo = "SDWA is not supported on this target";
2916  return false;
2917  }
2918 
2919  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2920 
2921  const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
2922 
2923  for (int OpIdx: OpIndices) {
2924  if (OpIdx == -1)
2925  continue;
2926  const MachineOperand &MO = MI.getOperand(OpIdx);
2927 
2928  if (!ST.hasSDWAScalar()) {
2929  // Only VGPRS on VI
2930  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2931  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2932  return false;
2933  }
2934  } else {
2935  // No immediates on GFX9
2936  if (!MO.isReg()) {
2937  ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2938  return false;
2939  }
2940  }
2941  }
2942 
2943  if (!ST.hasSDWAOmod()) {
2944  // No omod allowed on VI
2945  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2946  if (OMod != nullptr &&
2947  (!OMod->isImm() || OMod->getImm() != 0)) {
2948  ErrInfo = "OMod not allowed in SDWA instructions on VI";
2949  return false;
2950  }
2951  }
2952 
2953  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2954  if (isVOPC(BasicOpcode)) {
2955  if (!ST.hasSDWASdst() && DstIdx != -1) {
2956  // Only vcc allowed as dst on VI for VOPC
2957  const MachineOperand &Dst = MI.getOperand(DstIdx);
2958  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2959  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2960  return false;
2961  }
2962  } else if (!ST.hasSDWAOutModsVOPC()) {
2963  // No clamp allowed on GFX9 for VOPC
2964  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2965  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2966  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
2967  return false;
2968  }
2969 
2970  // No omod allowed on GFX9 for VOPC
2971  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2972  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2973  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
2974  return false;
2975  }
2976  }
2977  }
2978 
2979  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2980  if (DstUnused && DstUnused->isImm() &&
2981  DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2982  const MachineOperand &Dst = MI.getOperand(DstIdx);
2983  if (!Dst.isReg() || !Dst.isTied()) {
2984  ErrInfo = "Dst register should have tied register";
2985  return false;
2986  }
2987 
2988  const MachineOperand &TiedMO =
2989  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2990  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2991  ErrInfo =
2992  "Dst register should be tied to implicit use of preserved register";
2993  return false;
2994  } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2995  Dst.getReg() != TiedMO.getReg()) {
2996  ErrInfo = "Dst register should use same physical register as preserved";
2997  return false;
2998  }
2999  }
3000  }
3001 
3002  // Verify MIMG
3003  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
3004  // Ensure that the return type used is large enough for all the options
3005  // being used TFE/LWE require an extra result register.
3006  const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
3007  if (DMask) {
3008  uint64_t DMaskImm = DMask->getImm();
3009  uint32_t RegCount =
3010  isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
3011  const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
3012  const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3013  const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3014 
3015  // Adjust for packed 16 bit values
3016  if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3017  RegCount >>= 1;
3018 
3019  // Adjust if using LWE or TFE
3020  if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3021  RegCount += 1;
3022 
3023  const uint32_t DstIdx =
3024  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3025  const MachineOperand &Dst = MI.getOperand(DstIdx);
3026  if (Dst.isReg()) {
3027  const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3028  uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3029  if (RegCount > DstSize) {
3030  ErrInfo = "MIMG instruction returns too many registers for dst "
3031  "register class";
3032  return false;
3033  }
3034  }
3035  }
3036  }
3037 
3038  // Verify VOP*. Ignore multiple sgpr operands on writelane.
3039  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
3040  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3041  // Only look at the true operands. Only a real operand can use the constant
3042  // bus, and we don't want to check pseudo-operands like the source modifier
3043  // flags.
3044  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3045 
3046  unsigned ConstantBusCount = 0;
3047  unsigned LiteralCount = 0;
3048 
3049  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3050  ++ConstantBusCount;
3051 
3052  unsigned SGPRUsed = findImplicitSGPRRead(MI);
3053  if (SGPRUsed != AMDGPU::NoRegister)
3054  ++ConstantBusCount;
3055 
3056  for (int OpIdx : OpIndices) {
3057  if (OpIdx == -1)
3058  break;
3059  const MachineOperand &MO = MI.getOperand(OpIdx);
3060  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3061  if (MO.isReg()) {
3062  if (MO.getReg() != SGPRUsed)
3063  ++ConstantBusCount;
3064  SGPRUsed = MO.getReg();
3065  } else {
3066  ++ConstantBusCount;
3067  ++LiteralCount;
3068  }
3069  }
3070  }
3071  if (ConstantBusCount > 1) {
3072  ErrInfo = "VOP* instruction uses the constant bus more than once";
3073  return false;
3074  }
3075 
3076  if (isVOP3(MI) && LiteralCount) {
3077  ErrInfo = "VOP3 instruction uses literal";
3078  return false;
3079  }
3080  }
3081 
3082  // Verify misc. restrictions on specific instructions.
3083  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
3084  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
3085  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3086  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3087  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
3088  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
3089  if (!compareMachineOp(Src0, Src1) &&
3090  !compareMachineOp(Src0, Src2)) {
3091  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3092  return false;
3093  }
3094  }
3095  }
3096 
3097  if (isSOPK(MI)) {
3098  int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
3099  if (sopkIsZext(MI)) {
3100  if (!isUInt<16>(Imm)) {
3101  ErrInfo = "invalid immediate for SOPK instruction";
3102  return false;
3103  }
3104  } else {
3105  if (!isInt<16>(Imm)) {
3106  ErrInfo = "invalid immediate for SOPK instruction";
3107  return false;
3108  }
3109  }
3110  }
3111 
3112  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3113  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3114  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3115  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3116  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3117  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3118 
3119  const unsigned StaticNumOps = Desc.getNumOperands() +
3120  Desc.getNumImplicitUses();
3121  const unsigned NumImplicitOps = IsDst ? 2 : 1;
3122 
3123  // Allow additional implicit operands. This allows a fixup done by the post
3124  // RA scheduler where the main implicit operand is killed and implicit-defs
3125  // are added for sub-registers that remain live after this instruction.
3126  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3127  ErrInfo = "missing implicit register operands";
3128  return false;
3129  }
3130 
3131  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3132  if (IsDst) {
3133  if (!Dst->isUse()) {
3134  ErrInfo = "v_movreld_b32 vdst should be a use operand";
3135  return false;
3136  }
3137 
3138  unsigned UseOpIdx;
3139  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3140  UseOpIdx != StaticNumOps + 1) {
3141  ErrInfo = "movrel implicit operands should be tied";
3142  return false;
3143  }
3144  }
3145 
3146  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3147  const MachineOperand &ImpUse
3148  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3149  if (!ImpUse.isReg() || !ImpUse.isUse() ||
3150  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3151  ErrInfo = "src0 should be subreg of implicit vector use";
3152  return false;
3153  }
3154  }
3155 
3156  // Make sure we aren't losing exec uses in the td files. This mostly requires
3157  // being careful when using let Uses to try to add other use registers.
3158  if (shouldReadExec(MI)) {
3159  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3160  ErrInfo = "VALU instruction does not implicitly read exec mask";
3161  return false;
3162  }
3163  }
3164 
3165  if (isSMRD(MI)) {
3166  if (MI.mayStore()) {
3167  // The register offset form of scalar stores may only use m0 as the
3168  // soffset register.
3169  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3170  if (Soff && Soff->getReg() != AMDGPU::M0) {
3171  ErrInfo = "scalar stores must use m0 as offset register";
3172  return false;
3173  }
3174  }
3175  }
3176 
3177  if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3178  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3179  if (Offset->getImm() != 0) {
3180  ErrInfo = "subtarget does not support offsets in flat instructions";
3181  return false;
3182  }
3183  }
3184 
3185  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3186  if (DppCt) {
3187  using namespace AMDGPU::DPP;
3188 
3189  unsigned DC = DppCt->getImm();
3190  if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3191  DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3196  ErrInfo = "Invalid dpp_ctrl value";
3197  return false;
3198  }
3199  }
3200 
3201  return true;
3202 }
3203 
3204 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3205  switch (MI.getOpcode()) {
3206  default: return AMDGPU::INSTRUCTION_LIST_END;
3207  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3208  case AMDGPU::COPY: return AMDGPU::COPY;
3209  case AMDGPU::PHI: return AMDGPU::PHI;
3210  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3211  case AMDGPU::WQM: return AMDGPU::WQM;
3212  case AMDGPU::WWM: return AMDGPU::WWM;
3213  case AMDGPU::S_MOV_B32:
3214  return MI.getOperand(1).isReg() ?
3215  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3216  case AMDGPU::S_ADD_I32:
3217  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3218  case AMDGPU::S_ADDC_U32:
3219  return AMDGPU::V_ADDC_U32_e32;
3220  case AMDGPU::S_SUB_I32:
3221  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
3222  // FIXME: These are not consistently handled, and selected when the carry is
3223  // used.
3224  case AMDGPU::S_ADD_U32:
3225  return AMDGPU::V_ADD_I32_e32;
3226  case AMDGPU::S_SUB_U32:
3227  return AMDGPU::V_SUB_I32_e32;
3228  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3229  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3230  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32;
3231  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32;
3232  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3233  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3234  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3235  case AMDGPU::S_XNOR_B32:
3236  return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
3237  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3238  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3239  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3240  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3241  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3242  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3243  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3244  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3245  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3246  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3247  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3248  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3249  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3250  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3251  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3252  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3253  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3254  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3255  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3256  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3257  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3258  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3259  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3260  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3261  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3262  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3263  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3264  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3265  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3266  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3267  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3268  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3269  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3270  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3271  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3272  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3273  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3274  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3275  }
3276  llvm_unreachable(
3277  "Unexpected scalar opcode without corresponding vector one!");
3278 }
3279 
3280 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3281  unsigned OpNo) const {
3282  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3283  const MCInstrDesc &Desc = get(MI.getOpcode());
3284  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3285  Desc.OpInfo[OpNo].RegClass == -1) {
3286  unsigned Reg = MI.getOperand(OpNo).getReg();
3287 
3288  if (TargetRegisterInfo::isVirtualRegister(Reg))
3289  return MRI.getRegClass(Reg);
3290  return RI.getPhysRegClass(Reg);
3291  }
3292 
3293  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3294  return RI.getRegClass(RCID);
3295 }
3296 
3297 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3298  MachineBasicBlock::iterator I = MI;
3299  MachineBasicBlock *MBB = MI.getParent();
3300  MachineOperand &MO = MI.getOperand(OpIdx);
3301  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3302  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3303  const TargetRegisterClass *RC = RI.getRegClass(RCID);
3304  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3305  if (MO.isReg())
3306  Opcode = AMDGPU::COPY;
3307  else if (RI.isSGPRClass(RC))
3308  Opcode = AMDGPU::S_MOV_B32;
3309 
3310  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3311  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3312  VRC = &AMDGPU::VReg_64RegClass;
3313  else
3314  VRC = &AMDGPU::VGPR_32RegClass;
3315 
3316  unsigned Reg = MRI.createVirtualRegister(VRC);
3317  DebugLoc DL = MBB->findDebugLoc(I);
3318  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3319  MO.ChangeToRegister(Reg, false);
3320 }
3321 
3322 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3323  MachineRegisterInfo &MRI,
3324  MachineOperand &SuperReg,
3325  const TargetRegisterClass *SuperRC,
3326  unsigned SubIdx,
3327  const TargetRegisterClass *SubRC)
3328  const {
3329  MachineBasicBlock *MBB = MI->getParent();
3330  DebugLoc DL = MI->getDebugLoc();
3331  unsigned SubReg = MRI.createVirtualRegister(SubRC);
3332 
3333  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3334  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3335  .addReg(SuperReg.getReg(), 0, SubIdx);
3336  return SubReg;
3337  }
3338 
3339  // Just in case the super register is itself a sub-register, copy it to a new
3340  // value so we don't need to worry about merging its subreg index with the
3341  // SubIdx passed to this function. The register coalescer should be able to
3342  // eliminate this extra copy.
3343  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3344 
3345  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3346  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3347 
3348  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3349  .addReg(NewSuperReg, 0, SubIdx);
3350 
3351  return SubReg;
3352 }
3353 
3354 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3355  MachineBasicBlock::iterator MII,
3356  MachineRegisterInfo &MRI,
3357  MachineOperand &Op,
3358  const TargetRegisterClass *SuperRC,
3359  unsigned SubIdx,
3360  const TargetRegisterClass *SubRC) const {
3361  if (Op.isImm()) {
3362  if (SubIdx == AMDGPU::sub0)
3363  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3364  if (SubIdx == AMDGPU::sub1)
3365  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3366 
3367  llvm_unreachable("Unhandled register index for immediate");
3368  }
3369 
3370  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3371  SubIdx, SubRC);
3372  return MachineOperand::CreateReg(SubReg, false);
3373 }
3374 
3375 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3376 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3377  assert(Inst.getNumExplicitOperands() == 3);
3378  MachineOperand Op1 = Inst.getOperand(1);
3379  Inst.RemoveOperand(1);
3380  Inst.addOperand(Op1);
3381 }
3382 
3383 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3384  const MCOperandInfo &OpInfo,
3385  const MachineOperand &MO) const {
3386  if (!MO.isReg())
3387  return false;
3388 
3389  unsigned Reg = MO.getReg();
3390  const TargetRegisterClass *RC =
3391  TargetRegisterInfo::isVirtualRegister(Reg) ?
3392  MRI.getRegClass(Reg) :
3393  RI.getPhysRegClass(Reg);
3394 
3395  const SIRegisterInfo *TRI =
3396  static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3397  RC = TRI->getSubRegClass(RC, MO.getSubReg());
3398 
3399  // In order to be legal, the common sub-class must be equal to the
3400  // class of the current operand. For example:
3401  //
3402  // v_mov_b32 s0 ; Operand defined as vsrc_b32
3403  // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3404  //
3405  // s_sendmsg 0, s0 ; Operand defined as m0reg
3406  // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3407 
3408  return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3409 }
3410 
3411 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3412  const MCOperandInfo &OpInfo,
3413  const MachineOperand &MO) const {
3414  if (MO.isReg())
3415  return isLegalRegOperand(MRI, OpInfo, MO);
3416 
3417  // Handle non-register types that are treated like immediates.
3418  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3419  return true;
3420 }
3421 
3422 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3423  const MachineOperand *MO) const {
3424  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3425  const MCInstrDesc &InstDesc = MI.getDesc();
3426  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3427  const TargetRegisterClass *DefinedRC =
3428  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3429  if (!MO)
3430  MO = &MI.getOperand(OpIdx);
3431 
3432  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3433 
3434  RegSubRegPair SGPRUsed;
3435  if (MO->isReg())
3436  SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3437 
3438  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3439  if (i == OpIdx)
3440  continue;
3441  const MachineOperand &Op = MI.getOperand(i);
3442  if (Op.isReg()) {
3443  if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3444  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3445  return false;
3446  }
3447  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3448  return false;
3449  }
3450  }
3451  }
3452 
3453  if (MO->isReg()) {
3454  assert(DefinedRC);
3455  return isLegalRegOperand(MRI, OpInfo, *MO);
3456  }
3457 
3458  // Handle non-register types that are treated like immediates.
3459  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3460 
3461  if (!DefinedRC) {
3462  // This operand expects an immediate.
3463  return true;
3464  }
3465 
3466  return isImmOperandLegal(MI, OpIdx, *MO);
3467 }
3468 
3469 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3470  MachineInstr &MI) const {
3471  unsigned Opc = MI.getOpcode();
3472  const MCInstrDesc &InstrDesc = get(Opc);
3473 
3474  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3475  MachineOperand &Src1 = MI.getOperand(Src1Idx);
3476 
3477  // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
3478  // we need to only have one constant bus use.
3479  //
3480  // Note we do not need to worry about literal constants here. They are
3481  // disabled for the operand type for instructions because they will always
3482  // violate the one constant bus use rule.
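  // For example, v_addc_u32_e32 implicitly reads VCC, so an SGPR in src0 would
  // be a second constant bus use and has to be moved into a VGPR first.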
3483  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3484  if (HasImplicitSGPR) {
3485  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3486  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3487 
3488  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3489  legalizeOpWithMove(MI, Src0Idx);
3490  }
3491 
3492  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3493  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
3494  // src0/src1 with V_READFIRSTLANE.
3495  if (Opc == AMDGPU::V_WRITELANE_B32) {
3496  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3497  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3498  const DebugLoc &DL = MI.getDebugLoc();
3499  if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3500  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3501  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3502  .add(Src0);
3503  Src0.ChangeToRegister(Reg, false);
3504  }
3505  if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3506  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3507  const DebugLoc &DL = MI.getDebugLoc();
3508  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3509  .add(Src1);
3510  Src1.ChangeToRegister(Reg, false);
3511  }
3512  return;
3513  }
3514 
3515  // VOP2 src0 instructions support all operand types, so we don't need to check
3516  // their legality. If src1 is already legal, we don't need to do anything.
3517  if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3518  return;
3519 
3520  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3521  // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3522  // select is uniform.
3523  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3524  RI.isVGPR(MRI, Src1.getReg())) {
3525  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3526  const DebugLoc &DL = MI.getDebugLoc();
3527  BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3528  .add(Src1);
3529  Src1.ChangeToRegister(Reg, false);
3530  return;
3531  }
3532 
3533  // We do not use commuteInstruction here because it is too aggressive and will
3534  // commute if it is possible. We only want to commute here if it improves
3535  // legality. This can be called a fairly large number of times so don't waste
3536  // compile time pointlessly swapping and checking legality again.
3537  if (HasImplicitSGPR || !MI.isCommutable()) {
3538  legalizeOpWithMove(MI, Src1Idx);
3539  return;
3540  }
3541 
3542  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3543  MachineOperand &Src0 = MI.getOperand(Src0Idx);
3544 
3545  // If src0 can be used as src1, commuting will make the operands legal.
3546  // Otherwise we have to give up and insert a move.
3547  //
3548  // TODO: Other immediate-like operand kinds could be commuted if there was a
3549  // MachineOperand::ChangeTo* for them.
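 // For example (a sketch, registers are illustrative): with an SGPR in src1,
 //   V_ADD_F32_e32 %dst, %vgpr, %sgpr
 // is illegal, but after commuting to
 //   V_ADD_F32_e32 %dst, %sgpr, %vgpr
 // the SGPR sits in src0, which may use the constant bus, so no V_MOV is needed.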
3550  if ((!Src1.isImm() && !Src1.isReg()) ||
3551  !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3552  legalizeOpWithMove(MI, Src1Idx);
3553  return;
3554  }
3555 
3556  int CommutedOpc = commuteOpcode(MI);
3557  if (CommutedOpc == -1) {
3558  legalizeOpWithMove(MI, Src1Idx);
3559  return;
3560  }
3561 
3562  MI.setDesc(get(CommutedOpc));
3563 
3564  unsigned Src0Reg = Src0.getReg();
3565  unsigned Src0SubReg = Src0.getSubReg();
3566  bool Src0Kill = Src0.isKill();
3567 
3568  if (Src1.isImm())
3569  Src0.ChangeToImmediate(Src1.getImm());
3570  else if (Src1.isReg()) {
3571  Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3572  Src0.setSubReg(Src1.getSubReg());
3573  } else
3574  llvm_unreachable("Should only have register or immediate operands");
3575 
3576  Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3577  Src1.setSubReg(Src0SubReg);
3578 }
3579 
3580 // Legalize VOP3 operands. Because all operand types are supported for any
3581 // operand, and since literal constants are not allowed and should never be
3582 // seen, we only need to worry about inserting copies if we use multiple SGPR
3583 // operands.
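 // For example (cf. findUsedSGPR below): V_FMA_F32 v0, s0, s1, v2 may keep only
 // one of the SGPRs on the constant bus, so the other is rewritten with
 // legalizeOpWithMove (moved into a VGPR first).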
3584 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3585  MachineInstr &MI) const {
3586  unsigned Opc = MI.getOpcode();
3587 
3588  int VOP3Idx[3] = {
3589  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3590  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3591  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3592  };
3593 
3594  // Find the one SGPR operand we are allowed to use.
3595  unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3596 
3597  for (unsigned i = 0; i < 3; ++i) {
3598  int Idx = VOP3Idx[i];
3599  if (Idx == -1)
3600  break;
3601  MachineOperand &MO = MI.getOperand(Idx);
3602 
3603  // We should never see a VOP3 instruction with an illegal immediate operand.
3604  if (!MO.isReg())
3605  continue;
3606 
3607  if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3608  continue; // VGPRs are legal
3609 
3610  if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3611  SGPRReg = MO.getReg();
3612  // We can use one SGPR in each VOP3 instruction.
3613  continue;
3614  }
3615 
3616  // If we make it this far, then the operand is not legal and we must
3617  // legalize it.
3618  legalizeOpWithMove(MI, Idx);
3619  }
3620 }
3621 
3622 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3623  MachineRegisterInfo &MRI) const {
3624  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3625  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3626  unsigned DstReg = MRI.createVirtualRegister(SRC);
3627  unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3628 
3629  if (SubRegs == 1) {
3630  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3631  get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3632  .addReg(SrcReg);
3633  return DstReg;
3634  }
3635 
3636  SmallVector<unsigned, 8> SRegs;
3637  for (unsigned i = 0; i < SubRegs; ++i) {
3638  unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3639  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3640  get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3641  .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3642  SRegs.push_back(SGPR);
3643  }
3644 
3645  MachineInstrBuilder MIB =
3646  BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3647  get(AMDGPU::REG_SEQUENCE), DstReg);
3648  for (unsigned i = 0; i < SubRegs; ++i) {
3649  MIB.addReg(SRegs[i]);
3650  MIB.addImm(RI.getSubRegFromChannel(i));
3651  }
3652  return DstReg;
3653 }
3654 
3655 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3656  MachineInstr &MI) const {
3657 
3658  // If the pointer is stored in VGPRs, then we need to move it to
3659  // SGPRs using v_readfirstlane. This is safe because we only select
3660  // loads with uniform pointers to SMRD instructions, so we know the
3661  // pointer value is uniform.
3662  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3663  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3664  unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3665  SBase->setReg(SGPR);
3666  }
3667  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3668  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3669  unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3670  SOff->setReg(SGPR);
3671  }
3672 }
3673 
3674 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3675  MachineBasicBlock::iterator I,
3676  const TargetRegisterClass *DstRC,
3677  MachineOperand &Op,
3678  MachineRegisterInfo &MRI,
3679  const DebugLoc &DL) const {
3680  unsigned OpReg = Op.getReg();
3681  unsigned OpSubReg = Op.getSubReg();
3682 
3683  const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3684  RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3685 
3686  // Check if operand is already the correct register class.
3687  if (DstRC == OpRC)
3688  return;
3689 
3690  unsigned DstReg = MRI.createVirtualRegister(DstRC);
3691  MachineInstr *Copy =
3692  BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3693 
3694  Op.setReg(DstReg);
3695  Op.setSubReg(0);
3696 
3697  MachineInstr *Def = MRI.getVRegDef(OpReg);
3698  if (!Def)
3699  return;
3700 
3701  // Try to eliminate the copy if it is copying an immediate value.
3702  if (Def->isMoveImmediate())
3703  FoldImmediate(*Copy, *Def, OpReg, &MRI);
3704 }
3705 
3706 // Emit the actual waterfall loop, executing the wrapped instruction for each
3707 // unique value of \p Rsrc across all lanes. In the best case we execute 1
3708 // iteration, in the worst case we execute 64 (once per lane).
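 // Roughly, the code emitted below has this shape (a sketch, not literal
 // assembler output):
 //
 //   loop:
 //     s[0:3]   = v_readfirstlane_b32 of the four VRsrc dwords
 //     vcc      = (s[0:1] == vrsrc[0:1]) & (s[2:3] == vrsrc[2:3])
 //     saveexec = s_and_saveexec_b64 vcc   ; restrict exec to matching lanes
 //     ... original instruction, now using the SGPR rsrc ...
 //     exec     = s_xor_b64 exec, saveexec ; clear the lanes just handled
 //     s_cbranch_execnz loop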
3709 static void
3710 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
3711  MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3712  const DebugLoc &DL, MachineOperand &Rsrc) {
3713  MachineBasicBlock::iterator I = LoopBB.begin();
3714 
3715  unsigned VRsrc = Rsrc.getReg();
3716  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
3717 
3718  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3719  unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3720  unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3721  unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3722  unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3723  unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3724  unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3725  unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3726  unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3727 
3728  // Beginning of the loop, read the next Rsrc variant.
3729  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
3730  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
3731  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
3732  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
3733  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
3734  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
3735  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
3736  .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
3737 
3738  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
3739  .addReg(SRsrcSub0)
3740  .addImm(AMDGPU::sub0)
3741  .addReg(SRsrcSub1)
3742  .addImm(AMDGPU::sub1)
3743  .addReg(SRsrcSub2)
3744  .addImm(AMDGPU::sub2)
3745  .addReg(SRsrcSub3)
3746  .addImm(AMDGPU::sub3);
3747 
3748  // Update Rsrc operand to use the SGPR Rsrc.
3749  Rsrc.setReg(SRsrc);
3750  Rsrc.setIsKill(true);
3751 
3752  // Identify all lanes with identical Rsrc operands in their VGPRs.
3753  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
3754  .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
3755  .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
3756  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
3757  .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
3758  .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
3759  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
3760  .addReg(CondReg0)
3761  .addReg(CondReg1);
3762 
3763  MRI.setSimpleHint(SaveExec, AndCond);
3764 
3765  // Update EXEC to matching lanes, saving original to SaveExec.
3766  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
3767  .addReg(AndCond, RegState::Kill);
3768 
3769  // The original instruction is here; we insert the terminators after it.
3770  I = LoopBB.end();
3771 
3772  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3773  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
3774  .addReg(AMDGPU::EXEC)
3775  .addReg(SaveExec);
3776  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
3777 }
3778 
3779 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
3780 // with SGPRs by iterating over all unique values across all lanes.
3781 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
3782  MachineOperand &Rsrc, MachineDominatorTree *MDT) {
3783  MachineBasicBlock &MBB = *MI.getParent();
3784  MachineFunction &MF = *MBB.getParent();
3785  MachineRegisterInfo &MRI = MF.getRegInfo();
3786  MachineBasicBlock::iterator I(&MI);
3787  const DebugLoc &DL = MI.getDebugLoc();
3788 
3789  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3790 
3791  // Save the EXEC mask
3792  BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
3793  .addReg(AMDGPU::EXEC);
3794 
3795  // Killed uses in the instruction we are waterfalling around will be
3796  // incorrect due to the added control-flow.
3797  for (auto &MO : MI.uses()) {
3798  if (MO.isReg() && MO.isUse()) {
3799  MRI.clearKillFlags(MO.getReg());
3800  }
3801  }
3802 
3803  // To insert the loop we need to split the block. Move everything after this
3804  // point to a new block, and insert a new empty block between the two.
3805  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
3806  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
3807  MachineFunction::iterator MBBI(MBB);
3808  ++MBBI;
3809 
3810  MF.insert(MBBI, LoopBB);
3811  MF.insert(MBBI, RemainderBB);
3812 
3813  LoopBB->addSuccessor(LoopBB);
3814  LoopBB->addSuccessor(RemainderBB);
3815 
3816  // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
3817  MachineBasicBlock::iterator J = I++;
3818  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3819  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3820  LoopBB->splice(LoopBB->begin(), &MBB, J);
3821 
3822  MBB.addSuccessor(LoopBB);
3823 
3824  // Update dominators. We know that MBB immediately dominates LoopBB, that
3825  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
3826  // dominates all of the successors transferred to it from MBB that MBB used
3827  // to dominate.
3828  if (MDT) {
3829  MDT->addNewBlock(LoopBB, &MBB);
3830  MDT->addNewBlock(RemainderBB, LoopBB);
3831  for (auto &Succ : RemainderBB->successors()) {
3832  if (MDT->dominates(&MBB, Succ)) {
3833  MDT->changeImmediateDominator(Succ, RemainderBB);
3834  }
3835  }
3836  }
3837 
3838  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
3839 
3840  // Restore the EXEC mask
3841  MachineBasicBlock::iterator First = RemainderBB->begin();
3842  BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3843  .addReg(SaveExec);
3844 }
3845 
3846 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
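 // The replacement descriptor built here looks like (sketch):
 //   dwords 0-1: 0                        (base address cleared)
 //   dword  2:   RSRC_DATA_FORMAT[31:0]
 //   dword  3:   RSRC_DATA_FORMAT[63:32]
 // while the original 64-bit base pointer is returned separately so the caller
 // can fold it into VAddr.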
3847 static std::tuple<unsigned, unsigned>
3848 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
3849  MachineBasicBlock &MBB = *MI.getParent();
3850  MachineFunction &MF = *MBB.getParent();
3851  MachineRegisterInfo &MRI = MF.getRegInfo();
3852 
3853  // Extract the ptr from the resource descriptor.
3854  unsigned RsrcPtr =
3855  TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3856  AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3857 
3858  // Create an empty resource descriptor
3859  unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3860  unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3861  unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3862  unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3863  uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3864 
3865  // Zero64 = 0
3866  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3867  .addImm(0);
3868 
3869  // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3870  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3871  .addImm(RsrcDataFormat & 0xFFFFFFFF);
3872 
3873  // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3874  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3875  .addImm(RsrcDataFormat >> 32);
3876 
3877  // NewSRsrc = {Zero64, SRsrcFormat}
3878  BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3879  .addReg(Zero64)
3880  .addImm(AMDGPU::sub0_sub1)
3881  .addReg(SRsrcFormatLo)
3882  .addImm(AMDGPU::sub2)
3883  .addReg(SRsrcFormatHi)
3884  .addImm(AMDGPU::sub3);
3885 
3886  return std::make_tuple(RsrcPtr, NewSRsrc);
3887 }
3888 
3889 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3890  MachineDominatorTree *MDT) const {
3891  MachineFunction &MF = *MI.getParent()->getParent();
3892  MachineRegisterInfo &MRI = MF.getRegInfo();
3893 
3894  // Legalize VOP2
3895  if (isVOP2(MI) || isVOPC(MI)) {
3896  legalizeOperandsVOP2(MRI, MI);
3897  return;
3898  }
3899 
3900  // Legalize VOP3
3901  if (isVOP3(MI)) {
3902  legalizeOperandsVOP3(MRI, MI);
3903  return;
3904  }
3905 
3906  // Legalize SMRD
3907  if (isSMRD(MI)) {
3908  legalizeOperandsSMRD(MRI, MI);
3909  return;
3910  }
3911 
3912  // Legalize REG_SEQUENCE and PHI
3913  // The register class of the operands must be the same type as the register
3914  // class of the output.
3915  if (MI.getOpcode() == AMDGPU::PHI) {
3916  const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3917  for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3918  if (!MI.getOperand(i).isReg() ||
3919  !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3920  continue;
3921  const TargetRegisterClass *OpRC =
3922  MRI.getRegClass(MI.getOperand(i).getReg());
3923  if (RI.hasVGPRs(OpRC)) {
3924  VRC = OpRC;
3925  } else {
3926  SRC = OpRC;
3927  }
3928  }
3929 
3930  // If any of the operands are VGPR registers, then they all must be VGPRs,
3931  // otherwise we will create illegal VGPR->SGPR copies when legalizing
3932  // them.
3933  if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3934  if (!VRC) {
3935  assert(SRC);
3936  VRC = RI.getEquivalentVGPRClass(SRC);
3937  }
3938  RC = VRC;
3939  } else {
3940  RC = SRC;
3941  }
3942 
3943  // Update all the operands so they have the same type.
3944  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3945  MachineOperand &Op = MI.getOperand(I);
3946  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3947  continue;
3948 
3949  // MI is a PHI instruction.
3950  MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3951  MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3952 
3953  // Avoid creating no-op copies with the same src and dst reg class. These
3954  // confuse some of the machine passes.
3955  legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3956  }
3957  }
3958 
3959  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3960  // VGPR dest type and SGPR sources, insert copies so all operands are
3961  // VGPRs. This seems to help operand folding / the register coalescer.
3962  if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3963  MachineBasicBlock *MBB = MI.getParent();
3964  const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3965  if (RI.hasVGPRs(DstRC)) {
3966  // Update all the operands so they are VGPR register classes. These may
3967  // not be the same register class because REG_SEQUENCE supports mixing
3968  // subregister index types e.g. sub0_sub1 + sub2 + sub3
3969  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3970  MachineOperand &Op = MI.getOperand(I);
3971  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3972  continue;
3973 
3974  const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3975  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3976  if (VRC == OpRC)
3977  continue;
3978 
3979  legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3980  Op.setIsKill();
3981  }
3982  }
3983 
3984  return;
3985  }
3986 
3987  // Legalize INSERT_SUBREG
3988  // src0 must have the same register class as dst
3989  if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3990  unsigned Dst = MI.getOperand(0).getReg();
3991  unsigned Src0 = MI.getOperand(1).getReg();
3992  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3993  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3994  if (DstRC != Src0RC) {
3995  MachineBasicBlock *MBB = MI.getParent();
3996  MachineOperand &Op = MI.getOperand(1);
3997  legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3998  }
3999  return;
4000  }
4001 
4002  // Legalize SI_INIT_M0
4003  if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
4004  MachineOperand &Src = MI.getOperand(0);
4005  if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
4006  Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
4007  return;
4008  }
4009 
4010  // Legalize MIMG and MUBUF/MTBUF for shaders.
4011  //
4012  // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
4013  // scratch memory access. In both cases, the legalization never involves
4014  // conversion to the addr64 form.
4015  if (isMIMG(MI) ||
4016  (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
4017  (isMUBUF(MI) || isMTBUF(MI)))) {
4018  MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
4019  if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
4020  unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
4021  SRsrc->setReg(SGPR);
4022  }
4023 
4024  MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
4025  if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
4026  unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
4027  SSamp->setReg(SGPR);
4028  }
4029  return;
4030  }
4031 
4032  // Legalize MUBUF* instructions.
4033  int RsrcIdx =
4034  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
4035  if (RsrcIdx != -1) {
4036  // We have an MUBUF instruction
4037  MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4038  unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4039  if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4040  RI.getRegClass(RsrcRC))) {
4041  // The operands are legal.
4042  // FIXME: We may need to legalize operands besides srsrc.
4043  return;
4044  }
4045 
4046  // Legalize a VGPR Rsrc.
4047  //
4048  // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4049  // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4050  // a zero-value SRsrc.
4051  //
4052  // If the instruction is _OFFSET (both idxen and offen disabled), and we
4053  // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4054  // above.
4055  //
4056  // Otherwise we are on non-ADDR64 hardware, and/or we have
4057  // idxen/offen/bothen and we fall back to a waterfall loop.
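 // Sketch of the ADDR64 path handled first below (register names illustrative):
 //   buffer_load_dword %vdata, %vaddr, %vrsrc(VGPR), %soffset, offset
 // becomes
 //   %ptr      = %vrsrc.sub0_sub1            ; base pointer from extractRsrcPtr
 //   %newvaddr = %vaddr + %ptr               ; V_ADD_I32 / V_ADDC_U32 pair
 //   %newsrsrc = { 0, RSRC_DATA_FORMAT }     ; zero-base SGPR descriptor
 //   buffer_load_dword %vdata, %newvaddr, %newsrsrc(SGPR), %soffset, offset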
4058 
4059  MachineBasicBlock &MBB = *MI.getParent();
4060 
4061  MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4062  if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4063  // This is already an ADDR64 instruction so we need to add the pointer
4064  // extracted from the resource descriptor to the current value of VAddr.
4065  unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4066  unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4067  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4068 
4069  unsigned RsrcPtr, NewSRsrc;
4070  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4071 
4072  // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4073  DebugLoc DL = MI.getDebugLoc();
4074  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
4075  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4076  .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
4077 
4078  // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4079  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
4080  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4081  .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
4082 
4083  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4084  BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4085  .addReg(NewVAddrLo)
4086  .addImm(AMDGPU::sub0)
4087  .addReg(NewVAddrHi)
4088  .addImm(AMDGPU::sub1);
4089 
4090  VAddr->setReg(NewVAddr);
4091  Rsrc->setReg(NewSRsrc);
4092  } else if (!VAddr && ST.hasAddr64()) {
4093  // This instruction is the _OFFSET variant, so we need to convert it to
4094  // ADDR64.
4095  assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4096  < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4097  "FIXME: Need to emit flat atomics here");
4098 
4099  unsigned RsrcPtr, NewSRsrc;
4100  std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4101 
4102  unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4103  MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4104  MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4105  MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4106  unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4107 
4108  // Atomics with return have an additional tied operand and are
4109  // missing some of the special bits.
4110  MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4111  MachineInstr *Addr64;
4112 
4113  if (!VDataIn) {
4114  // Regular buffer load / store.
4115  MachineInstrBuilder MIB =
4116  BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4117  .add(*VData)
4118  .addReg(NewVAddr)
4119  .addReg(NewSRsrc)
4120  .add(*SOffset)
4121  .add(*Offset);
4122 
4123  // Atomics do not have this operand.
4124  if (const MachineOperand *GLC =
4125  getNamedOperand(MI, AMDGPU::OpName::glc)) {
4126  MIB.addImm(GLC->getImm());
4127  }
4128 
4129  MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4130 
4131  if (const MachineOperand *TFE =
4132  getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4133  MIB.addImm(TFE->getImm());
4134  }
4135 
4136  MIB.cloneMemRefs(MI);
4137  Addr64 = MIB;
4138  } else {
4139  // Atomics with return.
4140  Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4141  .add(*VData)
4142  .add(*VDataIn)
4143  .addReg(NewVAddr)
4144  .addReg(NewSRsrc)
4145  .add(*SOffset)
4146  .add(*Offset)
4147  .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4148  .cloneMemRefs(MI);
4149  }
4150 
4151  MI.removeFromParent();
4152 
4153  // NewVaddr = {NewVaddrHi, NewVaddrLo}
4154  BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4155  NewVAddr)
4156  .addReg(RsrcPtr, 0, AMDGPU::sub0)
4157  .addImm(AMDGPU::sub0)
4158  .addReg(RsrcPtr, 0, AMDGPU::sub1)
4159  .addImm(AMDGPU::sub1);
4160  } else {
4161  // This is another variant; legalize Rsrc with waterfall loop from VGPRs
4162  // to SGPRs.
4163  loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4164  }
4165  }
4166 }
4167 
4168 void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
4169  MachineDominatorTree *MDT) const {
4170  SetVectorType Worklist;
4171  Worklist.insert(&TopInst);
4172 
4173  while (!Worklist.empty()) {
4174  MachineInstr &Inst = *Worklist.pop_back_val();
4175  MachineBasicBlock *MBB = Inst.getParent();
4176  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4177 
4178  unsigned Opcode = Inst.getOpcode();
4179  unsigned NewOpcode = getVALUOp(Inst);
4180 
4181  // Handle some special cases
4182  switch (Opcode) {
4183  default:
4184  break;
4185  case AMDGPU::S_ADD_U64_PSEUDO:
4186  case AMDGPU::S_SUB_U64_PSEUDO:
4187  splitScalar64BitAddSub(Worklist, Inst, MDT);
4188  Inst.eraseFromParent();
4189  continue;
4190  case AMDGPU::S_ADD_I32:
4191  case AMDGPU::S_SUB_I32:
4192  // FIXME: The u32 versions currently selected use the carry.
4193  if (moveScalarAddSub(Worklist, Inst, MDT))
4194  continue;
4195 
4196  // Default handling
4197  break;
4198  case AMDGPU::S_AND_B64:
4199  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
4200  Inst.eraseFromParent();
4201  continue;
4202 
4203  case AMDGPU::S_OR_B64:
4204  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
4205  Inst.eraseFromParent();
4206  continue;
4207 
4208  case AMDGPU::S_XOR_B64:
4209  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
4210  Inst.eraseFromParent();
4211  continue;
4212 
4213  case AMDGPU::S_NAND_B64:
4214  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
4215  Inst.eraseFromParent();
4216  continue;
4217 
4218  case AMDGPU::S_NOR_B64:
4219  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
4220  Inst.eraseFromParent();
4221  continue;
4222 
4223  case AMDGPU::S_XNOR_B64:
4224  if (ST.hasDLInsts())
4225  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4226  else
4227  splitScalar64BitXnor(Worklist, Inst, MDT);
4228  Inst.eraseFromParent();
4229  continue;
4230 
4231  case AMDGPU::S_ANDN2_B64:
4232  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
4233  Inst.eraseFromParent();
4234  continue;
4235 
4236  case AMDGPU::S_ORN2_B64:
4237  splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
4238  Inst.eraseFromParent();
4239  continue;
4240 
4241  case AMDGPU::S_NOT_B64:
4242  splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
4243  Inst.eraseFromParent();
4244  continue;
4245 
4246  case AMDGPU::S_BCNT1_I32_B64:
4247  splitScalar64BitBCNT(Worklist, Inst);
4248  Inst.eraseFromParent();
4249  continue;
4250 
4251  case AMDGPU::S_BFE_I64:
4252  splitScalar64BitBFE(Worklist, Inst);
4253  Inst.eraseFromParent();
4254  continue;
4255 
4256  case AMDGPU::S_LSHL_B32:
4257  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4258  NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4259  swapOperands(Inst);
4260  }
4261  break;
4262  case AMDGPU::S_ASHR_I32:
4263  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4264  NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4265  swapOperands(Inst);
4266  }
4267  break;
4268  case AMDGPU::S_LSHR_B32:
4269  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4270  NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4271  swapOperands(Inst);
4272  }
4273  break;
4274  case AMDGPU::S_LSHL_B64:
4275  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4276  NewOpcode = AMDGPU::V_LSHLREV_B64;
4277  swapOperands(Inst);
4278  }
4279  break;
4280  case AMDGPU::S_ASHR_I64:
4281  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4282  NewOpcode = AMDGPU::V_ASHRREV_I64;
4283  swapOperands(Inst);
4284  }
4285  break;
4286  case AMDGPU::S_LSHR_B64:
4287  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4288  NewOpcode = AMDGPU::V_LSHRREV_B64;
4289  swapOperands(Inst);
4290  }
4291  break;
4292 
4293  case AMDGPU::S_ABS_I32:
4294  lowerScalarAbs(Worklist, Inst);
4295  Inst.eraseFromParent();
4296  continue;
4297 
4298  case AMDGPU::S_CBRANCH_SCC0:
4299  case AMDGPU::S_CBRANCH_SCC1:
4300  // Clear unused bits of vcc
4301  BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4302  AMDGPU::VCC)
4303  .addReg(AMDGPU::EXEC)
4304  .addReg(AMDGPU::VCC);
4305  break;
4306 
4307  case AMDGPU::S_BFE_U64:
4308  case AMDGPU::S_BFM_B64:
4309  llvm_unreachable("Moving this op to VALU not implemented");
4310 
4311  case AMDGPU::S_PACK_LL_B32_B16:
4312  case AMDGPU::S_PACK_LH_B32_B16:
4313  case AMDGPU::S_PACK_HH_B32_B16:
4314  movePackToVALU(Worklist, MRI, Inst);
4315  Inst.eraseFromParent();
4316  continue;
4317 
4318  case AMDGPU::S_XNOR_B32:
4319  lowerScalarXnor(Worklist, Inst);
4320  Inst.eraseFromParent();
4321  continue;
4322 
4323  case AMDGPU::S_NAND_B32:
4324  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
4325  Inst.eraseFromParent();
4326  continue;
4327 
4328  case AMDGPU::S_NOR_B32:
4329  splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
4330  Inst.eraseFromParent();
4331  continue;
4332 
4333  case AMDGPU::S_ANDN2_B32:
4334  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
4335  Inst.eraseFromParent();
4336  continue;
4337 
4338  case AMDGPU::S_ORN2_B32:
4339  splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
4340  Inst.eraseFromParent();
4341  continue;
4342  }
4343 
4344  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4345  // We cannot move this instruction to the VALU, so we should try to
4346  // legalize its operands instead.
4347  legalizeOperands(Inst, MDT);
4348  continue;
4349  }
4350 
4351  // Use the new VALU Opcode.
4352  const MCInstrDesc &NewDesc = get(NewOpcode);
4353  Inst.setDesc(NewDesc);
4354 
4355  // Remove any references to SCC. Vector instructions can't read from it, and
4356  // we're just about to add the implicit use / defs of VCC, and we don't want
4357  // both.
4358  for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4359  MachineOperand &Op = Inst.getOperand(i);
4360  if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4361  // Only propagate through live-def of SCC.
4362  if (Op.isDef() && !Op.isDead())
4363  addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
4364  Inst.RemoveOperand(i);
4365  }
4366  }
4367 
4368  if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4369  // We are converting these to a BFE, so we need to add the missing
4370  // operands for the size and offset.
4371  unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4372  Inst.addOperand(MachineOperand::CreateImm(0));
4373  Inst.addOperand(MachineOperand::CreateImm(Size));
4374 
4375  } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4376  // The VALU version adds the second operand to the result, so insert an
4377  // extra 0 operand.
4378  Inst.addOperand(MachineOperand::CreateImm(0));
4379  }
4380 
4381  Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
4382 
4383  if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4384  const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4385  // If we need to move this to VGPRs, we need to unpack the second operand
4386  // back into the 2 separate ones for bit offset and width.
4387  assert(OffsetWidthOp.isImm() &&
4388  "Scalar BFE is only implemented for constant width and offset");
4389  uint32_t Imm = OffsetWidthOp.getImm();
4390 
4391  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4392  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
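 // e.g. a packed operand of 0x100008 unpacks to Offset = 8, BitWidth = 16.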
4393  Inst.RemoveOperand(2); // Remove old immediate.
4394  Inst.addOperand(MachineOperand::CreateImm(Offset));
4395  Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4396  }
4397 
4398  bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4399  unsigned NewDstReg = AMDGPU::NoRegister;
4400  if (HasDst) {
4401  unsigned DstReg = Inst.getOperand(0).getReg();
4402  if (TargetRegisterInfo::isPhysicalRegister(DstReg))
4403  continue;
4404 
4405  // Update the destination register class.
4406  const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4407  if (!NewDstRC)
4408  continue;
4409 
4410  if (Inst.isCopy() &&
4411  TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
4412  NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4413  // Instead of creating a copy where src and dst are the same register
4414  // class, we just replace all uses of dst with src. These kinds of
4415  // copies interfere with the heuristics MachineSink uses to decide
4416  // whether or not to split a critical edge, since the pass assumes
4417  // that copies will end up as machine instructions and not be
4418  // eliminated.
4419  addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4420  MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4421  MRI.clearKillFlags(Inst.getOperand(1).getReg());
4422  Inst.getOperand(0).setReg(DstReg);
4423 
4424  // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4425  // these are deleted later, but at -O0 it would leave a suspicious
4426  // looking illegal copy of an undef register.
4427  for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4428  Inst.RemoveOperand(I);
4429  Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4430  continue;
4431  }
4432 
4433  NewDstReg = MRI.createVirtualRegister(NewDstRC);
4434  MRI.replaceRegWith(DstReg, NewDstReg);
4435  }
4436 
4437  // Legalize the operands
4438  legalizeOperands(Inst, MDT);
4439 
4440  if (HasDst)
4441  addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4442  }
4443 }
4444 
4445 // Add/sub require special handling to deal with carry outs.
4446 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4447  MachineDominatorTree *MDT) const {
4448  if (ST.hasAddNoCarry()) {
4449  // Assume there is no user of scc since we don't select this in that case.
4450  // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4451  // is used.
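 // Sketch of the rewrite when the subtarget has no-carry adds (registers are
 // illustrative):
 //   %d:sgpr_32 = S_ADD_I32 %a, %b, implicit-def $scc
 // becomes
 //   %d:vgpr_32 = V_ADD_U32_e64 %a, %b, 0    ; no carry out, clamp bit = 0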
4452 
4453  MachineBasicBlock &MBB = *Inst.getParent();
4454  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4455 
4456  unsigned OldDstReg = Inst.getOperand(0).getReg();
4457  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4458 
4459  unsigned Opc = Inst.getOpcode();
4460  assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4461 
4462  unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4463  AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4464 
4465  assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4466  Inst.RemoveOperand(3);
4467 
4468  Inst.setDesc(get(NewOpc));
4469  Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
4470  Inst.addImplicitDefUseOperands(*MBB.getParent());
4471  MRI.replaceRegWith(OldDstReg, ResultReg);
4472  legalizeOperands(Inst, MDT);
4473 
4474  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4475  return true;
4476  }
4477 
4478  return false;
4479 }
4480 
4481 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4482  MachineInstr &Inst) const {
4483  MachineBasicBlock &MBB = *Inst.getParent();
4484  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4485  MachineBasicBlock::iterator MII = Inst;
4486  DebugLoc DL = Inst.getDebugLoc();
4487 
4488  MachineOperand &Dest = Inst.getOperand(0);
4489  MachineOperand &Src = Inst.getOperand(1);
4490  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4491  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4492 
4493  unsigned SubOp = ST.hasAddNoCarry() ?
4494  AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4495 
4496  BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4497  .addImm(0)
4498  .addReg(Src.getReg());
4499 
4500  BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4501  .addReg(Src.getReg())
4502  .addReg(TmpReg);
4503 
4504  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4505  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4506 }
4507 
4508 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4509  MachineInstr &Inst) const {
4510  MachineBasicBlock &MBB = *Inst.getParent();
4511  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4512  MachineBasicBlock::iterator MII = Inst;
4513  const DebugLoc &DL = Inst.getDebugLoc();
4514 
4515  MachineOperand &Dest = Inst.getOperand(0);
4516  MachineOperand &Src0 = Inst.getOperand(1);
4517  MachineOperand &Src1 = Inst.getOperand(2);
4518 
4519  if (ST.hasDLInsts()) {
4520  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4521  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4522  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4523 
4524  BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4525  .add(Src0)
4526  .add(Src1);
4527 
4528  MRI.replaceRegWith(Dest.getReg(), NewDest);
4529  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4530  } else {
4531  // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
4532  // invert either source and then perform the XOR. If either source is a
4533  // scalar register, then we can leave the inversion on the scalar unit to
4534  // achieve a better distribution of scalar and vector instructions.
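    // For example, if only src0 is an SGPR (a sketch):
    //   s_not_b32 %t, %src0          ; stays on the scalar unit
    //   s_xor_b32 %d, %t, %src1      ; later moved to v_xor_b32 by this pass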
4535  bool Src0IsSGPR = Src0.isReg() &&
4536  RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
4537  bool Src1IsSGPR = Src1.isReg() &&
4538  RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
4539  MachineInstr *Not = nullptr;
4540  MachineInstr *Xor = nullptr;
4541  unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4542  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4543 
4544  // Build a pair of scalar instructions and add them to the work list.
4545  // The next iteration over the work list will lower these to the vector
4546  // unit as necessary.
4547  if (Src0IsSGPR) {
4548  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4549  .add(Src0);
4550  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4551  .addReg(Temp)
4552  .add(Src1);
4553  } else if (Src1IsSGPR) {
4554  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4555  .add(Src1);
4556  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4557  .add(Src0)
4558  .addReg(Temp);
4559  } else {
4560  Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
4561  .add(Src0)
4562  .add(Src1);
4563  Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4564  .addReg(Temp);
4565  Worklist.insert(Not);
4566  }
4567 
4568  MRI.replaceRegWith(Dest.getReg(), NewDest);
4569 
4570  Worklist.insert(Xor);
4571 
4572  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4573  }
4574 }
4575 
4576 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
4577  MachineInstr &Inst,
4578  unsigned Opcode) const {
4579  MachineBasicBlock &MBB = *Inst.getParent();
4580  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4581  MachineBasicBlock::iterator MII = Inst;
4582  const DebugLoc &DL = Inst.getDebugLoc();
4583 
4584  MachineOperand &Dest = Inst.getOperand(0);
4585  MachineOperand &Src0 = Inst.getOperand(1);
4586  MachineOperand &Src1 = Inst.getOperand(2);
4587 
4588  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4589  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4590 
4591  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
4592  .add(Src0)
4593  .add(Src1);
4594 
4595  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4596  .addReg(Interm);
4597 
4598  Worklist.insert(&Op);
4599  Worklist.insert(&Not);
4600 
4601  MRI.replaceRegWith(Dest.getReg(), NewDest);
4602  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4603 }
4604 
4605 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
4606  MachineInstr &Inst,
4607  unsigned Opcode) const {
4608  MachineBasicBlock &MBB = *Inst.getParent();
4609  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4610  MachineBasicBlock::iterator MII = Inst;
4611  const DebugLoc &DL = Inst.getDebugLoc();
4612 
4613  MachineOperand &Dest = Inst.getOperand(0);
4614  MachineOperand &Src0 = Inst.getOperand(1);
4615  MachineOperand &Src1 = Inst.getOperand(2);
4616 
4617  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4618  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4619 
4620  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
4621  .add(Src1);
4622 
4623  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
4624  .add(Src0)
4625  .addReg(Interm);
4626 
4627  Worklist.insert(&Not);
4628  Worklist.insert(&Op);
4629 
4630  MRI.replaceRegWith(Dest.getReg(), NewDest);
4631  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4632 }
4633 
4634 void SIInstrInfo::splitScalar64BitUnaryOp(
4635  SetVectorType &Worklist, MachineInstr &Inst,
4636  unsigned Opcode) const {
4637  MachineBasicBlock &MBB = *Inst.getParent();
4638  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4639 
4640  MachineOperand &Dest = Inst.getOperand(0);
4641  MachineOperand &Src0 = Inst.getOperand(1);
4642  DebugLoc DL = Inst.getDebugLoc();
4643 
4644  MachineBasicBlock::iterator MII = Inst;
4645 
4646  const MCInstrDesc &InstDesc = get(Opcode);
4647  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4648  MRI.getRegClass(Src0.getReg()) :
4649  &AMDGPU::SGPR_32RegClass;
4650 
4651  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4652 
4653  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4654  AMDGPU::sub0, Src0SubRC);
4655 
4656  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4657  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4658  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4659 
4660  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4661  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4662 
4663  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4664  AMDGPU::sub1, Src0SubRC);
4665 
4666  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4667  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4668 
4669  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4670  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4671  .addReg(DestSub0)
4672  .addImm(AMDGPU::sub0)
4673  .addReg(DestSub1)
4674  .addImm(AMDGPU::sub1);
4675 
4676  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4677 
4678  Worklist.insert(&LoHalf);
4679  Worklist.insert(&HiHalf);
4680 
4681  // We don't need to legalizeOperands here because for a single operand, src0
4682  // will support any kind of input.
4683 
4684  // Move all users of this moved value.
4685  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4686 }
4687 
4688 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
4689  MachineInstr &Inst,
4690  MachineDominatorTree *MDT) const {
4691  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4692 
4693  MachineBasicBlock &MBB = *Inst.getParent();
4694  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4695 
4696  unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4697  unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4698  unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4699 
4700  unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4701  unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4702 
4703  MachineOperand &Dest = Inst.getOperand(0);
4704  MachineOperand &Src0 = Inst.getOperand(1);
4705  MachineOperand &Src1 = Inst.getOperand(2);
4706  const DebugLoc &DL = Inst.getDebugLoc();
4707  MachineBasicBlock::iterator MII = Inst;
4708 
4709  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4710  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4711  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4712  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4713 
4714  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4715  AMDGPU::sub0, Src0SubRC);
4716  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4717  AMDGPU::sub0, Src1SubRC);
4718 
4719 
4720  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4721  AMDGPU::sub1, Src0SubRC);
4722  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4723  AMDGPU::sub1, Src1SubRC);
4724 
4725  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4726  MachineInstr *LoHalf =
4727  BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4728  .addReg(CarryReg, RegState::Define)
4729  .add(SrcReg0Sub0)
4730  .add(SrcReg1Sub0)
4731  .addImm(0); // clamp bit
4732 
4733  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4734  MachineInstr *HiHalf =
4735  BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4736  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4737  .add(SrcReg0Sub1)
4738  .add(SrcReg1Sub1)
4739  .addReg(CarryReg, RegState::Kill)
4740  .addImm(0); // clamp bit
4741 
4742  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4743  .addReg(DestSub0)
4744  .addImm(AMDGPU::sub0)
4745  .addReg(DestSub1)
4746  .addImm(AMDGPU::sub1);
4747 
4748  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4749 
4750  // Try to legalize the operands in case we need to swap the order to keep it
4751  // valid.
4752  legalizeOperands(*LoHalf, MDT);
4753  legalizeOperands(*HiHalf, MDT);
4754 
4755  // Move all users of this moved value.
4756  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4757 }
4758 
4759 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4760  MachineInstr &Inst, unsigned Opcode,
4761  MachineDominatorTree *MDT) const {
4762  MachineBasicBlock &MBB = *Inst.getParent();
4763  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4764 
4765  MachineOperand &Dest = Inst.getOperand(0);
4766  MachineOperand &Src0 = Inst.getOperand(1);
4767  MachineOperand &Src1 = Inst.getOperand(2);
4768  DebugLoc DL = Inst.getDebugLoc();
4769 
4770  MachineBasicBlock::iterator MII = Inst;
4771 
4772  const MCInstrDesc &InstDesc = get(Opcode);
4773  const TargetRegisterClass *Src0RC = Src0.isReg() ?
4774  MRI.getRegClass(Src0.getReg()) :
4775  &AMDGPU::SGPR_32RegClass;
4776 
4777  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4778  const TargetRegisterClass *Src1RC = Src1.isReg() ?
4779  MRI.getRegClass(Src1.getReg()) :
4780  &AMDGPU::SGPR_32RegClass;
4781 
4782  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4783 
4784  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4785  AMDGPU::sub0, Src0SubRC);
4786  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4787  AMDGPU::sub0, Src1SubRC);
4788  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4789  AMDGPU::sub1, Src0SubRC);
4790  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4791  AMDGPU::sub1, Src1SubRC);
4792 
4793  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4794  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4795  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4796 
4797  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4798  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4799  .add(SrcReg0Sub0)
4800  .add(SrcReg1Sub0);
4801 
4802  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4803  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4804  .add(SrcReg0Sub1)
4805  .add(SrcReg1Sub1);
4806 
4807  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4808  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4809  .addReg(DestSub0)
4810  .addImm(AMDGPU::sub0)
4811  .addReg(DestSub1)
4812  .addImm(AMDGPU::sub1);
4813 
4814  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4815 
4816  Worklist.insert(&LoHalf);
4817  Worklist.insert(&HiHalf);
4818 
4819  // Move all users of this moved value.
4820  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4821 }
4822 
4823 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
4824  MachineInstr &Inst,
4825  MachineDominatorTree *MDT) const {
4826  MachineBasicBlock &MBB = *Inst.getParent();
4827  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4828 
4829  MachineOperand &Dest = Inst.getOperand(0);
4830  MachineOperand &Src0 = Inst.getOperand(1);
4831  MachineOperand &Src1 = Inst.getOperand(2);
4832  const DebugLoc &DL = Inst.getDebugLoc();
4833 
4834  MachineBasicBlock::iterator MII = Inst;
4835 
4836  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4837 
4838  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4839 
4840  MachineOperand* Op0;
4841  MachineOperand* Op1;
4842 
4843  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
4844  Op0 = &Src0;
4845  Op1 = &Src1;
4846  } else {
4847  Op0 = &Src1;
4848  Op1 = &Src0;
4849  }
4850 
4851  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
4852  .add(*Op0);
4853 
4854  unsigned NewDest = MRI.createVirtualRegister(DestRC);
4855 
4856  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
4857  .addReg(Interm)
4858  .add(*Op1);
4859 
4860  MRI.replaceRegWith(Dest.getReg(), NewDest);
4861 
4862  Worklist.insert(&Xor);
4863 }
4864 
4865 void SIInstrInfo::splitScalar64BitBCNT(
4866  SetVectorType &Worklist, MachineInstr &Inst) const {
4867  MachineBasicBlock &MBB = *Inst.getParent();
4868  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4869 
4870  MachineBasicBlock::iterator MII = Inst;
4871  const DebugLoc &DL = Inst.getDebugLoc();
4872 
4873  MachineOperand &Dest = Inst.getOperand(0);
4874  MachineOperand &Src = Inst.getOperand(1);
4875 
4876  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4877  const TargetRegisterClass *SrcRC = Src.isReg() ?
4878  MRI.getRegClass(Src.getReg()) :
4879  &AMDGPU::SGPR_32RegClass;
4880 
4881  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4882  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4883 
4884  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4885 
4886  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4887  AMDGPU::sub0, SrcSubRC);
4888  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4889  AMDGPU::sub1, SrcSubRC);
4890 
4891  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4892 
4893  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4894 
4895  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4896 
4897  // We don't need to legalize operands here. src0 for either instruction can be
4898  // an SGPR, and the second input is unused or determined here.
4899  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4900 }
4901 
4902 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4903  MachineInstr &Inst) const {
4904  MachineBasicBlock &MBB = *Inst.getParent();
4905  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4906  MachineBasicBlock::iterator MII = Inst;
4907  const DebugLoc &DL = Inst.getDebugLoc();
4908 
4909  MachineOperand &Dest = Inst.getOperand(0);
4910  uint32_t Imm = Inst.getOperand(2).getImm();
4911  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4912  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4913 
4914  (void) Offset;
4915 
4916  // Only sext_inreg cases handled.
4917  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4918  Offset == 0 && "Not implemented");
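 // Sketch for BitWidth < 32: sign-extend within the low dword, then fill the
 // high dword with the sign bit:
 //   lo  = V_BFE_I32 src.sub0, 0, BitWidth
 //   hi  = V_ASHRREV_I32 31, lo
 //   dst = REG_SEQUENCE lo, sub0, hi, sub1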
4919 
4920  if (BitWidth < 32) {
4921  unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4922  unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4923  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4924 
4925  BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4926  .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4927  .addImm(0)
4928  .addImm(BitWidth);
4929 
4930  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4931  .addImm(31)
4932  .addReg(MidRegLo);
4933 
4934  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4935  .addReg(MidRegLo)
4936  .addImm(AMDGPU::sub0)
4937  .addReg(MidRegHi)
4938  .addImm(AMDGPU::sub1);
4939 
4940  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4941  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4942  return;
4943  }
4944 
4945  MachineOperand &Src = Inst.getOperand(1);
4946  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4947  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4948 
4949  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4950  .addImm(31)
4951  .addReg(Src.getReg(), 0, AMDGPU::sub0);
4952 
4953  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4954  .addReg(Src.getReg(), 0, AMDGPU::sub0)
4955  .addImm(AMDGPU::sub0)
4956  .addReg(TmpReg)
4957  .addImm(AMDGPU::sub1);
4958 
4959  MRI.replaceRegWith(Dest.getReg(), ResultReg);
4960  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4961 }
4962 
4963 void SIInstrInfo::addUsersToMoveToVALUWorklist(
4964  unsigned DstReg,
4965  MachineRegisterInfo &MRI,
4966  SetVectorType &Worklist) const {
4967  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4968  E = MRI.use_end(); I != E;) {
4969  MachineInstr &UseMI = *I->getParent();
4970 
4971  unsigned OpNo = 0;
4972 
4973  switch (UseMI.getOpcode()) {
4974  case AMDGPU::COPY:
4975  case AMDGPU::WQM:
4976  case AMDGPU::WWM:
4977  case AMDGPU::REG_SEQUENCE:
4978  case AMDGPU::PHI:
4979  case AMDGPU::INSERT_SUBREG:
4980  break;
4981  default:
4982  OpNo = I.getOperandNo();
4983  break;
4984  }
4985 
4986  if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) {
4987  Worklist.insert(&UseMI);
4988 
4989  do {
4990  ++I;
4991  } while (I != E && I->getParent() == &UseMI);
4992  } else {
4993  ++I;
4994  }
4995  }
4996 }
4997 
4998 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4999  MachineRegisterInfo &MRI,
5000  MachineInstr &Inst) const {
5001  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5002  MachineBasicBlock *MBB = Inst.getParent();
5003  MachineOperand &Src0 = Inst.getOperand(1);
5004  MachineOperand &Src1 = Inst.getOperand(2);
5005  const DebugLoc &DL = Inst.getDebugLoc();
5006 
5007  switch (Inst.getOpcode()) {
5008  case AMDGPU::S_PACK_LL_B32_B16: {
5009  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5010  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5011 
5012  // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
5013  // 0.
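 // What the expansion below computes (a sketch):
 //   ResultReg = (Src1 << 16) | (Src0 & 0xffff)
 // which matches the S_PACK_LL_B32_B16 semantics of packing the two low
 // halves into one dword.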
5014  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5015  .addImm(0xffff);
5016 
5017  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
5018  .addReg(ImmReg, RegState::Kill)
5019  .add(Src0);
5020 
5021  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
5022  .add(Src1)
5023  .addImm(16)
5024  .addReg(TmpReg, RegState::Kill);
5025  break;
5026  }
5027  case AMDGPU::S_PACK_LH_B32_B16: {
5028  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5029  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5030  .addImm(0xffff);
5031  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
5032  .addReg(ImmReg, RegState::Kill)
5033  .add(Src0)
5034  .add(Src1);
5035  break;
5036  }
5037  case AMDGPU::S_PACK_HH_B32_B16: {
5038  unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5039  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5040  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
5041  .addImm(16)
5042  .add(Src0);
5043  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5044  .addImm(0xffff0000);
5045  BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
5046  .add(Src1)
5047  .addReg(ImmReg, RegState::Kill)
5048  .addReg(TmpReg, RegState::Kill);
5049  break;
5050  }
5051  default:
5052  llvm_unreachable("unhandled s_pack_* instruction");
5053  }
5054 
5055  MachineOperand &Dest = Inst.getOperand(0);
5056  MRI.replaceRegWith(Dest.getReg(), ResultReg);
5057  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5058 }
5059 
5060 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
5061  MachineInstr &SCCDefInst,
5062  SetVectorType &Worklist) const {
5063  // Ensure that def inst defines SCC, which is still live.
5064  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
5065  !Op.isDead() && Op.getParent() == &SCCDefInst);
5066  // This assumes that all the users of SCC are in the same block
5067  // as the SCC def.
5068  for (MachineInstr &MI : // Skip the def inst itself.
5069  make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
5070  SCCDefInst.getParent()->end())) {
5071  // Check if SCC is used first.
5072  if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
5073  Worklist.insert(&MI);
5074  // Exit if we find another SCC def.
5075  if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
5076  return;
5077  }
5078 }
5079 
5080 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
5081  const MachineInstr &Inst) const {
5082  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
5083 
5084  switch (Inst.getOpcode()) {
5085  // For target instructions, getOpRegClass just returns the virtual register
5086  // class associated with the operand, so we need to find an equivalent VGPR
5087  // register class in order to move the instruction to the VALU.
5088  case AMDGPU::COPY:
5089  case AMDGPU::PHI:
5090  case AMDGPU::REG_SEQUENCE:
5091  case AMDGPU::INSERT_SUBREG:
5092  case AMDGPU::WQM:
5093  case AMDGPU::WWM:
5094  if (RI.hasVGPRs(NewDstRC))
5095  return nullptr;
5096 
5097  NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
5098  if (!NewDstRC)
5099  return nullptr;
5100  return NewDstRC;
5101  default:
5102  return NewDstRC;
5103  }
5104 }
5105 
5106 // Find the one SGPR operand we are allowed to use.
5107 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
5108  int OpIndices[3]) const {
5109  const MCInstrDesc &Desc = MI.getDesc();
5110 
5111  // Find the one SGPR operand we are allowed to use.
5112  //
5113  // First we need to consider the instruction's operand requirements before
5114  // legalizing. Some operands are required to be SGPRs, such as implicit uses
5115  // of VCC, but we are still bound by the constant bus requirement to only use
5116  // one.
5117  //
5118  // If the operand's class is an SGPR, we can never move it.
5119 
5120  unsigned SGPRReg = findImplicitSGPRRead(MI);
5121  if (SGPRReg != AMDGPU::NoRegister)
5122  return SGPRReg;
5123 
5124  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
5125  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5126 
5127  for (unsigned i = 0; i < 3; ++i) {
5128  int Idx = OpIndices[i];
5129  if (Idx == -1)
5130  break;
5131 
5132  const MachineOperand &MO = MI.getOperand(Idx);
5133  if (!MO.isReg())
5134  continue;
5135 
5136  // Is this operand statically required to be an SGPR based on the operand
5137  // constraints?
5138  const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
5139  bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
5140  if (IsRequiredSGPR)
5141  return MO.getReg();
5142 
5143  // If this could be a VGPR or an SGPR, check the dynamic register class.
5144  unsigned Reg = MO.getReg();
5145  const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
5146  if (RI.isSGPRClass(RegRC))
5147  UsedSGPRs[i] = Reg;
5148  }
5149 
5150  // We don't have a required SGPR operand, so we have a bit more freedom in
5151  // selecting operands to move.
5152 
5153  // Try to select the most used SGPR. If an SGPR is equal to one of the
5154  // others, we choose that.
5155  //
5156  // e.g.
5157  // V_FMA_F32 v0, s0, s0, s0 -> No moves
5158  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
5159 
5160  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
5161  // prefer those.
5162 
5163  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
5164  if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
5165  SGPRReg = UsedSGPRs[0];
5166  }
5167 
5168  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
5169  if (UsedSGPRs[1] == UsedSGPRs[2])
5170  SGPRReg = UsedSGPRs[1];
5171  }
5172 
5173  return SGPRReg;
5174 }
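// Sketch of the typical caller pattern (Idx, TII and MI are illustrative
// names; the VOP3 legalization code earlier in this file gathers the source
// indices in roughly this way):
//
//   int Idx[3] = {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
//                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
//                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)};
//   unsigned KeepSGPR = TII->findUsedSGPR(MI, Idx);
//
// Every SGPR source other than KeepSGPR must then be copied into a VGPR so
// the instruction performs only one constant-bus read.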
5175 
5176 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
5177  unsigned OperandName) const {
5178  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
5179  if (Idx == -1)
5180  return nullptr;
5181 
5182  return &MI.getOperand(Idx);
5183 }
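// Minimal usage sketch (hypothetical caller); looking operands up by name
// avoids hard-coding positions that differ between encodings:
//
//   const MachineOperand *Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
//   int64_t ImmOff = Off ? Off->getImm() : 0;  // null => opcode has no 'offset'
//
// A null return simply means the opcode has no operand with that name.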
5184 
5185 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
5186  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
5187  if (ST.isAmdHsaOS()) {
5188  // Set ATC = 1. GFX9 doesn't have this bit.
5189  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5190  RsrcDataFormat |= (1ULL << 56);
5191 
5192  // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
5193  // Note that this also disables the TC L2 cache and therefore decreases performance.
5194  if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
5195  RsrcDataFormat |= (2ULL << 59);
5196  }
5197 
5198  return RsrcDataFormat;
5199 }
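// Worked example of the bits set above: on an amdhsa Volcanic Islands target
// the returned word is
//
//   RSRC_DATA_FORMAT | (1ULL << 56) | (2ULL << 59)
//                       ^ ATC = 1     ^ MTYPE = 2 (uncached)
//
// whereas GFX9 lacks both fields and non-HSA targets leave them at zero, so
// only the plain RSRC_DATA_FORMAT is returned in those cases.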
5200 
5201 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
5202  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
5203  AMDGPU::RSRC_TID_ENABLE |
5204  0xffffffff; // Size;
5205 
5206  // GFX9 doesn't have ELEMENT_SIZE.
5207  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5208  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
5209  Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
5210  }
5211 
5212  // IndexStride = 64.
5213  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
5214 
5215  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
5216  // Clear them unless we want a huge stride.
5217  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5218  Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
5219 
5220  return Rsrc23;
5221 }
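// Worked example of the field arithmetic above, assuming a pre-GFX9 target
// whose maximum private element size is 4 bytes:
//
//   EltSizeValue = Log2_32(4) - 1 = 1            // field encoding for 4-byte elements
//   Rsrc23 |= 1ULL << RSRC_ELEMENT_SIZE_SHIFT
//   Rsrc23 |= 3ULL << RSRC_INDEX_STRIDE_SHIFT    // encoding 3 selects a stride of 64
//
// From Volcanic Islands onwards the DATA_FORMAT bits are also cleared, so
// they cannot be reinterpreted as extra stride bits when TID_ENABLE is set.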
5222 
5223 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
5224  unsigned Opc = MI.getOpcode();
5225 
5226  return isSMRD(Opc);
5227 }
5228 
5229 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
5230  unsigned Opc = MI.getOpcode();
5231 
5232  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
5233 }
5234 
5235 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
5236  int &FrameIndex) const {
5237  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5238  if (!Addr || !Addr->isFI())
5239  return AMDGPU::NoRegister;
5240 
5241  assert(!MI.memoperands_empty() &&
5242  "Frame index access doesn't have memoperand");
5243 
5244  FrameIndex = Addr->getIndex();
5245  return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
5246 }
5247 
5248 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
5249  int &FrameIndex) const {
5250  const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
5251  assert(Addr && Addr->isFI());
5252  FrameIndex = Addr->getIndex();
5253  return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
5254 }
5255 
5256 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
5257  int &FrameIndex) const {
5258  if (!MI.mayLoad())
5259  return AMDGPU::NoRegister;
5260 
5261  if (isMUBUF(MI) || isVGPRSpill(MI))
5262  return isStackAccess(MI, FrameIndex);
5263 
5264  if (isSGPRSpill(MI))
5265  return isSGPRStackAccess(MI, FrameIndex);
5266 
5267  return AMDGPU::NoRegister;
5268 }
5269 
5270 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
5271  int &FrameIndex) const {
5272  if (!MI.mayStore())
5273  return AMDGPU::NoRegister;
5274 
5275  if (isMUBUF(MI) || isVGPRSpill(MI))
5276  return isStackAccess(MI, FrameIndex);
5277 
5278  if (isSGPRSpill(MI))
5279  return isSGPRStackAccess(MI, FrameIndex);
5280 
5281  return AMDGPU::NoRegister;
5282 }
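// Hedged usage sketch (TII, MI and FI are illustrative names): target-
// independent passes query these hooks to recognize plain frame-index loads
// and stores, e.g. when reasoning about spills and reloads.
//
//   int FI;
//   if (unsigned Reg = TII->isLoadFromStackSlot(MI, FI))
//     dbgs() << printReg(Reg) << " reloaded from frame index " << FI << '\n';
//
// AMDGPU::NoRegister is returned whenever the instruction is not a simple
// stack access (or does not load/store at all), and FI is left untouched.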
5283 
5284 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
5285  unsigned Size = 0;
5286  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
5287  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
5288  while (++I != E && I->isInsideBundle()) {
5289  assert(!I->isBundle() && "No nested bundle!");
5290  Size += getInstSizeInBytes(*I);
5291  }
5292 
5293  return Size;
5294 }
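// Example: for a hypothetical bundle
//
//   BUNDLE implicit-def $vgpr0 {
//     V_MOV_B32_e32 ...    ; 4-byte encoding
//     V_ADD_F32_e64 ...    ; 8-byte encoding
//   }
//
// the loop above sums the bundled instructions to 12 bytes; the BUNDLE
// marker itself has no encoding and contributes nothing on its own.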
5295 
5296 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
5297  unsigned Opc = MI.getOpcode();