1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "GCNHazardRecognizer.h"
18 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/MC/MCContext.h"
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "si-instr-info"
36 
37 #define GET_INSTRINFO_CTOR_DTOR
38 #include "AMDGPUGenInstrInfo.inc"
39 
40 namespace llvm {
41 
42 class AAResults;
43 
44 namespace AMDGPU {
45 #define GET_D16ImageDimIntrinsics_IMPL
46 #define GET_ImageDimIntrinsicTable_IMPL
47 #define GET_RsrcIntrinsics_IMPL
48 #include "AMDGPUGenSearchableTables.inc"
49 }
50 }
51 
52 
53 // Must be at least 4 to be able to branch over minimum unconditional branch
54 // code. This is only for making it possible to write reasonably small tests for
55 // long branches.
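56 // For example (illustrative), a long-branch test can pass
57 // -amdgpu-s-branch-bits=5 so that relaxation to an indirect branch triggers
58 // after only a handful of instructions rather than requiring an impractically
59 // large block of code.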
56 static cl::opt<unsigned>
57 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58  cl::desc("Restrict range of branch instructions (DEBUG)"));
 59 
 60 static cl::opt<bool> Fix16BitCopies(
 61  "amdgpu-fix-16-bit-physreg-copies",
 62  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
 63  cl::init(true),
 64  cl::ReallyHidden);
 65 
 66 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
 67  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
68  RI(ST), ST(ST) {
69  SchedModel.init(&ST);
70 }
71 
72 //===----------------------------------------------------------------------===//
73 // TargetInstrInfo callbacks
74 //===----------------------------------------------------------------------===//
75 
76 static unsigned getNumOperandsNoGlue(SDNode *Node) {
77  unsigned N = Node->getNumOperands();
78  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79  --N;
80  return N;
81 }
82 
 83 /// Returns true if both nodes have the same value for the given
 84 /// operand \p OpName, or if both nodes do not have this operand.
85 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86  unsigned Opc0 = N0->getMachineOpcode();
87  unsigned Opc1 = N1->getMachineOpcode();
88 
89  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91 
92  if (Op0Idx == -1 && Op1Idx == -1)
93  return true;
94 
95 
96  if ((Op0Idx == -1 && Op1Idx != -1) ||
97  (Op1Idx == -1 && Op0Idx != -1))
98  return false;
99 
100  // getNamedOperandIdx returns the index for the MachineInstr's operands,
101  // which includes the result as the first operand. We are indexing into the
102  // MachineSDNode's operands, so we need to skip the result operand to get
103  // the real index.
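  // Illustrative example (not part of the original source): for an instruction
  // such as
  //   %vdst = DS_READ_B32 %addr, offset, gds
  // getNamedOperandIdx counts %vdst and reports 'offset' at index 2, while the
  // MachineSDNode operand list has no result, so the matching operand is at
  // index 1.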
104  --Op0Idx;
105  --Op1Idx;
106 
107  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108 }
109 
 110 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
 111                                                     AAResults *AA) const {
112  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) {
 113  // Normally a VALU use of exec would block rematerialization, but it is OK
 114  // in this case to have an implicit exec read, as all VALU instructions do.
 115  // We really want all of the generic logic for this except for this check.
116 
117  // Another potential implicit use is mode register. The core logic of
118  // the RA will not attempt rematerialization if mode is set anywhere
119  // in the function, otherwise it is safe since mode is not changed.
120  return !MI.hasImplicitDef() &&
121  MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
122  !MI.mayRaiseFPException();
123  }
124 
125  return false;
126 }
127 
 128 bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
 129  // Any implicit use of exec by VALU is not a real register read.
130  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
131  isVALU(*MO.getParent());
132 }
133 
 134 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
 135                                           int64_t &Offset0,
136  int64_t &Offset1) const {
137  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
138  return false;
139 
140  unsigned Opc0 = Load0->getMachineOpcode();
141  unsigned Opc1 = Load1->getMachineOpcode();
142 
143  // Make sure both are actually loads.
144  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
145  return false;
146 
147  if (isDS(Opc0) && isDS(Opc1)) {
148 
149  // FIXME: Handle this case:
150  if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
151  return false;
152 
153  // Check base reg.
154  if (Load0->getOperand(0) != Load1->getOperand(0))
155  return false;
156 
157  // Skip read2 / write2 variants for simplicity.
 158  // TODO: We should report true if the used offsets are adjacent (excluding
 159  // st64 versions).
160  int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
161  int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
162  if (Offset0Idx == -1 || Offset1Idx == -1)
163  return false;
164 
 165  // XXX - be careful of dataless loads
166  // getNamedOperandIdx returns the index for MachineInstrs. Since they
167  // include the output in the operand list, but SDNodes don't, we need to
168  // subtract the index by one.
169  Offset0Idx -= get(Opc0).NumDefs;
170  Offset1Idx -= get(Opc1).NumDefs;
171  Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
172  Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
173  return true;
174  }
175 
176  if (isSMRD(Opc0) && isSMRD(Opc1)) {
177  // Skip time and cache invalidation instructions.
178  if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
179  AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
180  return false;
181 
183 
184  // Check base reg.
185  if (Load0->getOperand(0) != Load1->getOperand(0))
186  return false;
187 
188  const ConstantSDNode *Load0Offset =
189  dyn_cast<ConstantSDNode>(Load0->getOperand(1));
190  const ConstantSDNode *Load1Offset =
191  dyn_cast<ConstantSDNode>(Load1->getOperand(1));
192 
193  if (!Load0Offset || !Load1Offset)
194  return false;
195 
196  Offset0 = Load0Offset->getZExtValue();
197  Offset1 = Load1Offset->getZExtValue();
198  return true;
199  }
200 
201  // MUBUF and MTBUF can access the same addresses.
202  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
203 
204  // MUBUF and MTBUF have vaddr at different indices.
205  if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
206  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
207  !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
208  return false;
209 
210  int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
211  int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
212 
213  if (OffIdx0 == -1 || OffIdx1 == -1)
214  return false;
215 
216  // getNamedOperandIdx returns the index for MachineInstrs. Since they
217  // include the output in the operand list, but SDNodes don't, we need to
218  // subtract the index by one.
219  OffIdx0 -= get(Opc0).NumDefs;
220  OffIdx1 -= get(Opc1).NumDefs;
221 
222  SDValue Off0 = Load0->getOperand(OffIdx0);
223  SDValue Off1 = Load1->getOperand(OffIdx1);
224 
225  // The offset might be a FrameIndexSDNode.
226  if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
227  return false;
228 
229  Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
230  Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
231  return true;
232  }
233 
234  return false;
235 }
236 
237 static bool isStride64(unsigned Opc) {
238  switch (Opc) {
239  case AMDGPU::DS_READ2ST64_B32:
240  case AMDGPU::DS_READ2ST64_B64:
241  case AMDGPU::DS_WRITE2ST64_B32:
242  case AMDGPU::DS_WRITE2ST64_B64:
243  return true;
244  default:
245  return false;
246  }
247 }
248 
 249 bool SIInstrInfo::getMemOperandsWithOffsetWidth(
 250     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
 251     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
252  const TargetRegisterInfo *TRI) const {
253  if (!LdSt.mayLoadOrStore())
254  return false;
255 
256  unsigned Opc = LdSt.getOpcode();
257  OffsetIsScalable = false;
258  const MachineOperand *BaseOp, *OffsetOp;
259  int DataOpIdx;
260 
261  if (isDS(LdSt)) {
262  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
263  OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
264  if (OffsetOp) {
265  // Normal, single offset LDS instruction.
266  if (!BaseOp) {
267  // DS_CONSUME/DS_APPEND use M0 for the base address.
268  // TODO: find the implicit use operand for M0 and use that as BaseOp?
269  return false;
270  }
271  BaseOps.push_back(BaseOp);
272  Offset = OffsetOp->getImm();
273  // Get appropriate operand, and compute width accordingly.
274  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
275  if (DataOpIdx == -1)
276  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
277  Width = getOpSize(LdSt, DataOpIdx);
278  } else {
279  // The 2 offset instructions use offset0 and offset1 instead. We can treat
280  // these as a load with a single offset if the 2 offsets are consecutive.
281  // We will use this for some partially aligned loads.
282  const MachineOperand *Offset0Op =
283  getNamedOperand(LdSt, AMDGPU::OpName::offset0);
284  const MachineOperand *Offset1Op =
285  getNamedOperand(LdSt, AMDGPU::OpName::offset1);
286 
287  unsigned Offset0 = Offset0Op->getImm();
288  unsigned Offset1 = Offset1Op->getImm();
289  if (Offset0 + 1 != Offset1)
290  return false;
291 
292  // Each of these offsets is in element sized units, so we need to convert
293  // to bytes of the individual reads.
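  // Illustrative example: a DS_READ2_B32 with offset0 = 2 and offset1 = 3 has
  // a 64-bit destination, so EltSize = 64 / 16 = 4 bytes and the pair is
  // reported as a single access at Offset = 2 * 4 = 8 with an 8-byte Width.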
294 
295  unsigned EltSize;
296  if (LdSt.mayLoad())
297  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
298  else {
299  assert(LdSt.mayStore());
300  int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
301  EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
302  }
303 
304  if (isStride64(Opc))
305  EltSize *= 64;
306 
307  BaseOps.push_back(BaseOp);
308  Offset = EltSize * Offset0;
309  // Get appropriate operand(s), and compute width accordingly.
310  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
311  if (DataOpIdx == -1) {
312  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
313  Width = getOpSize(LdSt, DataOpIdx);
314  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
315  Width += getOpSize(LdSt, DataOpIdx);
316  } else {
317  Width = getOpSize(LdSt, DataOpIdx);
318  }
319  }
320  return true;
321  }
322 
323  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
324  const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
325  if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
326  return false;
327  BaseOps.push_back(RSrc);
328  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
329  if (BaseOp && !BaseOp->isFI())
330  BaseOps.push_back(BaseOp);
331  const MachineOperand *OffsetImm =
332  getNamedOperand(LdSt, AMDGPU::OpName::offset);
333  Offset = OffsetImm->getImm();
334  const MachineOperand *SOffset =
335  getNamedOperand(LdSt, AMDGPU::OpName::soffset);
336  if (SOffset) {
337  if (SOffset->isReg())
338  BaseOps.push_back(SOffset);
339  else
340  Offset += SOffset->getImm();
341  }
342  // Get appropriate operand, and compute width accordingly.
343  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
344  if (DataOpIdx == -1)
345  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
346  Width = getOpSize(LdSt, DataOpIdx);
347  return true;
348  }
349 
350  if (isMIMG(LdSt)) {
351  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
352  BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
353  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
354  if (VAddr0Idx >= 0) {
355  // GFX10 possible NSA encoding.
356  for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
357  BaseOps.push_back(&LdSt.getOperand(I));
358  } else {
359  BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
360  }
361  Offset = 0;
362  // Get appropriate operand, and compute width accordingly.
363  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
364  Width = getOpSize(LdSt, DataOpIdx);
365  return true;
366  }
367 
368  if (isSMRD(LdSt)) {
369  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
370  if (!BaseOp) // e.g. S_MEMTIME
371  return false;
372  BaseOps.push_back(BaseOp);
373  OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
374  Offset = OffsetOp ? OffsetOp->getImm() : 0;
375  // Get appropriate operand, and compute width accordingly.
376  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
377  Width = getOpSize(LdSt, DataOpIdx);
378  return true;
379  }
380 
381  if (isFLAT(LdSt)) {
382  // Instructions have either vaddr or saddr or both or none.
383  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
384  if (BaseOp)
385  BaseOps.push_back(BaseOp);
386  BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
387  if (BaseOp)
388  BaseOps.push_back(BaseOp);
389  Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
390  // Get appropriate operand, and compute width accordingly.
391  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
392  if (DataOpIdx == -1)
393  DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
394  Width = getOpSize(LdSt, DataOpIdx);
395  return true;
396  }
397 
398  return false;
399 }
400 
 401 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
 402                                   ArrayRef<const MachineOperand *> BaseOps1,
 403                                   const MachineInstr &MI2,
 404                                   ArrayRef<const MachineOperand *> BaseOps2) {
 405  // Only examine the first "base" operand of each instruction, on the
406  // assumption that it represents the real base address of the memory access.
407  // Other operands are typically offsets or indices from this base address.
408  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
409  return true;
410 
411  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
412  return false;
413 
414  auto MO1 = *MI1.memoperands_begin();
415  auto MO2 = *MI2.memoperands_begin();
416  if (MO1->getAddrSpace() != MO2->getAddrSpace())
417  return false;
418 
419  auto Base1 = MO1->getValue();
420  auto Base2 = MO2->getValue();
421  if (!Base1 || !Base2)
422  return false;
423  Base1 = getUnderlyingObject(Base1);
424  Base2 = getUnderlyingObject(Base2);
425 
426  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
427  return false;
428 
429  return Base1 == Base2;
430 }
431 
 432 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
 433                                       ArrayRef<const MachineOperand *> BaseOps2,
 434                                       unsigned NumLoads,
435  unsigned NumBytes) const {
436  // If the mem ops (to be clustered) do not have the same base ptr, then they
437  // should not be clustered
438  if (!BaseOps1.empty() && !BaseOps2.empty()) {
439  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
440  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
441  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
442  return false;
443  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
444  // If only one base op is empty, they do not have the same base ptr
445  return false;
446  }
447 
 448  // In order to avoid register pressure, on average, the number of DWORDS
 449  // loaded together by all clustered mem ops should not exceed 8. This is an
 450  // empirical value based on certain observations and performance related
 451  // experiments.
 452  // The good thing about this heuristic is that it avoids clustering of too
 453  // many sub-word loads, and also avoids clustering of wide loads. Below is a
 454  // brief summary of how the heuristic behaves for various `LoadSize`.
455  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
456  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
457  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
458  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
459  // (5) LoadSize >= 17: do not cluster
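  // Worked example (illustrative): clustering 4 loads of 6 bytes each gives
  // LoadSize = 24 / 4 = 6 and NumDWORDs = ((6 + 3) / 4) * 4 = 8, so they are
  // clustered; 4 loads of 12 bytes give NumDWORDs = 12 > 8 and are not.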
460  const unsigned LoadSize = NumBytes / NumLoads;
461  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
462  return NumDWORDs <= 8;
463 }
464 
 465 // FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
 466 // the first 16 loads will be interleaved with the stores, and the next 16 will
 467 // be clustered as expected. It should really split into 2 batches of 16 stores.
468 //
469 // Loads are clustered until this returns false, rather than trying to schedule
470 // groups of stores. This also means we have to deal with saying different
471 // address space loads should be clustered, and ones which might cause bank
472 // conflicts.
473 //
474 // This might be deprecated so it might not be worth that much effort to fix.
 475 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
 476                                           int64_t Offset0, int64_t Offset1,
477  unsigned NumLoads) const {
478  assert(Offset1 > Offset0 &&
479  "Second offset should be larger than first offset!");
480  // If we have less than 16 loads in a row, and the offsets are within 64
481  // bytes, then schedule together.
482 
483  // A cacheline is 64 bytes (for global memory).
484  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
485 }
486 
 487 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
 488                               MachineBasicBlock::iterator MI,
 489                               const DebugLoc &DL, MCRegister DestReg,
490  MCRegister SrcReg, bool KillSrc,
491  const char *Msg = "illegal SGPR to VGPR copy") {
492  MachineFunction *MF = MBB.getParent();
493  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
494  LLVMContext &C = MF->getFunction().getContext();
495  C.diagnose(IllegalCopy);
496 
497  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
498  .addReg(SrcReg, getKillRegState(KillSrc));
499 }
500 
501 /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
502 /// to directly copy, so an intermediate VGPR needs to be used.
 503 static void indirectCopyToAGPR(const SIInstrInfo &TII,
 504                                MachineBasicBlock &MBB,
 505                                MachineBasicBlock::iterator MI,
 506                                const DebugLoc &DL, MCRegister DestReg,
507  MCRegister SrcReg, bool KillSrc,
508  RegScavenger &RS,
509  Register ImpDefSuperReg = Register(),
510  Register ImpUseSuperReg = Register()) {
511  const SIRegisterInfo &RI = TII.getRegisterInfo();
512 
513  assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
514  AMDGPU::AGPR_32RegClass.contains(SrcReg));
515 
516  // First try to find defining accvgpr_write to avoid temporary registers.
517  for (auto Def = MI, E = MBB.begin(); Def != E; ) {
518  --Def;
519  if (!Def->definesRegister(SrcReg, &RI))
520  continue;
521  if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
522  break;
523 
524  MachineOperand &DefOp = Def->getOperand(1);
525  assert(DefOp.isReg() || DefOp.isImm());
526 
527  if (DefOp.isReg()) {
 528  // Check that the register source operand is not clobbered before MI.
529  // Immediate operands are always safe to propagate.
530  bool SafeToPropagate = true;
531  for (auto I = Def; I != MI && SafeToPropagate; ++I)
532  if (I->modifiesRegister(DefOp.getReg(), &RI))
533  SafeToPropagate = false;
534 
535  if (!SafeToPropagate)
536  break;
537 
538  DefOp.setIsKill(false);
539  }
540 
 541  MachineInstrBuilder Builder =
 542    BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
543  .add(DefOp);
544  if (ImpDefSuperReg)
545  Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
546 
547  if (ImpUseSuperReg) {
 548  Builder.addReg(ImpUseSuperReg,
 549                 getKillRegState(KillSrc) | RegState::Implicit);
 550  }
551 
552  return;
553  }
554 
555  RS.enterBasicBlock(MBB);
556  RS.forward(MI);
557 
558  // Ideally we want to have three registers for a long reg_sequence copy
559  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
560  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
561  *MBB.getParent());
562 
563  // Registers in the sequence are allocated contiguously so we can just
564  // use register number to pick one of three round-robin temps.
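  // Illustrative example: for a multi-register AGPR destination the
  // contiguous sub-register numbers map to RegNo 0, 1, 2, 0, ..., so
  // back-to-back accvgpr_write instructions do not all reuse the same
  // temporary VGPR.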
565  unsigned RegNo = DestReg % 3;
566  Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
567  if (!Tmp)
568  report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
569  RS.setRegUsed(Tmp);
570 
571  if (!TII.getSubtarget().hasGFX90AInsts()) {
 572  // Only loop through if there are any free registers left; otherwise the
 573  // scavenger may report a fatal error when there is no emergency spill
 574  // slot, or may spill using the slot.
575  while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
576  Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
577  if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
578  break;
579  Tmp = Tmp2;
580  RS.setRegUsed(Tmp);
581  }
582  }
583 
584  // Insert copy to temporary VGPR.
585  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
586  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
587  TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
588  } else {
589  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
590  }
591 
592  MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
593  .addReg(SrcReg, getKillRegState(KillSrc));
594  if (ImpUseSuperReg) {
 595  UseBuilder.addReg(ImpUseSuperReg,
 596                    getKillRegState(KillSrc) | RegState::Implicit);
 597  }
598 
599  MachineInstrBuilder DefBuilder
600  = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
601  .addReg(Tmp, RegState::Kill);
602 
603  if (ImpDefSuperReg)
604  DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
605 }
606 
 607 static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
 608                            MachineBasicBlock::iterator I, const DebugLoc &DL,
 609                            MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
610  const TargetRegisterClass *RC, bool Forward) {
611  const SIRegisterInfo &RI = TII.getRegisterInfo();
612  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
614  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
615 
616  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
617  int16_t SubIdx = BaseIndices[Idx];
618  Register Reg = RI.getSubReg(DestReg, SubIdx);
619  unsigned Opcode = AMDGPU::S_MOV_B32;
620 
621  // Is SGPR aligned? If so try to combine with next.
622  Register Src = RI.getSubReg(SrcReg, SubIdx);
623  bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
624  bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
625  if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
626  // Can use SGPR64 copy
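  // Illustrative example: copying s[4:7] to s[8:11] emits
  //   S_MOV_B64 s[8:9], s[4:5]
  //   S_MOV_B64 s[10:11], s[6:7]
  // instead of four S_MOV_B32 instructions, since both halves are even-aligned.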
627  unsigned Channel = RI.getChannelFromSubReg(SubIdx);
628  SubIdx = RI.getSubRegFromChannel(Channel, 2);
629  Opcode = AMDGPU::S_MOV_B64;
630  Idx++;
631  }
632 
633  LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
634  .addReg(RI.getSubReg(SrcReg, SubIdx))
635  .addReg(SrcReg, RegState::Implicit);
636 
637  if (!FirstMI)
638  FirstMI = LastMI;
639 
640  if (!Forward)
641  I--;
642  }
643 
644  assert(FirstMI && LastMI);
645  if (!Forward)
646  std::swap(FirstMI, LastMI);
647 
648  FirstMI->addOperand(
649  MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
650 
651  if (KillSrc)
652  LastMI->addRegisterKilled(SrcReg, &RI);
653 }
654 
 655 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 656                               MachineBasicBlock::iterator MI,
 657                               const DebugLoc &DL, MCRegister DestReg,
658  MCRegister SrcReg, bool KillSrc) const {
659  const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
660 
 661  // FIXME: This is a hack to resolve copies between 16 bit and 32 bit
 662  // registers until all patterns are fixed.
663  if (Fix16BitCopies &&
664  ((RI.getRegSizeInBits(*RC) == 16) ^
665  (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
666  MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
667  MCRegister Super = RI.get32BitRegister(RegToFix);
668  assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
669  RegToFix = Super;
670 
671  if (DestReg == SrcReg) {
672  // Insert empty bundle since ExpandPostRA expects an instruction here.
673  BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
674  return;
675  }
676 
677  RC = RI.getPhysRegClass(DestReg);
678  }
679 
680  if (RC == &AMDGPU::VGPR_32RegClass) {
681  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
682  AMDGPU::SReg_32RegClass.contains(SrcReg) ||
683  AMDGPU::AGPR_32RegClass.contains(SrcReg));
684  unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
685  AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
686  BuildMI(MBB, MI, DL, get(Opc), DestReg)
687  .addReg(SrcReg, getKillRegState(KillSrc));
688  return;
689  }
690 
691  if (RC == &AMDGPU::SReg_32_XM0RegClass ||
692  RC == &AMDGPU::SReg_32RegClass) {
693  if (SrcReg == AMDGPU::SCC) {
694  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
695  .addImm(1)
696  .addImm(0);
697  return;
698  }
699 
700  if (DestReg == AMDGPU::VCC_LO) {
701  if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
702  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
703  .addReg(SrcReg, getKillRegState(KillSrc));
704  } else {
705  // FIXME: Hack until VReg_1 removed.
706  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
707  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
708  .addImm(0)
709  .addReg(SrcReg, getKillRegState(KillSrc));
710  }
711 
712  return;
713  }
714 
715  if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
716  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
717  return;
718  }
719 
720  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
721  .addReg(SrcReg, getKillRegState(KillSrc));
722  return;
723  }
724 
725  if (RC == &AMDGPU::SReg_64RegClass) {
726  if (SrcReg == AMDGPU::SCC) {
727  BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
728  .addImm(1)
729  .addImm(0);
730  return;
731  }
732 
733  if (DestReg == AMDGPU::VCC) {
734  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
735  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
736  .addReg(SrcReg, getKillRegState(KillSrc));
737  } else {
738  // FIXME: Hack until VReg_1 removed.
739  assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
740  BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
741  .addImm(0)
742  .addReg(SrcReg, getKillRegState(KillSrc));
743  }
744 
745  return;
746  }
747 
748  if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
749  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
750  return;
751  }
752 
753  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
754  .addReg(SrcReg, getKillRegState(KillSrc));
755  return;
756  }
757 
758  if (DestReg == AMDGPU::SCC) {
759  // Copying 64-bit or 32-bit sources to SCC barely makes sense,
760  // but SelectionDAG emits such copies for i1 sources.
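  // The copy is materialized as a compare against zero, i.e. SCC := (SrcReg != 0),
  // using S_CMP_LG_U64 or S_CMP_LG_U32 below.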
761  if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
762  // This copy can only be produced by patterns
763  // with explicit SCC, which are known to be enabled
764  // only for subtargets with S_CMP_LG_U64 present.
766  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
767  .addReg(SrcReg, getKillRegState(KillSrc))
768  .addImm(0);
769  } else {
770  assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
771  BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
772  .addReg(SrcReg, getKillRegState(KillSrc))
773  .addImm(0);
774  }
775 
776  return;
777  }
778 
779  if (RC == &AMDGPU::AGPR_32RegClass) {
780  if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
781  BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
782  .addReg(SrcReg, getKillRegState(KillSrc));
783  return;
784  }
785 
786  if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
787  BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
788  .addReg(SrcReg, getKillRegState(KillSrc));
789  return;
790  }
791 
792  // FIXME: Pass should maintain scavenger to avoid scan through the block on
793  // every AGPR spill.
794  RegScavenger RS;
795  indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
796  return;
797  }
798 
799  const unsigned Size = RI.getRegSizeInBits(*RC);
800  if (Size == 16) {
801  assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
802  AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
803  AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
804  AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
805 
806  bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
807  bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
808  bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
809  bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
810  bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
811  AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
812  AMDGPU::AGPR_LO16RegClass.contains(DestReg);
813  bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
814  AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
815  AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
816  MCRegister NewDestReg = RI.get32BitRegister(DestReg);
817  MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
818 
819  if (IsSGPRDst) {
820  if (!IsSGPRSrc) {
821  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
822  return;
823  }
824 
825  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
826  .addReg(NewSrcReg, getKillRegState(KillSrc));
827  return;
828  }
829 
830  if (IsAGPRDst || IsAGPRSrc) {
831  if (!DstLow || !SrcLow) {
832  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
833  "Cannot use hi16 subreg with an AGPR!");
834  }
835 
836  copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
837  return;
838  }
839 
840  if (IsSGPRSrc && !ST.hasSDWAScalar()) {
841  if (!DstLow || !SrcLow) {
842  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
843  "Cannot use hi16 subreg on VI!");
844  }
845 
846  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
847  .addReg(NewSrcReg, getKillRegState(KillSrc));
848  return;
849  }
850 
851  auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
852  .addImm(0) // src0_modifiers
853  .addReg(NewSrcReg)
854  .addImm(0) // clamp
860  .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
861  // First implicit operand is $exec.
862  MIB->tieOperands(0, MIB->getNumOperands() - 1);
863  return;
864  }
865 
866  const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
867  if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
868  if (ST.hasPackedFP32Ops()) {
869  BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
871  .addReg(SrcReg)
873  .addReg(SrcReg)
874  .addImm(0) // op_sel_lo
875  .addImm(0) // op_sel_hi
876  .addImm(0) // neg_lo
877  .addImm(0) // neg_hi
878  .addImm(0) // clamp
879  .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
880  return;
881  }
882  }
883 
884  const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
885  if (RI.isSGPRClass(RC)) {
886  if (!RI.isSGPRClass(SrcRC)) {
887  reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
888  return;
889  }
890  expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
891  return;
892  }
893 
894  unsigned EltSize = 4;
895  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
896  if (RI.hasAGPRs(RC)) {
897  Opcode = (RI.hasVGPRs(SrcRC)) ?
898  AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
899  } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
900  Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
901  } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
902  (RI.isProperlyAlignedRC(*RC) &&
903  (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
904  // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
905  if (ST.hasPackedFP32Ops()) {
906  Opcode = AMDGPU::V_PK_MOV_B32;
907  EltSize = 8;
908  }
909  }
910 
911  // For the cases where we need an intermediate instruction/temporary register
912  // (destination is an AGPR), we need a scavenger.
913  //
914  // FIXME: The pass should maintain this for us so we don't have to re-scan the
915  // whole block for every handled copy.
916  std::unique_ptr<RegScavenger> RS;
917  if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
918  RS.reset(new RegScavenger());
919 
920  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
921 
922  // If there is an overlap, we can't kill the super-register on the last
923  // instruction, since it will also kill the components made live by this def.
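 // Illustrative example: copying v[0:3] to v[1:4] overlaps, so marking SrcReg
 // killed on the final sub-copy would also kill v1-v3, which were just written
 // as components of DestReg.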
924  const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
925 
926  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
927  unsigned SubIdx;
928  if (Forward)
929  SubIdx = SubIndices[Idx];
930  else
931  SubIdx = SubIndices[SubIndices.size() - Idx - 1];
932 
933  bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
934 
935  if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
936  Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
937  Register ImpUseSuper = SrcReg;
938  indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
939  RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
940  ImpDefSuper, ImpUseSuper);
941  } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
942  Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
943  Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
944  MachineInstrBuilder MIB =
945  BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
947  .addReg(SrcSubReg)
949  .addReg(SrcSubReg)
950  .addImm(0) // op_sel_lo
951  .addImm(0) // op_sel_hi
952  .addImm(0) // neg_lo
953  .addImm(0) // neg_hi
954  .addImm(0) // clamp
955  .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
956  if (Idx == 0)
958  } else {
 959  MachineInstrBuilder Builder =
 960    BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
961  .addReg(RI.getSubReg(SrcReg, SubIdx));
962  if (Idx == 0)
963  Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
964 
965  Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
966  }
967  }
968 }
969 
970 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
971  int NewOpc;
972 
973  // Try to map original to commuted opcode
974  NewOpc = AMDGPU::getCommuteRev(Opcode);
975  if (NewOpc != -1)
976  // Check if the commuted (REV) opcode exists on the target.
977  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
978 
979  // Try to map commuted to original opcode
980  NewOpc = AMDGPU::getCommuteOrig(Opcode);
981  if (NewOpc != -1)
982  // Check if the original (non-REV) opcode exists on the target.
983  return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
984 
985  return Opcode;
986 }
987 
 988 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
 989                                        MachineBasicBlock::iterator MI,
 990                                        const DebugLoc &DL, unsigned DestReg,
 991                                        int64_t Value) const {
 992  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 993  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
994  if (RegClass == &AMDGPU::SReg_32RegClass ||
995  RegClass == &AMDGPU::SGPR_32RegClass ||
996  RegClass == &AMDGPU::SReg_32_XM0RegClass ||
997  RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
998  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
999  .addImm(Value);
1000  return;
1001  }
1002 
1003  if (RegClass == &AMDGPU::SReg_64RegClass ||
1004  RegClass == &AMDGPU::SGPR_64RegClass ||
1005  RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1006  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1007  .addImm(Value);
1008  return;
1009  }
1010 
1011  if (RegClass == &AMDGPU::VGPR_32RegClass) {
1012  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1013  .addImm(Value);
1014  return;
1015  }
1016  if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1017  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1018  .addImm(Value);
1019  return;
1020  }
1021 
1022  unsigned EltSize = 4;
1023  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1024  if (RI.isSGPRClass(RegClass)) {
1025  if (RI.getRegSizeInBits(*RegClass) > 32) {
1026  Opcode = AMDGPU::S_MOV_B64;
1027  EltSize = 8;
1028  } else {
1029  Opcode = AMDGPU::S_MOV_B32;
1030  EltSize = 4;
1031  }
1032  }
1033 
1034  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1035  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1036  int64_t IdxValue = Idx == 0 ? Value : 0;
1037 
 1038  MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
 1039    get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1040  Builder.addImm(IdxValue);
1041  }
1042 }
1043 
 1044 const TargetRegisterClass *
 1045 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1046  return &AMDGPU::VGPR_32RegClass;
1047 }
1048 
 1049 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
 1050                                      MachineBasicBlock::iterator I,
 1051                                      const DebugLoc &DL, Register DstReg,
 1052                                      ArrayRef<MachineOperand> Cond,
 1053                                      Register TrueReg,
 1054                                      Register FalseReg) const {
 1055  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 1056  const TargetRegisterClass *BoolXExecRC =
 1057    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1058  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1059  "Not a VGPR32 reg");
1060 
1061  if (Cond.size() == 1) {
1062  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1063  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1064  .add(Cond[0]);
1065  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1066  .addImm(0)
1067  .addReg(FalseReg)
1068  .addImm(0)
1069  .addReg(TrueReg)
1070  .addReg(SReg);
1071  } else if (Cond.size() == 2) {
1072  assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1073  switch (Cond[0].getImm()) {
1074  case SIInstrInfo::SCC_TRUE: {
1075  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1076  BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1077  : AMDGPU::S_CSELECT_B64), SReg)
1078  .addImm(1)
1079  .addImm(0);
1080  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1081  .addImm(0)
1082  .addReg(FalseReg)
1083  .addImm(0)
1084  .addReg(TrueReg)
1085  .addReg(SReg);
1086  break;
1087  }
1088  case SIInstrInfo::SCC_FALSE: {
1089  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1090  BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1091  : AMDGPU::S_CSELECT_B64), SReg)
1092  .addImm(0)
1093  .addImm(1);
1094  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1095  .addImm(0)
1096  .addReg(FalseReg)
1097  .addImm(0)
1098  .addReg(TrueReg)
1099  .addReg(SReg);
1100  break;
1101  }
1102  case SIInstrInfo::VCCNZ: {
1103  MachineOperand RegOp = Cond[1];
1104  RegOp.setImplicit(false);
1105  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1106  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1107  .add(RegOp);
1108  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1109  .addImm(0)
1110  .addReg(FalseReg)
1111  .addImm(0)
1112  .addReg(TrueReg)
1113  .addReg(SReg);
1114  break;
1115  }
1116  case SIInstrInfo::VCCZ: {
1117  MachineOperand RegOp = Cond[1];
1118  RegOp.setImplicit(false);
1119  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1120  BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1121  .add(RegOp);
1122  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1123  .addImm(0)
1124  .addReg(TrueReg)
1125  .addImm(0)
1126  .addReg(FalseReg)
1127  .addReg(SReg);
1128  break;
1129  }
1130  case SIInstrInfo::EXECNZ: {
1131  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1133  BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1134  : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1135  .addImm(0);
1136  BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1137  : AMDGPU::S_CSELECT_B64), SReg)
1138  .addImm(1)
1139  .addImm(0);
1140  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1141  .addImm(0)
1142  .addReg(FalseReg)
1143  .addImm(0)
1144  .addReg(TrueReg)
1145  .addReg(SReg);
1146  break;
1147  }
1148  case SIInstrInfo::EXECZ: {
1149  Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1151  BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1152  : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1153  .addImm(0);
1154  BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1155  : AMDGPU::S_CSELECT_B64), SReg)
1156  .addImm(0)
1157  .addImm(1);
1158  BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1159  .addImm(0)
1160  .addReg(FalseReg)
1161  .addImm(0)
1162  .addReg(TrueReg)
1163  .addReg(SReg);
1164  llvm_unreachable("Unhandled branch predicate EXECZ");
1165  break;
1166  }
1167  default:
1168  llvm_unreachable("invalid branch predicate");
1169  }
1170  } else {
1171  llvm_unreachable("Can only handle Cond size 1 or 2");
1172  }
1173 }
1174 
 1175 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
 1176                                MachineBasicBlock::iterator I,
 1177                                const DebugLoc &DL,
 1178                                Register SrcReg, int Value) const {
 1179  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 1180  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
 1181  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1182  .addImm(Value)
1183  .addReg(SrcReg);
1184 
1185  return Reg;
1186 }
1187 
 1188 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
 1189                                MachineBasicBlock::iterator I,
 1190                                const DebugLoc &DL,
 1191                                Register SrcReg, int Value) const {
 1192  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 1193  Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
 1194  BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1195  .addImm(Value)
1196  .addReg(SrcReg);
1197 
1198  return Reg;
1199 }
1200 
1201 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1202 
1203  if (RI.hasAGPRs(DstRC))
1204  return AMDGPU::COPY;
1205  if (RI.getRegSizeInBits(*DstRC) == 32) {
1206  return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1207  } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1208  return AMDGPU::S_MOV_B64;
1209  } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1210  return AMDGPU::V_MOV_B64_PSEUDO;
1211  }
1212  return AMDGPU::COPY;
1213 }
1214 
 1215 const MCInstrDesc &
 1216 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
 1217                                      bool IsIndirectSrc) const {
1218  if (IsIndirectSrc) {
1219  if (VecSize <= 32) // 4 bytes
1220  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1221  if (VecSize <= 64) // 8 bytes
1222  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1223  if (VecSize <= 96) // 12 bytes
1224  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1225  if (VecSize <= 128) // 16 bytes
1226  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1227  if (VecSize <= 160) // 20 bytes
1228  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1229  if (VecSize <= 256) // 32 bytes
1230  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1231  if (VecSize <= 512) // 64 bytes
1232  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1233  if (VecSize <= 1024) // 128 bytes
1234  return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1235 
1236  llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1237  }
1238 
1239  if (VecSize <= 32) // 4 bytes
1240  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1241  if (VecSize <= 64) // 8 bytes
1242  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1243  if (VecSize <= 96) // 12 bytes
1244  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1245  if (VecSize <= 128) // 16 bytes
1246  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1247  if (VecSize <= 160) // 20 bytes
1248  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1249  if (VecSize <= 256) // 32 bytes
1250  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1251  if (VecSize <= 512) // 64 bytes
1252  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1253  if (VecSize <= 1024) // 128 bytes
1254  return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1255 
1256  llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1257 }
1258 
1260  if (VecSize <= 32) // 4 bytes
1261  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1262  if (VecSize <= 64) // 8 bytes
1263  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1264  if (VecSize <= 96) // 12 bytes
1265  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1266  if (VecSize <= 128) // 16 bytes
1267  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1268  if (VecSize <= 160) // 20 bytes
1269  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1270  if (VecSize <= 256) // 32 bytes
1271  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1272  if (VecSize <= 512) // 64 bytes
1273  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1274  if (VecSize <= 1024) // 128 bytes
1275  return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1276 
1277  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1278 }
1279 
1280 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1281  if (VecSize <= 32) // 4 bytes
1282  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1283  if (VecSize <= 64) // 8 bytes
1284  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1285  if (VecSize <= 96) // 12 bytes
1286  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1287  if (VecSize <= 128) // 16 bytes
1288  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1289  if (VecSize <= 160) // 20 bytes
1290  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1291  if (VecSize <= 256) // 32 bytes
1292  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1293  if (VecSize <= 512) // 64 bytes
1294  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1295  if (VecSize <= 1024) // 128 bytes
1296  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1297 
1298  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1299 }
1300 
1301 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1302  if (VecSize <= 64) // 8 bytes
1303  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1304  if (VecSize <= 128) // 16 bytes
1305  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1306  if (VecSize <= 256) // 32 bytes
1307  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1308  if (VecSize <= 512) // 64 bytes
1309  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1310  if (VecSize <= 1024) // 128 bytes
1311  return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1312 
1313  llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1314 }
1315 
 1316 const MCInstrDesc &
 1317 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
 1318                                              bool IsSGPR) const {
1319  if (IsSGPR) {
1320  switch (EltSize) {
 1321  case 32:
 1322    return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
 1323  case 64:
 1324    return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
 1325  default:
1326  llvm_unreachable("invalid reg indexing elt size");
1327  }
1328  }
1329 
1330  assert(EltSize == 32 && "invalid reg indexing elt size");
1332 }
1333 
1334 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1335  switch (Size) {
1336  case 4:
1337  return AMDGPU::SI_SPILL_S32_SAVE;
1338  case 8:
1339  return AMDGPU::SI_SPILL_S64_SAVE;
1340  case 12:
1341  return AMDGPU::SI_SPILL_S96_SAVE;
1342  case 16:
1343  return AMDGPU::SI_SPILL_S128_SAVE;
1344  case 20:
1345  return AMDGPU::SI_SPILL_S160_SAVE;
1346  case 24:
1347  return AMDGPU::SI_SPILL_S192_SAVE;
1348  case 28:
1349  return AMDGPU::SI_SPILL_S224_SAVE;
1350  case 32:
1351  return AMDGPU::SI_SPILL_S256_SAVE;
1352  case 64:
1353  return AMDGPU::SI_SPILL_S512_SAVE;
1354  case 128:
1355  return AMDGPU::SI_SPILL_S1024_SAVE;
1356  default:
1357  llvm_unreachable("unknown register size");
1358  }
1359 }
1360 
1361 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1362  switch (Size) {
1363  case 4:
1364  return AMDGPU::SI_SPILL_V32_SAVE;
1365  case 8:
1366  return AMDGPU::SI_SPILL_V64_SAVE;
1367  case 12:
1368  return AMDGPU::SI_SPILL_V96_SAVE;
1369  case 16:
1370  return AMDGPU::SI_SPILL_V128_SAVE;
1371  case 20:
1372  return AMDGPU::SI_SPILL_V160_SAVE;
1373  case 24:
1374  return AMDGPU::SI_SPILL_V192_SAVE;
1375  case 28:
1376  return AMDGPU::SI_SPILL_V224_SAVE;
1377  case 32:
1378  return AMDGPU::SI_SPILL_V256_SAVE;
1379  case 64:
1380  return AMDGPU::SI_SPILL_V512_SAVE;
1381  case 128:
1382  return AMDGPU::SI_SPILL_V1024_SAVE;
1383  default:
1384  llvm_unreachable("unknown register size");
1385  }
1386 }
1387 
1388 static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1389  switch (Size) {
1390  case 4:
1391  return AMDGPU::SI_SPILL_A32_SAVE;
1392  case 8:
1393  return AMDGPU::SI_SPILL_A64_SAVE;
1394  case 12:
1395  return AMDGPU::SI_SPILL_A96_SAVE;
1396  case 16:
1397  return AMDGPU::SI_SPILL_A128_SAVE;
1398  case 20:
1399  return AMDGPU::SI_SPILL_A160_SAVE;
1400  case 24:
1401  return AMDGPU::SI_SPILL_A192_SAVE;
1402  case 28:
1403  return AMDGPU::SI_SPILL_A224_SAVE;
1404  case 32:
1405  return AMDGPU::SI_SPILL_A256_SAVE;
1406  case 64:
1407  return AMDGPU::SI_SPILL_A512_SAVE;
1408  case 128:
1409  return AMDGPU::SI_SPILL_A1024_SAVE;
1410  default:
1411  llvm_unreachable("unknown register size");
1412  }
1413 }
1414 
 1415 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 1416                                       MachineBasicBlock::iterator MI,
 1417                                       Register SrcReg, bool isKill,
1418  int FrameIndex,
1419  const TargetRegisterClass *RC,
1420  const TargetRegisterInfo *TRI) const {
1421  MachineFunction *MF = MBB.getParent();
1423  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1424  const DebugLoc &DL = MBB.findDebugLoc(MI);
1425 
1426  MachinePointerInfo PtrInfo
1429  PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1430  FrameInfo.getObjectAlign(FrameIndex));
1431  unsigned SpillSize = TRI->getSpillSize(*RC);
1432 
1433  if (RI.isSGPRClass(RC)) {
1434  MFI->setHasSpilledSGPRs();
1435  assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1436  assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1437  SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1438 
 1439  // We are only allowed to create one new instruction when spilling
 1440  // registers, so we need to use a pseudo instruction for spilling SGPRs.
1441  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1442 
 1443  // The SGPR spill/restore instructions only work on numbered SGPRs, so we
 1444  // need to make sure we are using the correct register class.
1445  if (SrcReg.isVirtual() && SpillSize == 4) {
1447  MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1448  }
1449 
1450  BuildMI(MBB, MI, DL, OpDesc)
1451  .addReg(SrcReg, getKillRegState(isKill)) // data
1452  .addFrameIndex(FrameIndex) // addr
1453  .addMemOperand(MMO)
1455 
1456  if (RI.spillSGPRToVGPR())
1457  FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1458  return;
1459  }
1460 
1461  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
1462  : getVGPRSpillSaveOpcode(SpillSize);
1463  MFI->setHasSpilledVGPRs();
1464 
1465  BuildMI(MBB, MI, DL, get(Opcode))
1466  .addReg(SrcReg, getKillRegState(isKill)) // data
1467  .addFrameIndex(FrameIndex) // addr
1468  .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1469  .addImm(0) // offset
1470  .addMemOperand(MMO);
1471 }
1472 
1473 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1474  switch (Size) {
1475  case 4:
1476  return AMDGPU::SI_SPILL_S32_RESTORE;
1477  case 8:
1478  return AMDGPU::SI_SPILL_S64_RESTORE;
1479  case 12:
1480  return AMDGPU::SI_SPILL_S96_RESTORE;
1481  case 16:
1482  return AMDGPU::SI_SPILL_S128_RESTORE;
1483  case 20:
1484  return AMDGPU::SI_SPILL_S160_RESTORE;
1485  case 24:
1486  return AMDGPU::SI_SPILL_S192_RESTORE;
1487  case 28:
1488  return AMDGPU::SI_SPILL_S224_RESTORE;
1489  case 32:
1490  return AMDGPU::SI_SPILL_S256_RESTORE;
1491  case 64:
1492  return AMDGPU::SI_SPILL_S512_RESTORE;
1493  case 128:
1494  return AMDGPU::SI_SPILL_S1024_RESTORE;
1495  default:
1496  llvm_unreachable("unknown register size");
1497  }
1498 }
1499 
1500 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1501  switch (Size) {
1502  case 4:
1503  return AMDGPU::SI_SPILL_V32_RESTORE;
1504  case 8:
1505  return AMDGPU::SI_SPILL_V64_RESTORE;
1506  case 12:
1507  return AMDGPU::SI_SPILL_V96_RESTORE;
1508  case 16:
1509  return AMDGPU::SI_SPILL_V128_RESTORE;
1510  case 20:
1511  return AMDGPU::SI_SPILL_V160_RESTORE;
1512  case 24:
1513  return AMDGPU::SI_SPILL_V192_RESTORE;
1514  case 28:
1515  return AMDGPU::SI_SPILL_V224_RESTORE;
1516  case 32:
1517  return AMDGPU::SI_SPILL_V256_RESTORE;
1518  case 64:
1519  return AMDGPU::SI_SPILL_V512_RESTORE;
1520  case 128:
1521  return AMDGPU::SI_SPILL_V1024_RESTORE;
1522  default:
1523  llvm_unreachable("unknown register size");
1524  }
1525 }
1526 
1527 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1528  switch (Size) {
1529  case 4:
1530  return AMDGPU::SI_SPILL_A32_RESTORE;
1531  case 8:
1532  return AMDGPU::SI_SPILL_A64_RESTORE;
1533  case 12:
1534  return AMDGPU::SI_SPILL_A96_RESTORE;
1535  case 16:
1536  return AMDGPU::SI_SPILL_A128_RESTORE;
1537  case 20:
1538  return AMDGPU::SI_SPILL_A160_RESTORE;
1539  case 24:
1540  return AMDGPU::SI_SPILL_A192_RESTORE;
1541  case 28:
1542  return AMDGPU::SI_SPILL_A224_RESTORE;
1543  case 32:
1544  return AMDGPU::SI_SPILL_A256_RESTORE;
1545  case 64:
1546  return AMDGPU::SI_SPILL_A512_RESTORE;
1547  case 128:
1548  return AMDGPU::SI_SPILL_A1024_RESTORE;
1549  default:
1550  llvm_unreachable("unknown register size");
1551  }
1552 }
1553 
 1554 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 1555                                        MachineBasicBlock::iterator MI,
 1556                                        Register DestReg, int FrameIndex,
1557  const TargetRegisterClass *RC,
1558  const TargetRegisterInfo *TRI) const {
1559  MachineFunction *MF = MBB.getParent();
1561  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1562  const DebugLoc &DL = MBB.findDebugLoc(MI);
1563  unsigned SpillSize = TRI->getSpillSize(*RC);
1564 
1565  MachinePointerInfo PtrInfo
1567 
1569  PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1570  FrameInfo.getObjectAlign(FrameIndex));
1571 
1572  if (RI.isSGPRClass(RC)) {
1573  MFI->setHasSpilledSGPRs();
1574  assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1575  assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1576  DestReg != AMDGPU::EXEC && "exec should not be spilled");
1577 
1578  // FIXME: Maybe this should not include a memoperand because it will be
1579  // lowered to non-memory instructions.
1580  const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1581  if (DestReg.isVirtual() && SpillSize == 4) {
1583  MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1584  }
1585 
1586  if (RI.spillSGPRToVGPR())
1587  FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1588  BuildMI(MBB, MI, DL, OpDesc, DestReg)
1589  .addFrameIndex(FrameIndex) // addr
1590  .addMemOperand(MMO)
1592 
1593  return;
1594  }
1595 
1596  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
1597  : getVGPRSpillRestoreOpcode(SpillSize);
1598  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1599  .addFrameIndex(FrameIndex) // vaddr
1600  .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1601  .addImm(0) // offset
1602  .addMemOperand(MMO);
1603 }
1604 
 1605 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
 1606                              MachineBasicBlock::iterator MI) const {
 1607  insertNoops(MBB, MI, 1);
 1608 }
 1609 
 1610 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
 1611                               MachineBasicBlock::iterator MI,
 1612                               unsigned Quantity) const {
 1613  DebugLoc DL = MBB.findDebugLoc(MI);
 1614  while (Quantity > 0) {
1615  unsigned Arg = std::min(Quantity, 8u);
1616  Quantity -= Arg;
1617  BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1618  }
1619 }
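 // Worked example (illustrative): insertNoops(MBB, MI, 10) first emits
 // S_NOP 7 (8 wait states), leaving Quantity = 2, and then S_NOP 1
 // (2 wait states), since an S_NOP immediate of N yields N + 1 wait states.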
1620 
 1621 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
 1622  auto MF = MBB.getParent();
 1623  auto Info = MF->getInfo<SIMachineFunctionInfo>();
 1624 
1625  assert(Info->isEntryFunction());
1626 
1627  if (MBB.succ_empty()) {
1628  bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1629  if (HasNoTerminator) {
1630  if (Info->returnsVoid()) {
1631  BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1632  } else {
1633  BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1634  }
1635  }
1636  }
1637 }
1638 
 1639 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
 1640  switch (MI.getOpcode()) {
1641  default:
1642  if (MI.isMetaInstruction())
1643  return 0;
1644  return 1; // FIXME: Do wait states equal cycles?
1645 
1646  case AMDGPU::S_NOP:
1647  return MI.getOperand(0).getImm() + 1;
1648 
1649  // FIXME: Any other pseudo instruction?
 1650  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
 1651  // hazard, even if one exists, won't really be visible. Should we handle it?
1652  case AMDGPU::SI_MASKED_UNREACHABLE:
1653  case AMDGPU::WAVE_BARRIER:
1654  return 0;
1655  }
1656 }
1657 
 1658 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
 1659  const SIRegisterInfo *TRI = ST.getRegisterInfo();
 1660  MachineBasicBlock &MBB = *MI.getParent();
 1661  DebugLoc DL = MBB.findDebugLoc(MI);
 1662  switch (MI.getOpcode()) {
1663  default: return TargetInstrInfo::expandPostRAPseudo(MI);
1664  case AMDGPU::S_MOV_B64_term:
1665  // This is only a terminator to get the correct spill code placement during
1666  // register allocation.
1667  MI.setDesc(get(AMDGPU::S_MOV_B64));
1668  break;
1669 
1670  case AMDGPU::S_MOV_B32_term:
1671  // This is only a terminator to get the correct spill code placement during
1672  // register allocation.
1673  MI.setDesc(get(AMDGPU::S_MOV_B32));
1674  break;
1675 
1676  case AMDGPU::S_XOR_B64_term:
1677  // This is only a terminator to get the correct spill code placement during
1678  // register allocation.
1679  MI.setDesc(get(AMDGPU::S_XOR_B64));
1680  break;
1681 
1682  case AMDGPU::S_XOR_B32_term:
1683  // This is only a terminator to get the correct spill code placement during
1684  // register allocation.
1685  MI.setDesc(get(AMDGPU::S_XOR_B32));
1686  break;
1687  case AMDGPU::S_OR_B64_term:
1688  // This is only a terminator to get the correct spill code placement during
1689  // register allocation.
1690  MI.setDesc(get(AMDGPU::S_OR_B64));
1691  break;
1692  case AMDGPU::S_OR_B32_term:
1693  // This is only a terminator to get the correct spill code placement during
1694  // register allocation.
1695  MI.setDesc(get(AMDGPU::S_OR_B32));
1696  break;
1697 
1698  case AMDGPU::S_ANDN2_B64_term:
1699  // This is only a terminator to get the correct spill code placement during
1700  // register allocation.
1701  MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1702  break;
1703 
1704  case AMDGPU::S_ANDN2_B32_term:
1705  // This is only a terminator to get the correct spill code placement during
1706  // register allocation.
1707  MI.setDesc(get(AMDGPU::S_ANDN2_B32));
1708  break;
1709 
1710  case AMDGPU::S_AND_B64_term:
1711  // This is only a terminator to get the correct spill code placement during
1712  // register allocation.
1713  MI.setDesc(get(AMDGPU::S_AND_B64));
1714  break;
1715 
1716  case AMDGPU::S_AND_B32_term:
1717  // This is only a terminator to get the correct spill code placement during
1718  // register allocation.
1719  MI.setDesc(get(AMDGPU::S_AND_B32));
1720  break;
1721 
1722  case AMDGPU::V_MOV_B64_PSEUDO: {
1723  Register Dst = MI.getOperand(0).getReg();
1724  Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1725  Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1726 
1727  const MachineOperand &SrcOp = MI.getOperand(1);
1728  // FIXME: Will this work for 64-bit floating point immediates?
1729  assert(!SrcOp.isFPImm());
1730  if (SrcOp.isImm()) {
1731  APInt Imm(64, SrcOp.getImm());
1732  APInt Lo(32, Imm.getLoBits(32).getZExtValue());
1733  APInt Hi(32, Imm.getHiBits(32).getZExtValue());
1734  if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
1735  BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
1737  .addImm(Lo.getSExtValue())
1739  .addImm(Lo.getSExtValue())
1740  .addImm(0) // op_sel_lo
1741  .addImm(0) // op_sel_hi
1742  .addImm(0) // neg_lo
1743  .addImm(0) // neg_hi
1744  .addImm(0); // clamp
1745  } else {
1746  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1747  .addImm(Lo.getSExtValue())
1749  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1750  .addImm(Hi.getSExtValue())
1752  }
1753  } else {
1754  assert(SrcOp.isReg());
1755  if (ST.hasPackedFP32Ops() &&
1756  !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
1757  BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
1758  .addImm(SISrcMods::OP_SEL_1) // src0_mod
1759  .addReg(SrcOp.getReg())
1761  .addReg(SrcOp.getReg())
1762  .addImm(0) // op_sel_lo
1763  .addImm(0) // op_sel_hi
1764  .addImm(0) // neg_lo
1765  .addImm(0) // neg_hi
1766  .addImm(0); // clamp
1767  } else {
1768  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1769  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1771  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1772  .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1774  }
1775  }
1776  MI.eraseFromParent();
1777  break;
1778  }
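// Worked example for the immediate path above (values chosen for
// illustration): SrcOp = 0x3F8000003F800000 splits into Lo = Hi = 0x3F800000,
// which is the inline constant 1.0f, so on a subtarget with packed FP32 ops a
// single V_PK_MOV_B32 writes both halves of Dst. For SrcOp =
// 0x0000000100000002 the halves differ, so two V_MOV_B32_e32 are emitted into
// Dst's sub0 and sub1 instead.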
1779  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
1780  expandMovDPP64(MI);
1781  break;
1782  }
1783  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
1784  const MachineOperand &SrcOp = MI.getOperand(1);
1785  assert(!SrcOp.isFPImm());
1786  APInt Imm(64, SrcOp.getImm());
1787  if (Imm.isIntN(32) || isInlineConstant(Imm)) {
1788  MI.setDesc(get(AMDGPU::S_MOV_B64));
1789  break;
1790  }
1791 
1792  Register Dst = MI.getOperand(0).getReg();
1793  Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1794  Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1795 
1796  APInt Lo(32, Imm.getLoBits(32).getZExtValue());
1797  APInt Hi(32, Imm.getHiBits(32).getZExtValue());
1798  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
1799  .addImm(Lo.getSExtValue())
1801  BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
1802  .addImm(Hi.getSExtValue())
1804  MI.eraseFromParent();
1805  break;
1806  }
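// Worked example for the split above (illustrative value): the 64-bit
// immediate 0x1234567800000000 is neither a 32-bit value nor an inline
// constant, so it is materialized as S_MOV_B32 of 0x00000000 into Dst.sub0
// and S_MOV_B32 of 0x12345678 into Dst.sub1. A small value such as -1 is an
// inline constant and keeps the single S_MOV_B64 form via the early setDesc
// path.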
1807  case AMDGPU::V_SET_INACTIVE_B32: {
1808  unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
1809  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1810  auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
1811  FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
1812  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1813  .add(MI.getOperand(2));
1814  BuildMI(MBB, MI, DL, get(NotOpc), Exec)
1815  .addReg(Exec);
1816  MI.eraseFromParent();
1817  break;
1818  }
1819  case AMDGPU::V_SET_INACTIVE_B64: {
1820  unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
1821  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1822  auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
1823  FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
1824  MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1825  MI.getOperand(0).getReg())
1826  .add(MI.getOperand(2));
1827  expandPostRAPseudo(*Copy);
1828  BuildMI(MBB, MI, DL, get(NotOpc), Exec)
1829  .addReg(Exec);
1830  MI.eraseFromParent();
1831  break;
1832  }
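// Conceptual shape of the two V_SET_INACTIVE expansions above, assuming
// wave64 and hypothetical operands: exec is first inverted (s_not_b64) so
// that only the previously inactive lanes are live, the value in operand 2 is
// moved into the destination for those lanes, and exec is inverted back.
// Lanes that were active keep whatever the destination already held.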
1833  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
1834  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
1835  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
1836  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
1837  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
1838  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
1839  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
1840  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
1841  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
1842  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
1843  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
1844  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
1845  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
1846  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
1847  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
1848  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
1849  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
1850  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
1851  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
1852  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
1853  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
1854  const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
1855 
1856  unsigned Opc;
1857  if (RI.hasVGPRs(EltRC)) {
1858  Opc = AMDGPU::V_MOVRELD_B32_e32;
1859  } else {
1860  Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
1861  : AMDGPU::S_MOVRELD_B32;
1862  }
1863 
1864  const MCInstrDesc &OpDesc = get(Opc);
1865  Register VecReg = MI.getOperand(0).getReg();
1866  bool IsUndef = MI.getOperand(1).isUndef();
1867  unsigned SubReg = MI.getOperand(3).getImm();
1868  assert(VecReg == MI.getOperand(1).getReg());
1869 
1870  MachineInstrBuilder MIB =
1871  BuildMI(MBB, MI, DL, OpDesc)
1872  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1873  .add(MI.getOperand(2))
1875  .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1876 
1877  const int ImpDefIdx =
1878  OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
1879  const int ImpUseIdx = ImpDefIdx + 1;
1880  MIB->tieOperands(ImpDefIdx, ImpUseIdx);
1881  MI.eraseFromParent();
1882  break;
1883  }
1884  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
1885  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
1886  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
1887  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
1888  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
1889  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
1890  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
1891  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
1892  assert(ST.useVGPRIndexMode());
1893  Register VecReg = MI.getOperand(0).getReg();
1894  bool IsUndef = MI.getOperand(1).isUndef();
1895  Register Idx = MI.getOperand(3).getReg();
1896  Register SubReg = MI.getOperand(4).getImm();
1897 
1898  MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
1899  .addReg(Idx)
1901  SetOn->getOperand(3).setIsUndef();
1902 
1903  const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect);
1904  MachineInstrBuilder MIB =
1905  BuildMI(MBB, MI, DL, OpDesc)
1906  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1907  .add(MI.getOperand(2))
1909  .addReg(VecReg,
1910  RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1911 
1912  const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
1913  const int ImpUseIdx = ImpDefIdx + 1;
1914  MIB->tieOperands(ImpDefIdx, ImpUseIdx);
1915 
1916  MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
1917 
1918  finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
1919 
1920  MI.eraseFromParent();
1921  break;
1922  }
1923  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
1924  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
1925  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
1926  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
1927  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
1928  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
1929  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
1930  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
1931  assert(ST.useVGPRIndexMode());
1932  Register Dst = MI.getOperand(0).getReg();
1933  Register VecReg = MI.getOperand(1).getReg();
1934  bool IsUndef = MI.getOperand(1).isUndef();
1935  Register Idx = MI.getOperand(2).getReg();
1936  Register SubReg = MI.getOperand(3).getImm();
1937 
1938  MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
1939  .addReg(Idx)
1941  SetOn->getOperand(3).setIsUndef();
1942 
1943  BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32))
1944  .addDef(Dst)
1945  .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1946  .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0))
1948 
1949  MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
1950 
1951  finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
1952 
1953  MI.eraseFromParent();
1954  break;
1955  }
1956  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1957  MachineFunction &MF = *MBB.getParent();
1958  Register Reg = MI.getOperand(0).getReg();
1959  Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1960  Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1961 
1962  // Create a bundle so these instructions won't be re-ordered by the
1963  // post-RA scheduler.
1964  MIBundleBuilder Bundler(MBB, MI);
1965  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1966 
1967  // Add 32-bit offset from this instruction to the start of the
1968  // constant data.
1969  Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1970  .addReg(RegLo)
1971  .add(MI.getOperand(1)));
1972 
1973  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1974  .addReg(RegHi);
1975  MIB.add(MI.getOperand(2));
1976 
1977  Bundler.append(MIB);
1978  finalizeBundle(MBB, Bundler.begin());
1979 
1980  MI.eraseFromParent();
1981  break;
1982  }
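// Illustrative shape of the bundle built above, with a hypothetical SGPR pair:
//   s_getpc_b64 s[4:5]
//   s_add_u32   s4, s4, <low relocation operand>   ; MI operand 1
//   s_addc_u32  s5, s5, <high relocation operand>  ; MI operand 2
// After the carry-propagating add pair, s[4:5] holds the PC-relative address
// the pseudo was asked to compute.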
1983  case AMDGPU::ENTER_STRICT_WWM: {
1984  // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1985  // Whole Wave Mode is entered.
1986  MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1987  : AMDGPU::S_OR_SAVEEXEC_B64));
1988  break;
1989  }
1990  case AMDGPU::ENTER_STRICT_WQM: {
1991  // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1992  // STRICT_WQM is entered.
1993  const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1994  const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
1995  const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1996  BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
1997  BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
1998 
1999  MI.eraseFromParent();
2000  break;
2001  }
2002  case AMDGPU::EXIT_STRICT_WWM:
2003  case AMDGPU::EXIT_STRICT_WQM: {
2004  // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2005  // WWM/STRICT_WQM is exited.
2006  MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2007  break;
2008  }
2009  }
2010  return true;
2011 }
2012 
2013 std::pair<MachineInstr*, MachineInstr*>
2014 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2015  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2016 
2017  MachineBasicBlock &MBB = *MI.getParent();
2019  MachineFunction *MF = MBB.getParent();
2021  Register Dst = MI.getOperand(0).getReg();
2022  unsigned Part = 0;
2023  MachineInstr *Split[2];
2024 
2025  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2026  auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2027  if (Dst.isPhysical()) {
2028  MovDPP.addDef(RI.getSubReg(Dst, Sub));
2029  } else {
2030  assert(MRI.isSSA());
2031  auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2032  MovDPP.addDef(Tmp);
2033  }
2034 
2035  for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2036  const MachineOperand &SrcOp = MI.getOperand(I);
2037  assert(!SrcOp.isFPImm());
2038  if (SrcOp.isImm()) {
2039  APInt Imm(64, SrcOp.getImm());
2040  Imm.ashrInPlace(Part * 32);
2041  MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2042  } else {
2043  assert(SrcOp.isReg());
2044  Register Src = SrcOp.getReg();
2045  if (Src.isPhysical())
2046  MovDPP.addReg(RI.getSubReg(Src, Sub));
2047  else
2048  MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2049  }
2050  }
2051 
2052  for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
2053  MovDPP.addImm(MI.getOperand(I).getImm());
2054 
2055  Split[Part] = MovDPP;
2056  ++Part;
2057  }
2058 
2059  if (Dst.isVirtual())
2060  BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2061  .addReg(Split[0]->getOperand(0).getReg())
2062  .addImm(AMDGPU::sub0)
2063  .addReg(Split[1]->getOperand(0).getReg())
2064  .addImm(AMDGPU::sub1);
2065 
2066  MI.eraseFromParent();
2067  return std::make_pair(Split[0], Split[1]);
2068 }
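// Illustrative result of the split above for a virtual 64-bit destination
// (virtual register numbers are hypothetical): two V_MOV_B32_dpp instructions
// produce %lo and %hi from the sub0/sub1 pieces of the source, carrying the
// remaining DPP control immediates (dpp_ctrl, row/bank masks, etc.), and a
// REG_SEQUENCE reassembles them:
//   %dst:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1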
2069 
2071  MachineOperand &Src0,
2072  unsigned Src0OpName,
2073  MachineOperand &Src1,
2074  unsigned Src1OpName) const {
2075  MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2076  if (!Src0Mods)
2077  return false;
2078 
2079  MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2080  assert(Src1Mods &&
2081  "All commutable instructions have both src0 and src1 modifiers");
2082 
2083  int Src0ModsVal = Src0Mods->getImm();
2084  int Src1ModsVal = Src1Mods->getImm();
2085 
2086  Src1Mods->setImm(Src0ModsVal);
2087  Src0Mods->setImm(Src1ModsVal);
2088  return true;
2089 }
2090 
2092  MachineOperand &RegOp,
2093  MachineOperand &NonRegOp) {
2094  Register Reg = RegOp.getReg();
2095  unsigned SubReg = RegOp.getSubReg();
2096  bool IsKill = RegOp.isKill();
2097  bool IsDead = RegOp.isDead();
2098  bool IsUndef = RegOp.isUndef();
2099  bool IsDebug = RegOp.isDebug();
2100 
2101  if (NonRegOp.isImm())
2102  RegOp.ChangeToImmediate(NonRegOp.getImm());
2103  else if (NonRegOp.isFI())
2104  RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2105  else if (NonRegOp.isGlobal()) {
2106  RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2107  NonRegOp.getTargetFlags());
2108  } else
2109  return nullptr;
2110 
2111  // Make sure we don't reinterpret a subreg index in the target flags.
2112  RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2113 
2114  NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2115  NonRegOp.setSubReg(SubReg);
2116 
2117  return &MI;
2118 }
2119 
2121  unsigned Src0Idx,
2122  unsigned Src1Idx) const {
2123  assert(!NewMI && "this should never be used");
2124 
2125  unsigned Opc = MI.getOpcode();
2126  int CommutedOpcode = commuteOpcode(Opc);
2127  if (CommutedOpcode == -1)
2128  return nullptr;
2129 
2130  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2131  static_cast<int>(Src0Idx) &&
2132  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2133  static_cast<int>(Src1Idx) &&
2134  "inconsistency with findCommutedOpIndices");
2135 
2136  MachineOperand &Src0 = MI.getOperand(Src0Idx);
2137  MachineOperand &Src1 = MI.getOperand(Src1Idx);
2138 
2139  MachineInstr *CommutedMI = nullptr;
2140  if (Src0.isReg() && Src1.isReg()) {
2141  if (isOperandLegal(MI, Src1Idx, &Src0)) {
2142  // Be sure to copy the source modifiers to the right place.
2143  CommutedMI
2144  = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2145  }
2146 
2147  } else if (Src0.isReg() && !Src1.isReg()) {
2148  // src0 should always be able to support any operand type, so no need to
2149  // check operand legality.
2150  CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2151  } else if (!Src0.isReg() && Src1.isReg()) {
2152  if (isOperandLegal(MI, Src1Idx, &Src0))
2153  CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2154  } else {
2155  // FIXME: Found two non registers to commute. This does happen.
2156  return nullptr;
2157  }
2158 
2159  if (CommutedMI) {
2160  swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2161  Src1, AMDGPU::OpName::src1_modifiers);
2162 
2163  CommutedMI->setDesc(get(CommutedOpcode));
2164  }
2165 
2166  return CommutedMI;
2167 }
2168 
2169 // This needs to be implemented because the source modifiers may be inserted
2170 // between the true commutable operands, and the base
2171 // TargetInstrInfo::commuteInstruction uses it.
2173  unsigned &SrcOpIdx0,
2174  unsigned &SrcOpIdx1) const {
2175  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2176 }
2177 
2178 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
2179  unsigned &SrcOpIdx1) const {
2180  if (!Desc.isCommutable())
2181  return false;
2182 
2183  unsigned Opc = Desc.getOpcode();
2184  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2185  if (Src0Idx == -1)
2186  return false;
2187 
2188  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2189  if (Src1Idx == -1)
2190  return false;
2191 
2192  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2193 }
2194 
2195 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2196  int64_t BrOffset) const {
2197  // BranchRelaxation should never have to check s_setpc_b64 because its dest
2198  // block is unanalyzable.
2199  assert(BranchOp != AMDGPU::S_SETPC_B64);
2200 
2201  // Convert to dwords.
2202  BrOffset /= 4;
2203 
2204  // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2205  // from the next instruction.
2206  BrOffset -= 1;
2207 
2208  return isIntN(BranchOffsetBits, BrOffset);
2209 }
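// Quick arithmetic sketch for the check above: with the default of 16 offset
// bits the encodable signed dword offset is [-32768, 32767], measured from the
// instruction after the branch. A byte offset of 131072 becomes 32768 dwords
// and then 32767 after the subtraction, so it still fits; a byte offset of
// 131076 would not, and the indirect long-branch expansion below is needed
// instead.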
2210 
2212  const MachineInstr &MI) const {
2213  if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
2214  // This would be a difficult analysis to perform, but it can always be
2215  // treated as legal, so there's no need to analyze it.
2216  return nullptr;
2217  }
2218 
2219  return MI.getOperand(0).getMBB();
2220 }
2221 
2223  MachineBasicBlock &DestBB,
2224  const DebugLoc &DL,
2225  int64_t BrOffset,
2226  RegScavenger *RS) const {
2227  assert(RS && "RegScavenger required for long branching");
2228  assert(MBB.empty() &&
2229  "new block should be inserted for expanding unconditional branch");
2230  assert(MBB.pred_size() == 1);
2231 
2232  MachineFunction *MF = MBB.getParent();
2234 
2235  // FIXME: Virtual register workaround for RegScavenger not working with empty
2236  // blocks.
2237  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2238 
2239  auto I = MBB.end();
2240 
2241  // We need to compute the offset relative to the instruction immediately after
2242  // s_getpc_b64. Insert pc arithmetic code before last terminator.
2243  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2244 
2245  auto &MCCtx = MF->getContext();
2246  MCSymbol *PostGetPCLabel =
2247  MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2248  GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2249 
2250  MCSymbol *OffsetLo =
2251  MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2252  MCSymbol *OffsetHi =
2253  MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2254  BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2255  .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2256  .addReg(PCReg, 0, AMDGPU::sub0)
2257  .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2258  BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2259  .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2260  .addReg(PCReg, 0, AMDGPU::sub1)
2261  .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2262 
2263  // Insert the indirect branch after the other terminator.
2264  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2265  .addReg(PCReg);
2266 
2267  auto ComputeBlockSize = [](const TargetInstrInfo *TII,
2268  const MachineBasicBlock &MBB) {
2269  unsigned Size = 0;
2270  for (const MachineInstr &MI : MBB)
2271  Size += TII->getInstSizeInBytes(MI);
2272  return Size;
2273  };
2274 
2275  // FIXME: If spilling is necessary, this will fail because this scavenger has
2276  // no emergency stack slots. It is non-trivial to spill in this situation,
2277  // because the restore code needs to be specially placed after the
2278  // jump. BranchRelaxation then needs to be made aware of the newly inserted
2279  // block.
2280  //
2281  // If a spill is needed for the pc register pair, we need to insert a spill
2282  // restore block right before the destination block, and insert a short branch
2283  // into the old destination block's fallthrough predecessor.
2284  // e.g.:
2285  //
2286  // s_cbranch_scc0 skip_long_branch:
2287  //
2288  // long_branch_bb:
2289  // spill s[8:9]
2290  // s_getpc_b64 s[8:9]
2291  // s_add_u32 s8, s8, restore_bb
2292  // s_addc_u32 s9, s9, 0
2293  // s_setpc_b64 s[8:9]
2294  //
2295  // skip_long_branch:
2296  // foo;
2297  //
2298  // .....
2299  //
2300  // dest_bb_fallthrough_predecessor:
2301  // bar;
2302  // s_branch dest_bb
2303  //
2304  // restore_bb:
2305  // restore s[8:9]
2306  // fallthrough dest_bb
2307  //
2308  // dest_bb:
2309  // buzz;
2310 
2311  RS->enterBasicBlockEnd(MBB);
2313  AMDGPU::SReg_64RegClass,
2314  MachineBasicBlock::iterator(GetPC), false, 0);
2315  MRI.replaceRegWith(PCReg, Scav);
2316  MRI.clearVirtRegs();
2317  RS->setRegUsed(Scav);
2318 
2319  // Now the offset from the post-getpc label to the destination can be defined.
2321  MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx),
2322  MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2323  // Add offset assignments.
2324  auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2326  auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2327  OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2328  return ComputeBlockSize(this, MBB);
2329 }
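// Illustrative shape of the long branch emitted above (the scavenged SGPR pair
// is hypothetical):
//   s_getpc_b64 s[8:9]            ; post_getpc label attached here
//   s_add_u32   s8, s8, offset_lo ; (dest_bb - post_getpc) & 0xffffffff
//   s_addc_u32  s9, s9, offset_hi ; (dest_bb - post_getpc) >> 32, arithmetic
//   s_setpc_b64 s[8:9]
// The offset_lo/offset_hi symbols are resolved later from the expressions
// created above, once block layout fixes the real distance.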
2330 
2331 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2332  switch (Cond) {
2333  case SIInstrInfo::SCC_TRUE:
2334  return AMDGPU::S_CBRANCH_SCC1;
2335  case SIInstrInfo::SCC_FALSE:
2336  return AMDGPU::S_CBRANCH_SCC0;
2337  case SIInstrInfo::VCCNZ:
2338  return AMDGPU::S_CBRANCH_VCCNZ;
2339  case SIInstrInfo::VCCZ:
2340  return AMDGPU::S_CBRANCH_VCCZ;
2341  case SIInstrInfo::EXECNZ:
2342  return AMDGPU::S_CBRANCH_EXECNZ;
2343  case SIInstrInfo::EXECZ:
2344  return AMDGPU::S_CBRANCH_EXECZ;
2345  default:
2346  llvm_unreachable("invalid branch predicate");
2347  }
2348 }
2349 
2350 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2351  switch (Opcode) {
2352  case AMDGPU::S_CBRANCH_SCC0:
2353  return SCC_FALSE;
2354  case AMDGPU::S_CBRANCH_SCC1:
2355  return SCC_TRUE;
2356  case AMDGPU::S_CBRANCH_VCCNZ:
2357  return VCCNZ;
2358  case AMDGPU::S_CBRANCH_VCCZ:
2359  return VCCZ;
2360  case AMDGPU::S_CBRANCH_EXECNZ:
2361  return EXECNZ;
2362  case AMDGPU::S_CBRANCH_EXECZ:
2363  return EXECZ;
2364  default:
2365  return INVALID_BR;
2366  }
2367 }
2368 
2371  MachineBasicBlock *&TBB,
2372  MachineBasicBlock *&FBB,
2374  bool AllowModify) const {
2375  if (I->getOpcode() == AMDGPU::S_BRANCH) {
2376  // Unconditional Branch
2377  TBB = I->getOperand(0).getMBB();
2378  return false;
2379  }
2380 
2381  MachineBasicBlock *CondBB = nullptr;
2382 
2383  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
2384  CondBB = I->getOperand(1).getMBB();
2385  Cond.push_back(I->getOperand(0));
2386  } else {
2387  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
2388  if (Pred == INVALID_BR)
2389  return true;
2390 
2391  CondBB = I->getOperand(0).getMBB();
2392  Cond.push_back(MachineOperand::CreateImm(Pred));
2393  Cond.push_back(I->getOperand(1)); // Save the branch register.
2394  }
2395  ++I;
2396 
2397  if (I == MBB.end()) {
2398  // Conditional branch followed by fall-through.
2399  TBB = CondBB;
2400  return false;
2401  }
2402 
2403  if (I->getOpcode() == AMDGPU::S_BRANCH) {
2404  TBB = CondBB;
2405  FBB = I->getOperand(0).getMBB();
2406  return false;
2407  }
2408 
2409  return true;
2410 }
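// Example of what the analysis above reports, using a hypothetical block that
// ends in "s_cbranch_scc1 %bb.2" followed by "s_branch %bb.3": TBB = %bb.2,
// FBB = %bb.3, and Cond holds {SCC_TRUE, the implicit SCC operand}. A lone
// s_branch fills only TBB, and any unrecognized terminator makes the function
// return true (i.e. "cannot analyze").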
2411 
2413  MachineBasicBlock *&FBB,
2415  bool AllowModify) const {
2417  auto E = MBB.end();
2418  if (I == E)
2419  return false;
2420 
2421  // Skip over the instructions that are artificially terminators for special
2422  // exec management.
2423  while (I != E && !I->isBranch() && !I->isReturn()) {
2424  switch (I->getOpcode()) {
2425  case AMDGPU::S_MOV_B64_term:
2426  case AMDGPU::S_XOR_B64_term:
2427  case AMDGPU::S_OR_B64_term:
2428  case AMDGPU::S_ANDN2_B64_term:
2429  case AMDGPU::S_AND_B64_term:
2430  case AMDGPU::S_MOV_B32_term:
2431  case AMDGPU::S_XOR_B32_term:
2432  case AMDGPU::S_OR_B32_term:
2433  case AMDGPU::S_ANDN2_B32_term:
2434  case AMDGPU::S_AND_B32_term:
2435  break;
2436  case AMDGPU::SI_IF:
2437  case AMDGPU::SI_ELSE:
2438  case AMDGPU::SI_KILL_I1_TERMINATOR:
2439  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
2440  // FIXME: It's messy that these need to be considered here at all.
2441  return true;
2442  default:
2443  llvm_unreachable("unexpected non-branch terminator inst");
2444  }
2445 
2446  ++I;
2447  }
2448 
2449  if (I == E)
2450  return false;
2451 
2452  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
2453 }
2454 
2456  int *BytesRemoved) const {
2458 
2459  unsigned Count = 0;
2460  unsigned RemovedSize = 0;
2461  while (I != MBB.end()) {
2462  MachineBasicBlock::iterator Next = std::next(I);
2463  // Skip over artificial terminators when removing instructions.
2464  if (I->isBranch() || I->isReturn()) {
2465  RemovedSize += getInstSizeInBytes(*I);
2466  I->eraseFromParent();
2467  ++Count;
2468  }
2469  I = Next;
2470  }
2471 
2472  if (BytesRemoved)
2473  *BytesRemoved = RemovedSize;
2474 
2475  return Count;
2476 }
2477 
2478 // Copy the flags onto the implicit condition register operand.
2480  const MachineOperand &OrigCond) {
2481  CondReg.setIsUndef(OrigCond.isUndef());
2482  CondReg.setIsKill(OrigCond.isKill());
2483 }
2484 
2486  MachineBasicBlock *TBB,
2487  MachineBasicBlock *FBB,
2489  const DebugLoc &DL,
2490  int *BytesAdded) const {
2491  if (!FBB && Cond.empty()) {
2492  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2493  .addMBB(TBB);
2494  if (BytesAdded)
2495  *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2496  return 1;
2497  }
2498 
2499  if(Cond.size() == 1 && Cond[0].isReg()) {
2500  BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
2501  .add(Cond[0])
2502  .addMBB(TBB);
2503  return 1;
2504  }
2505 
2506  assert(TBB && Cond[0].isImm());
2507 
2508  unsigned Opcode
2509  = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2510 
2511  if (!FBB) {
2512  Cond[1].isUndef();
2513  MachineInstr *CondBr =
2514  BuildMI(&MBB, DL, get(Opcode))
2515  .addMBB(TBB);
2516 
2517  // Copy the flags onto the implicit condition register operand.
2518  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2519  fixImplicitOperands(*CondBr);
2520 
2521  if (BytesAdded)
2522  *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2523  return 1;
2524  }
2525 
2526  assert(TBB && FBB);
2527 
2528  MachineInstr *CondBr =
2529  BuildMI(&MBB, DL, get(Opcode))
2530  .addMBB(TBB);
2531  fixImplicitOperands(*CondBr);
2532  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2533  .addMBB(FBB);
2534 
2535  MachineOperand &CondReg = CondBr->getOperand(1);
2536  CondReg.setIsUndef(Cond[1].isUndef());
2537  CondReg.setIsKill(Cond[1].isKill());
2538 
2539  if (BytesAdded)
2540  *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
2541 
2542  return 2;
2543 }
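// Size accounting sketch for the paths above: an unconditional s_branch or a
// single conditional branch reports 4 bytes (8 with the offset3f hardware bug
// workaround), while the conditional-plus-unconditional pair for a two-way
// branch reports 8 (or 16). BranchRelaxation uses these BytesAdded values when
// deciding whether a target is still in range.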
2544 
2547  if (Cond.size() != 2) {
2548  return true;
2549  }
2550 
2551  if (Cond[0].isImm()) {
2552  Cond[0].setImm(-Cond[0].getImm());
2553  return false;
2554  }
2555 
2556  return true;
2557 }
2558 
2561  Register DstReg, Register TrueReg,
2562  Register FalseReg, int &CondCycles,
2563  int &TrueCycles, int &FalseCycles) const {
2564  switch (Cond[0].getImm()) {
2565  case VCCNZ:
2566  case VCCZ: {
2568  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2569  if (MRI.getRegClass(FalseReg) != RC)
2570  return false;
2571 
2572  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2573  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2574 
2575  // Limit to equal cost for branch vs. N v_cndmask_b32s.
2576  return RI.hasVGPRs(RC) && NumInsts <= 6;
2577  }
2578  case SCC_TRUE:
2579  case SCC_FALSE: {
2580  // FIXME: We could insert for VGPRs if we could replace the original compare
2581  // with a vector one.
2583  const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2584  if (MRI.getRegClass(FalseReg) != RC)
2585  return false;
2586 
2587  int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2588 
2589  // Multiples of 8 can do s_cselect_b64
2590  if (NumInsts % 2 == 0)
2591  NumInsts /= 2;
2592 
2593  CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2594  return RI.isSGPRClass(RC);
2595  }
2596  default:
2597  return false;
2598  }
2599 }
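// Cost sketch for the checks above, with hypothetical register classes: a
// 128-bit VGPR select needs four v_cndmask_b32 instructions (NumInsts = 4,
// allowed since 4 <= 6), while a 128-bit SGPR select under SCC halves to two
// s_cselect_b64 instructions. A 256-bit VGPR select would need eight and is
// rejected by the "<= 6" limit.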
2600 
2604  Register TrueReg, Register FalseReg) const {
2605  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
2606  if (Pred == VCCZ || Pred == SCC_FALSE) {
2607  Pred = static_cast<BranchPredicate>(-Pred);
2608  std::swap(TrueReg, FalseReg);
2609  }
2610 
2612  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
2613  unsigned DstSize = RI.getRegSizeInBits(*DstRC);
2614 
2615  if (DstSize == 32) {
2617  if (Pred == SCC_TRUE) {
2618  Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
2619  .addReg(TrueReg)
2620  .addReg(FalseReg);
2621  } else {
2622  // Instruction's operands are backwards from what is expected.
2623  Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
2624  .addReg(FalseReg)
2625  .addReg(TrueReg);
2626  }
2627 
2628  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2629  return;
2630  }
2631 
2632  if (DstSize == 64 && Pred == SCC_TRUE) {
2633  MachineInstr *Select =
2634  BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
2635  .addReg(TrueReg)
2636  .addReg(FalseReg);
2637 
2638  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2639  return;
2640  }
2641 
2642  static const int16_t Sub0_15[] = {
2643  AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
2644  AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
2645  AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
2646  AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
2647  };
2648 
2649  static const int16_t Sub0_15_64[] = {
2650  AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
2651  AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
2652  AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
2653  AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
2654  };
2655 
2656  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
2657  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
2658  const int16_t *SubIndices = Sub0_15;
2659  int NElts = DstSize / 32;
2660 
2661  // 64-bit select is only available for SALU.
2662  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
2663  if (Pred == SCC_TRUE) {
2664  if (NElts % 2) {
2665  SelOp = AMDGPU::S_CSELECT_B32;
2666  EltRC = &AMDGPU::SGPR_32RegClass;
2667  } else {
2668  SelOp = AMDGPU::S_CSELECT_B64;
2669  EltRC = &AMDGPU::SGPR_64RegClass;
2670  SubIndices = Sub0_15_64;
2671  NElts /= 2;
2672  }
2673  }
2674 
2676  MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
2677 
2678  I = MIB->getIterator();
2679 
2681  for (int Idx = 0; Idx != NElts; ++Idx) {
2682  Register DstElt = MRI.createVirtualRegister(EltRC);
2683  Regs.push_back(DstElt);
2684 
2685  unsigned SubIdx = SubIndices[Idx];
2686 
2688  if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
2689  Select =
2690  BuildMI(MBB, I, DL, get(SelOp), DstElt)
2691  .addReg(FalseReg, 0, SubIdx)
2692  .addReg(TrueReg, 0, SubIdx);
2693  } else {
2694  Select =
2695  BuildMI(MBB, I, DL, get(SelOp), DstElt)
2696  .addReg(TrueReg, 0, SubIdx)
2697  .addReg(FalseReg, 0, SubIdx);
2698  }
2699 
2700  preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2702 
2703  MIB.addReg(DstElt)
2704  .addImm(SubIdx);
2705  }
2706 }
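// Illustrative expansion of a 64-bit VCC-based select from the loop above
// (virtual registers are hypothetical): two v_cndmask_b32_e32 instructions
// select the sub0 and sub1 halves of FalseReg/TrueReg into fresh 32-bit
// temporaries, and a REG_SEQUENCE glues the pieces back into DstReg. The
// SCC_TRUE 64-bit case instead uses a single s_cselect_b64, as handled earlier
// in the function.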
2707 
2709  switch (MI.getOpcode()) {
2710  case AMDGPU::V_MOV_B32_e32:
2711  case AMDGPU::V_MOV_B32_e64:
2712  case AMDGPU::V_MOV_B64_PSEUDO: {
2713  // If there are additional implicit register operands, this may be used for
2714  // register indexing so the source register operand isn't simply copied.
2715  unsigned NumOps = MI.getDesc().getNumOperands() +
2716  MI.getDesc().getNumImplicitUses();
2717 
2718  return MI.getNumOperands() == NumOps;
2719  }
2720  case AMDGPU::S_MOV_B32:
2721  case AMDGPU::S_MOV_B64:
2722  case AMDGPU::COPY:
2723  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2724  case AMDGPU::V_ACCVGPR_READ_B32_e64:
2725  case AMDGPU::V_ACCVGPR_MOV_B32:
2726  return true;
2727  default:
2728  return false;
2729  }
2730 }
2731 
2733  unsigned Kind) const {
2734  switch(Kind) {
2745  }
2746  return AMDGPUAS::FLAT_ADDRESS;
2747 }
2748 
2750  unsigned Opc = MI.getOpcode();
2751  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2752  AMDGPU::OpName::src0_modifiers);
2753  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2754  AMDGPU::OpName::src1_modifiers);
2755  int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2756  AMDGPU::OpName::src2_modifiers);
2757 
2758  MI.RemoveOperand(Src2ModIdx);
2759  MI.RemoveOperand(Src1ModIdx);
2760  MI.RemoveOperand(Src0ModIdx);
2761 }
2762 
2764  Register Reg, MachineRegisterInfo *MRI) const {
2765  if (!MRI->hasOneNonDBGUse(Reg))
2766  return false;
2767 
2768  switch (DefMI.getOpcode()) {
2769  default:
2770  return false;
2771  case AMDGPU::S_MOV_B64:
2772  // TODO: We could fold 64-bit immediates, but this gets complicated
2773  // when there are sub-registers.
2774  return false;
2775 
2776  case AMDGPU::V_MOV_B32_e32:
2777  case AMDGPU::S_MOV_B32:
2778  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2779  break;
2780  }
2781 
2782  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2783  assert(ImmOp);
2784  // FIXME: We could handle FrameIndex values here.
2785  if (!ImmOp->isImm())
2786  return false;
2787 
2788  unsigned Opc = UseMI.getOpcode();
2789  if (Opc == AMDGPU::COPY) {
2790  Register DstReg = UseMI.getOperand(0).getReg();
2791  bool Is16Bit = getOpSize(UseMI, 0) == 2;
2792  bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
2793  unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2794  APInt Imm(32, ImmOp->getImm());
2795 
2796  if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
2797  Imm = Imm.ashr(16);
2798 
2799  if (RI.isAGPR(*MRI, DstReg)) {
2800  if (!isInlineConstant(Imm))
2801  return false;
2802  NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2803  }
2804 
2805  if (Is16Bit) {
2806  if (isVGPRCopy)
2807  return false; // Do not clobber vgpr_hi16
2808 
2809  if (DstReg.isVirtual() &&
2810  UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
2811  return false;
2812 
2813  UseMI.getOperand(0).setSubReg(0);
2814  if (DstReg.isPhysical()) {
2815  DstReg = RI.get32BitRegister(DstReg);
2816  UseMI.getOperand(0).setReg(DstReg);
2817  }
2818  assert(UseMI.getOperand(1).getReg().isVirtual());
2819  }
2820 
2821  UseMI.setDesc(get(NewOpc));
2822  UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
2823  UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2824  return true;
2825  }
2826 
2827  if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
2828  Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
2829  Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2830  Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
2831  // Don't fold if we are using source or output modifiers. The new VOP2
2832  // instructions don't have them.
2834  return false;
2835 
2836  // If this is a free constant, there's no reason to do this.
2837  // TODO: We could fold this here instead of letting SIFoldOperands do it
2838  // later.
2839  MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2840 
2841  // Any src operand can be used for the legality check.
2842  if (isInlineConstant(UseMI, *Src0, *ImmOp))
2843  return false;
2844 
2845  bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
2846  Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
2847  bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2848  Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
2849  MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2850  MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2851 
2852  // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2853  // We should only expect these to be on src0 due to canonicalizations.
2854  if (Src0->isReg() && Src0->getReg() == Reg) {
2855  if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2856  return false;
2857 
2858  if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2859  return false;
2860 
2861  unsigned NewOpc =
2862  IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
2863  : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
2864  if (pseudoToMCOpcode(NewOpc) == -1)
2865  return false;
2866 
2867  // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2868 
2869  const int64_t Imm = ImmOp->getImm();
2870 
2871  // FIXME: This would be a lot easier if we could return a new instruction
2872  // instead of having to modify in place.
2873 
2874  // Remove these first since they are at the end.
2875  UseMI.RemoveOperand(
2876  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2877  UseMI.RemoveOperand(
2878  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2879 
2880  Register Src1Reg = Src1->getReg();
2881  unsigned Src1SubReg = Src1->getSubReg();
2882  Src0->setReg(Src1Reg);
2883  Src0->setSubReg(Src1SubReg);
2884  Src0->setIsKill(Src1->isKill());
2885 
2886  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2887  Opc == AMDGPU::V_MAC_F16_e64 ||
2888  Opc == AMDGPU::V_FMAC_F32_e64 ||
2889  Opc == AMDGPU::V_FMAC_F16_e64)
2890  UseMI.untieRegOperand(
2891  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2892 
2893  Src1->ChangeToImmediate(Imm);
2894 
2896  UseMI.setDesc(get(NewOpc));
2897 
2898  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2899  if (DeleteDef)
2900  DefMI.eraseFromParent();
2901 
2902  return true;
2903  }
2904 
2905  // Added part is the constant: Use v_madak_{f16, f32}.
2906  if (Src2->isReg() && Src2->getReg() == Reg) {
2907  // Not allowed to use constant bus for another operand.
2908  // We can however allow an inline immediate as src0.
2909  bool Src0Inlined = false;
2910  if (Src0->isReg()) {
2911  // Try to inline the constant if possible.
2912  // If the def is a move-immediate and has a single use, folding it here
2913  // saves a VGPR.
2915  if (Def && Def->isMoveImmediate() &&
2916  isInlineConstant(Def->getOperand(1)) &&
2917  MRI->hasOneUse(Src0->getReg())) {
2918  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2919  Src0Inlined = true;
2920  } else if ((Src0->getReg().isPhysical() &&
2921  (ST.getConstantBusLimit(Opc) <= 1 &&
2922  RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
2923  (Src0->getReg().isVirtual() &&
2924  (ST.getConstantBusLimit(Opc) <= 1 &&
2925  RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
2926  return false;
2927  // VGPR is okay as Src0 - fallthrough
2928  }
2929 
2930  if (Src1->isReg() && !Src0Inlined ) {
2931  // We have one slot for an inlinable constant so far - try to fill it.
2933  if (Def && Def->isMoveImmediate() &&
2934  isInlineConstant(Def->getOperand(1)) &&
2935  MRI->hasOneUse(Src1->getReg()) &&
2936  commuteInstruction(UseMI)) {
2937  Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2938  } else if ((Src1->getReg().isPhysical() &&
2939  RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2940  (Src1->getReg().isVirtual() &&
2941  RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2942  return false;
2943  // VGPR is okay as Src1 - fallthrough
2944  }
2945 
2946  unsigned NewOpc =
2947  IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
2948  : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
2949  if (pseudoToMCOpcode(NewOpc) == -1)
2950  return false;
2951 
2952  const int64_t Imm = ImmOp->getImm();
2953 
2954  // FIXME: This would be a lot easier if we could return a new instruction
2955  // instead of having to modify in place.
2956 
2957  // Remove these first since they are at the end.
2958  UseMI.RemoveOperand(
2959  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2960  UseMI.RemoveOperand(
2961  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2962 
2963  if (Opc == AMDGPU::V_MAC_F32_e64 ||
2964  Opc == AMDGPU::V_MAC_F16_e64 ||
2965  Opc == AMDGPU::V_FMAC_F32_e64 ||
2966  Opc == AMDGPU::V_FMAC_F16_e64)
2967  UseMI.untieRegOperand(
2968  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2969 
2970  // ChangeToImmediate adds Src2 back to the instruction.
2971  Src2->ChangeToImmediate(Imm);
2972 
2973  // These come before src2.
2975  UseMI.setDesc(get(NewOpc));
2976  // UseMI may have been commuted above, leaving an SGPR as src1. If so,
2977  // an inline constant together with an SGPR would exceed the constant
2978  // bus limit and be illegal, so the operands must be legalized.
2980 
2981  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2982  if (DeleteDef)
2983  DefMI.eraseFromParent();
2984 
2985  return true;
2986  }
2987  }
2988 
2989  return false;
2990 }
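// Example of the COPY folding path above, using hypothetical virtual
// registers: for
//   %imm:sgpr_32 = S_MOV_B32 42
//   %v:vgpr_32   = COPY %imm
// with the COPY as the only non-debug use of %imm, the COPY is rewritten in
// place to "%v:vgpr_32 = V_MOV_B32_e32 42". The MAD/FMA paths below instead
// rewrite the user to V_MADMK/V_MADAK (or the FMA forms) with the immediate
// inlined.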
2991 
2992 static bool
2995  if (BaseOps1.size() != BaseOps2.size())
2996  return false;
2997  for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
2998  if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
2999  return false;
3000  }
3001  return true;
3002 }
3003 
3004 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
3005  int WidthB, int OffsetB) {
3006  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3007  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3008  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3009  return LowOffset + LowWidth <= HighOffset;
3010 }
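// Arithmetic sketch for the helper above: accesses of (WidthA = 4, OffsetA =
// 0) and (WidthB = 8, OffsetB = 4) give LowOffset = 0, LowWidth = 4,
// HighOffset = 4, and 0 + 4 <= 4, so they are disjoint. Shrinking OffsetB to 2
// makes 0 + 4 > 2 and the ranges are treated as potentially overlapping.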
3011 
3012 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3013  const MachineInstr &MIb) const {
3014  SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3015  int64_t Offset0, Offset1;
3016  unsigned Dummy0, Dummy1;
3017  bool Offset0IsScalable, Offset1IsScalable;
3018  if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3019  Dummy0, &RI) ||
3020  !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3021  Dummy1, &RI))
3022  return false;
3023 
3024  if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3025  return false;
3026 
3027  if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3028  // FIXME: Handle ds_read2 / ds_write2.
3029  return false;
3030  }
3031  unsigned Width0 = MIa.memoperands().front()->getSize();
3032  unsigned Width1 = MIb.memoperands().front()->getSize();
3033  return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3034 }
3035 
3037  const MachineInstr &MIb) const {
3038  assert(MIa.mayLoadOrStore() &&
3039  "MIa must load from or modify a memory location");
3040  assert(MIb.mayLoadOrStore() &&
3041  "MIb must load from or modify a memory location");
3042 
3044  return false;
3045 
3046  // XXX - Can we relax this between address spaces?
3047  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3048  return false;
3049 
3050  // TODO: Should we check the address space from the MachineMemOperand? That
3051  // would allow us to distinguish objects we know don't alias based on the
3052  // underlying address space, even if it was lowered to a different one,
3053  // e.g. private accesses lowered to use MUBUF instructions on a scratch
3054  // buffer.
3055  if (isDS(MIa)) {
3056  if (isDS(MIb))
3057  return checkInstOffsetsDoNotOverlap(MIa, MIb);
3058 
3059  return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3060  }
3061 
3062  if (isMUBUF(MIa) || isMTBUF(MIa)) {
3063  if (isMUBUF(MIb) || isMTBUF(MIb))
3064  return checkInstOffsetsDoNotOverlap(MIa, MIb);
3065 
3066  return !isFLAT(MIb) && !isSMRD(MIb);
3067  }
3068 
3069  if (isSMRD(MIa)) {
3070  if (isSMRD(MIb))
3071  return checkInstOffsetsDoNotOverlap(MIa, MIb);
3072 
3073  return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
3074  }
3075 
3076  if (isFLAT(MIa)) {
3077  if (isFLAT(MIb))
3078  return checkInstOffsetsDoNotOverlap(MIa, MIb);
3079 
3080  return false;
3081  }
3082 
3083  return false;
3084 }
3085 
3087  int64_t &Imm) {
3088  if (Reg.isPhysical())
3089  return false;
3090  auto *Def = MRI.getUniqueVRegDef(Reg);
3091  if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3092  Imm = Def->getOperand(1).getImm();
3093  return true;
3094  }
3095  return false;
3096 }
3097 
3098 static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) {
3099  if (!MO->isReg())
3100  return false;
3101  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3102  const MachineRegisterInfo &MRI = MF->getRegInfo();
3103  return getFoldableImm(MO->getReg(), MRI, Imm);
3104 }
3105 
3107  MachineInstr &NewMI) {
3108  if (LV) {
3109  unsigned NumOps = MI.getNumOperands();
3110  for (unsigned I = 1; I < NumOps; ++I) {
3111  MachineOperand &Op = MI.getOperand(I);
3112  if (Op.isReg() && Op.isKill())
3113  LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3114  }
3115  }
3116 }
3117 
3119  LiveVariables *LV) const {
3120  unsigned Opc = MI.getOpcode();
3121  bool IsF16 = false;
3122  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3123  Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3124  Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3125  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3126 
3127  switch (Opc) {
3128  default:
3129  return nullptr;
3130  case AMDGPU::V_MAC_F16_e64:
3131  case AMDGPU::V_FMAC_F16_e64:
3132  IsF16 = true;
3134  case AMDGPU::V_MAC_F32_e64:
3135  case AMDGPU::V_FMAC_F32_e64:
3136  case AMDGPU::V_FMAC_F64_e64:
3137  break;
3138  case AMDGPU::V_MAC_F16_e32:
3139  case AMDGPU::V_FMAC_F16_e32:
3140  IsF16 = true;
3142  case AMDGPU::V_MAC_F32_e32:
3143  case AMDGPU::V_FMAC_F32_e32:
3144  case AMDGPU::V_FMAC_F64_e32: {
3145  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3146  AMDGPU::OpName::src0);
3147  const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3148  if (!Src0->isReg() && !Src0->isImm())
3149  return nullptr;
3150 
3151  if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3152  return nullptr;
3153 
3154  break;
3155  }
3156  }
3157 
3158  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3159  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3160  const MachineOperand *Src0Mods =
3161  getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3162  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3163  const MachineOperand *Src1Mods =
3164  getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3165  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3166  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3167  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3168  MachineInstrBuilder MIB;
3169  MachineBasicBlock &MBB = *MI.getParent();
3170 
3171  if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
3172  // If we have an SGPR input, we will violate the constant bus restriction.
3173  (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3174  !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3175  int64_t Imm;
3176  if (getFoldableImm(Src2, Imm)) {
3177  unsigned NewOpc =
3178  IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
3179  : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3180  if (pseudoToMCOpcode(NewOpc) != -1) {
3181  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3182  .add(*Dst)
3183  .add(*Src0)
3184  .add(*Src1)
3185  .addImm(Imm);
3186  updateLiveVariables(LV, MI, *MIB);
3187  return MIB;
3188  }
3189  }
3190  unsigned NewOpc = IsFMA
3191  ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
3192  : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3193  if (getFoldableImm(Src1, Imm)) {
3194  if (pseudoToMCOpcode(NewOpc) != -1) {
3195  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3196  .add(*Dst)
3197  .add(*Src0)
3198  .addImm(Imm)
3199  .add(*Src2);
3200  updateLiveVariables(LV, MI, *MIB);
3201  return MIB;
3202  }
3203  }
3204  if (getFoldableImm(Src0, Imm)) {
3205  if (pseudoToMCOpcode(NewOpc) != -1 &&
3207  MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
3208  Src1)) {
3209  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3210  .add(*Dst)
3211  .add(*Src1)
3212  .addImm(Imm)
3213  .add(*Src2);
3214  updateLiveVariables(LV, MI, *MIB);
3215  return MIB;
3216  }
3217  }
3218  }
3219 
3220  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
3221  : IsF64 ? AMDGPU::V_FMA_F64_e64
3222  : AMDGPU::V_FMA_F32_e64)
3223  : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
3224  if (pseudoToMCOpcode(NewOpc) == -1)
3225  return nullptr;
3226 
3227  MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3228  .add(*Dst)
3229  .addImm(Src0Mods ? Src0Mods->getImm() : 0)
3230  .add(*Src0)
3231  .addImm(Src1Mods ? Src1Mods->getImm() : 0)
3232  .add(*Src1)
3233  .addImm(0) // Src mods
3234  .add(*Src2)
3235  .addImm(Clamp ? Clamp->getImm() : 0)
3236  .addImm(Omod ? Omod->getImm() : 0);
3237  updateLiveVariables(LV, MI, *MIB);
3238  return MIB;
3239 }
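// Example of the conversion above: a two-address V_MAC_F32 (dst += src0 *
// src1) whose accumulator is defined by a foldable move-immediate is rewritten
// to V_MADAK_F32 (dst = src0 * src1 + K); if src0 or src1 is the foldable
// immediate instead, V_MADMK_F32 (dst = src0 * K + src2) is used. When no
// immediate can be folded, or modifiers rule the MADK forms out, the plain
// three-address V_MAD_*_e64 / V_FMA_*_e64 form is built at the end, carrying
// the source modifiers, clamp, and omod through.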
3240 
3241 // It's not generally safe to move VALU instructions across these since it will
3242 // start using the register as a base index rather than directly.
3243 // XXX - Why isn't hasSideEffects sufficient for these?
3245  switch (MI.getOpcode()) {
3246  case AMDGPU::S_SET_GPR_IDX_ON:
3247  case AMDGPU::S_SET_GPR_IDX_MODE:
3248  case AMDGPU::S_SET_GPR_IDX_OFF:
3249  return true;
3250  default:
3251  return false;
3252  }
3253 }
3254 
3256  const MachineBasicBlock *MBB,
3257  const MachineFunction &MF) const {
3258  // Skipping the check for SP writes in the base implementation. The reason it
3259  // was added was apparently due to compile time concerns.
3260  //
3261  // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
3262  // but is probably avoidable.
3263 
3264  // Copied from base implementation.
3265  // Terminators and labels can't be scheduled around.
3266  if (MI.isTerminator() || MI.isPosition())
3267  return true;
3268 
3269  // INLINEASM_BR can jump to another block
3270  if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
3271  return true;
3272 
3273  // Target-independent instructions do not have an implicit-use of EXEC, even
3274  // when they operate on VGPRs. Treating EXEC modifications as scheduling
3275  // boundaries prevents incorrect movements of such instructions.
3276  return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
3277  MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
3278  MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
3280 }
3281 
3283  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
3284  Opcode == AMDGPU::DS_GWS_INIT ||
3285  Opcode == AMDGPU::DS_GWS_SEMA_V ||
3286  Opcode == AMDGPU::DS_GWS_SEMA_BR ||
3287  Opcode == AMDGPU::DS_GWS_SEMA_P ||
3288  Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
3289  Opcode == AMDGPU::DS_GWS_BARRIER;
3290 }
3291 
3293  // Skip the full operand and register alias search modifiesRegister
3294  // does. There's only a handful of instructions that touch this, it's only an
3295  // implicit def, and doesn't alias any other registers.
3296  if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
3297  for (; ImpDef && *ImpDef; ++ImpDef) {
3298  if (*ImpDef == AMDGPU::MODE)
3299  return true;
3300  }
3301  }
3302 
3303  return false;
3304 }
3305 
3307  unsigned Opcode = MI.getOpcode();
3308 
3309  if (MI.mayStore() && isSMRD(MI))
3310  return true; // scalar store or atomic
3311 
3312  // This will terminate the function when other lanes may need to continue.
3313  if (MI.isReturn())
3314  return true;
3315 
3316  // These instructions cause shader I/O that may cause hardware lockups
3317  // when executed with an empty EXEC mask.
3318  //
3319  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
3320  // EXEC = 0, but checking for that case here seems not worth it
3321  // given the typical code patterns.
3322  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
3323  isEXP(Opcode) ||
3324  Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
3325  Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
3326  return true;
3327 
3328  if (MI.isCall() || MI.isInlineAsm())
3329  return true; // conservative assumption
3330 
3331  // A mode change is a scalar operation that influences vector instructions.
3332  if (modifiesModeRegister(MI))
3333  return true;
3334 
3335  // These are like SALU instructions in terms of effects, so it's questionable
3336  // whether we should return true for those.
3337  //
3338  // However, executing them with EXEC = 0 causes them to operate on undefined
3339  // data, which we avoid by returning true here.
3340  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
3341  Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
3342  return true;
3343 
3344  return false;
3345 }
3346 
3348  const MachineInstr &MI) const {
3349  if (MI.isMetaInstruction())
3350  return false;
3351 
3352  // This won't read exec if this is an SGPR->SGPR copy.
3353  if (MI.isCopyLike()) {
3354  if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
3355  return true;
3356 
3357  // Make sure this isn't copying exec as a normal operand
3358  return MI.readsRegister(AMDGPU::EXEC, &RI);
3359  }
3360 
3361  // Make a conservative assumption about the callee.
3362  if (MI.isCall())
3363  return true;
3364 
3365  // Be conservative with any unhandled generic opcodes.
3366  if (!isTargetSpecificOpcode(MI.getOpcode()))
3367  return true;
3368 
3369  return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
3370 }
3371 
3372 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
3373  switch (Imm.getBitWidth()) {
3374  case 1: // This likely will be a condition code mask.
3375  return true;
3376 
3377  case 32:
3379  ST.hasInv2PiInlineImm());
3380  case 64:
3382  ST.hasInv2PiInlineImm());
3383  case 16:
3384  return ST.has16BitInsts() &&
3386  ST.hasInv2PiInlineImm());
3387  default:
3388  llvm_unreachable("invalid bitwidth");
3389  }
3390 }
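// For reference, the inline operand encodings these checks accept are the
// small integers -16..64 and a fixed set of floating-point values (+-0.5,
// +-1.0, +-2.0, +-4.0, and 1/(2*pi) when the subtarget has the inv2pi inline
// immediate), interpreted at the operand's width. Anything else must be
// emitted as a literal constant instead.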
3391 
3393  uint8_t OperandType) const {
3394  if (!MO.isImm() ||
3397  return false;
3398 
3399  // MachineOperand provides no way to tell the true operand size, since it only
3400  // records a 64-bit value. We need to know the size to determine if a 32-bit
3401  // floating point immediate bit pattern is legal for an integer immediate. It
3402  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
3403 
3404  int64_t Imm = MO.getImm();
3405  switch (OperandType) {
3417  int32_t Trunc = static_cast<int32_t>(Imm);
3419  }
3426  ST.hasInv2PiInlineImm());
3430  // We would expect inline immediates to not be concerned with an integer/fp
3431  // distinction. However, in the case of 16-bit integer operations, the
3432  // "floating point" values appear to not work. It seems read the low 16-bits
3433  // of 32-bit immediates, which happens to always work for the integer
3434  // values.
3435  //
3436  // See llvm bugzilla 46302.
3437  //
3438  // TODO: Theoretically we could use op-sel to use the high bits of the
3439  // 32-bit FP values.
3440  return AMDGPU::isInlinableIntLiteral(Imm);
3444  // This suffers the same problem as the scalar 16-bit cases.
3450  if (isInt<16>(Imm) || isUInt<16>(Imm)) {
3451  // A few special case instructions have 16-bit operands on subtargets
3452  // where 16-bit instructions are not legal.
3453  // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
3454  // constants in these cases
3455  int16_t Trunc = static_cast<int16_t>(Imm);
3456  return ST.has16BitInsts() &&
3458  }
3459 
3460  return false;
3461  }
3465  uint32_t Trunc = static_cast<uint32_t>(Imm);
3467  }
3468  default:
3469  llvm_unreachable("invalid bitwidth");
3470  }
3471 }
3472 
3474  const MCOperandInfo &OpInfo) const {
3475  switch (MO.getType()) {
3477  return false;
3479  return !isInlineConstant(MO, OpInfo);
3485  return true;
3486  default:
3487  llvm_unreachable("unexpected operand type");
3488  }
3489 }
3490 
3491 static bool compareMachineOp(const MachineOperand &Op0,
3492  const MachineOperand &Op1) {
3493  if (Op0.getType() != Op1.getType())
3494  return false;
3495 
3496  switch (Op0.getType()) {
3498  return Op0.getReg() == Op1.getReg();
3500  return Op0.getImm() == Op1.getImm();
3501  default:
3502  llvm_unreachable("Didn't expect to be comparing these operand types");
3503  }
3504 }
3505 
3507  const MachineOperand &MO) const {
3508  const MCInstrDesc &InstDesc = MI.getDesc();
3509  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
3510 
3511  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
3512 
3513  if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
3514  return true;
3515 
3516  if (OpInfo.RegClass < 0)
3517  return false;
3518 
3519  if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
3520  if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
3521  OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3522  AMDGPU::OpName::src2))
3523  return false;
3524  return RI.opCanUseInlineConstant(OpInfo.OperandType);
3525  }
3526 
3527  if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
3528  return false;
3529 
3530  if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
3531  return true;
3532 
3533  return ST.hasVOP3Literal();
3534 }
3535 
3536 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
3537  // GFX90A does not have V_MUL_LEGACY_F32_e32.
3538  if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
3539  return false;
3540 
3541  int Op32 = AMDGPU::getVOPe32(Opcode);
3542  if (Op32 == -1)
3543  return false;
3544 
3545  return pseudoToMCOpcode(Op32) != -1;
3546 }
3547 
3548 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
3549  // The src0_modifier operand is present on all instructions
3550  // that have modifiers.
3551 
3552  return AMDGPU::getNamedOperandIdx(Opcode,
3553  AMDGPU::OpName::src0_modifiers) != -1;
3554 }
3555 
3556 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
3557  unsigned OpName) const {
3558  const MachineOperand *Mods = getNamedOperand(MI, OpName);
3559  return Mods && Mods->getImm();
3560 }
3561 
3562 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
3563  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
3564  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
3565  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
3566  hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
3567  hasModifiersSet(MI, AMDGPU::OpName::omod);
3568 }
3569 
3570 bool SIInstrInfo::canShrink(const MachineInstr &MI,
3571  const MachineRegisterInfo &MRI) const {
3572  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3573  // Can't shrink instruction with three operands.
3574  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
3575  // a special case for it. It can only be shrunk if the third operand
3576  // is vcc, and src0_modifiers and src1_modifiers are not set.
3577  // We should handle this the same way we handle vopc, by adding
3578  // a register allocation hint pre-regalloc and then doing the shrinking
3579  // post-regalloc.
3580  if (Src2) {
3581  switch (MI.getOpcode()) {
3582  default: return false;
3583 
3584  case AMDGPU::V_ADDC_U32_e64:
3585  case AMDGPU::V_SUBB_U32_e64:
3586  case AMDGPU::V_SUBBREV_U32_e64: {
3587  const MachineOperand *Src1
3588  = getNamedOperand(MI, AMDGPU::OpName::src1);
3589  if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
3590  return false;
3591  // Additional verification is needed for sdst/src2.
3592  return true;
3593  }
3594  case AMDGPU::V_MAC_F32_e64:
3595  case AMDGPU::V_MAC_F16_e64:
3596  case AMDGPU::V_FMAC_F32_e64:
3597  case AMDGPU::V_FMAC_F16_e64:
3598  case AMDGPU::V_FMAC_F64_e64:
3599  if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
3600  hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
3601  return false;
3602  break;
3603 
3604  case AMDGPU::V_CNDMASK_B32_e64:
3605  break;
3606  }
3607  }
3608 
3609  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3610  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
3611  hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
3612  return false;
3613 
3614  // We don't need to check src0, all input types are legal, so just make sure
3615  // src0 isn't using any modifiers.
3616  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
3617  return false;
3618 
3619  // Can it be shrunk to a valid 32 bit opcode?
3620  if (!hasVALU32BitEncoding(MI.getOpcode()))
3621  return false;
3622 
3623  // Check output modifiers
3624  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
3625  !hasModifiersSet(MI, AMDGPU::OpName::clamp);
3626 }
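
// Example of the rule above (illustrative): V_ADD_F32_e64 with both sources in
// VGPRs and no clamp/omod/source modifiers can be rewritten as V_ADD_F32_e32;
// the same instruction with an SGPR in src1 or with clamp set has no legal
// 32-bit encoding and must stay in VOP3 form.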
3627 
3628 // Set VCC operand with all flags from \p Orig, except for setting it as
3629 // implicit.
3630 static void copyFlagsToImplicitVCC(MachineInstr &MI,
3631  const MachineOperand &Orig) {
3632 
3633  for (MachineOperand &Use : MI.implicit_operands()) {
3634  if (Use.isUse() &&
3635  (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
3636  Use.setIsUndef(Orig.isUndef());
3637  Use.setIsKill(Orig.isKill());
3638  return;
3639  }
3640  }
3641 }
3642 
3643 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
3644  unsigned Op32) const {
3645  MachineBasicBlock *MBB = MI.getParent();
3646  MachineInstrBuilder Inst32 =
3647  BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
3648  .setMIFlags(MI.getFlags());
3649 
3650  // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
3651  // For VOPC instructions, this is replaced by an implicit def of vcc.
3652  int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
3653  if (Op32DstIdx != -1) {
3654  // dst
3655  Inst32.add(MI.getOperand(0));
3656  } else {
3657  assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
3658  (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
3659  "Unexpected case");
3660  }
3661 
3662  Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
3663 
3664  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3665  if (Src1)
3666  Inst32.add(*Src1);
3667 
3668  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3669 
3670  if (Src2) {
3671  int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
3672  if (Op32Src2Idx != -1) {
3673  Inst32.add(*Src2);
3674  } else {
3675  // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
3676  // replaced with an implicit read of vcc or vcc_lo. The implicit read
3677  // of vcc was already added during the initial BuildMI, but we
3678  // 1) may need to change vcc to vcc_lo to preserve the original register
3679  // 2) have to preserve the original flags.
3680  fixImplicitOperands(*Inst32);
3681  copyFlagsToImplicitVCC(*Inst32, *Src2);
3682  }
3683  }
3684 
3685  return Inst32;
3686 }
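
// Sketch of how the two hooks above are typically used together (simplified;
// the real shrinking pass also moves implicit operands over and erases MI):
//   int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
//   if (Op32 != -1 && TII->canShrink(MI, MRI)) {
//     MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
//     // ... transfer remaining implicit operands, then erase MI ...
//   }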
3687 
3688 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
3689  const MachineOperand &MO,
3690  const MCOperandInfo &OpInfo) const {
3691  // Literal constants use the constant bus.
3692  //if (isLiteralConstantLike(MO, OpInfo))
3693  // return true;
3694  if (MO.isImm())
3695  return !isInlineConstant(MO, OpInfo);
3696 
3697  if (!MO.isReg())
3698  return true; // Misc other operands like FrameIndex
3699 
3700  if (!MO.isUse())
3701  return false;
3702 
3703  if (MO.getReg().isVirtual())
3704  return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
3705 
3706  // Null is free
3707  if (MO.getReg() == AMDGPU::SGPR_NULL)
3708  return false;
3709 
3710  // SGPRs use the constant bus
3711  if (MO.isImplicit()) {
3712  return MO.getReg() == AMDGPU::M0 ||
3713  MO.getReg() == AMDGPU::VCC ||
3714  MO.getReg() == AMDGPU::VCC_LO;
3715  } else {
3716  return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
3717  AMDGPU::SReg_64RegClass.contains(MO.getReg());
3718  }
3719 }
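
// Background: VALU instructions have a single scalar operand path (the
// "constant bus"). SGPRs, M0, VCC and literals travel over it, while VGPRs and
// inline constants do not, which is what this predicate models. For example,
// v_add_f32 v0, s2, v1 needs one constant bus slot, whereas a VOP3 add with
// two distinct SGPR sources needs two and is only legal on subtargets where
// getConstantBusLimit() allows it (GFX10 and later).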
3720 
3721 static Register findImplicitSGPRRead(const MachineInstr &MI) {
3722  for (const MachineOperand &MO : MI.implicit_operands()) {
3723  // We only care about reads.
3724  if (MO.isDef())
3725  continue;
3726 
3727  switch (MO.getReg()) {
3728  case AMDGPU::VCC:
3729  case AMDGPU::VCC_LO:
3730  case AMDGPU::VCC_HI:
3731  case AMDGPU::M0:
3732  case AMDGPU::FLAT_SCR:
3733  return MO.getReg();
3734 
3735  default:
3736  break;
3737  }
3738  }
3739 
3740  return AMDGPU::NoRegister;
3741 }
3742 
3743 static bool shouldReadExec(const MachineInstr &MI) {
3744  if (SIInstrInfo::isVALU(MI)) {
3745  switch (MI.getOpcode()) {
3746  case AMDGPU::V_READLANE_B32:
3747  case AMDGPU::V_WRITELANE_B32:
3748  return false;
3749  }
3750 
3751  return true;
3752  }
3753 
3754  if (MI.isPreISelOpcode() ||
3755  SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
3756  SIInstrInfo::isSALU(MI) ||
3757  SIInstrInfo::isSMRD(MI))
3758  return false;
3759 
3760  return true;
3761 }
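
// Note: V_READLANE_B32 / V_WRITELANE_B32 are exempt above because they select
// a single lane explicitly and do not depend on which lanes are active, so
// they are not expected to carry an implicit exec read.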
3762 
3763 static bool isSubRegOf(const SIRegisterInfo &TRI,
3764  const MachineOperand &SuperVec,
3765  const MachineOperand &SubReg) {
3766  if (SubReg.getReg().isPhysical())
3767  return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
3768 
3769  return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
3770  SubReg.getReg() == SuperVec.getReg();
3771 }
3772 
3773 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
3774  StringRef &ErrInfo) const {
3775  uint16_t Opcode = MI.getOpcode();
3776  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
3777  return true;
3778 
3779  const MachineFunction *MF = MI.getParent()->getParent();
3780  const MachineRegisterInfo &MRI = MF->getRegInfo();
3781 
3782  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
3783  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
3784  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
3785 
3786  // Make sure the number of operands is correct.
3787  const MCInstrDesc &Desc = get(Opcode);
3788  if (!Desc.isVariadic() &&
3789  Desc.getNumOperands() != MI.getNumExplicitOperands()) {
3790  ErrInfo = "Instruction has wrong number of operands.";
3791  return false;
3792  }
3793 
3794  if (MI.isInlineAsm()) {
3795  // Verify register classes for inlineasm constraints.
3796  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
3797  I != E; ++I) {
3798  const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
3799  if (!RC)
3800  continue;
3801 
3802  const MachineOperand &Op = MI.getOperand(I);
3803  if (!Op.isReg())
3804  continue;
3805 
3806  Register Reg = Op.getReg();
3807  if (!Reg.isVirtual() && !RC->contains(Reg)) {
3808  ErrInfo = "inlineasm operand has incorrect register class.";
3809  return false;
3810  }
3811  }
3812 
3813  return true;
3814  }
3815 
3816  if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
3817  ErrInfo = "missing memory operand from MIMG instruction.";
3818  return false;
3819  }
3820 
3821  // Make sure the register classes are correct.
3822  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
3823  const MachineOperand &MO = MI.getOperand(i);
3824  if (MO.isFPImm()) {
3825  ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
3826  "all fp values to integers.";
3827  return false;
3828  }
3829 
3830  int RegClass = Desc.OpInfo[i].RegClass;
3831 
3832  switch (Desc.OpInfo[i].OperandType) {
3834  if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
3835  ErrInfo = "Illegal immediate value for operand.";
3836  return false;
3837  }
3838  break;
3842  break;
3854  if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
3855  ErrInfo = "Illegal immediate value for operand.";
3856  return false;
3857  }
3858  break;
3859  }
3862  // Check if this operand is an immediate.
3863  // FrameIndex operands will be replaced by immediates, so they are
3864  // allowed.
3865  if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
3866  ErrInfo = "Expected immediate, but got non-immediate";
3867  return false;
3868  }
3870  default:
3871  continue;
3872  }
3873 
3874  if (!MO.isReg())
3875  continue;
3876  Register Reg = MO.getReg();
3877  if (!Reg)
3878  continue;
3879 
3880  // FIXME: Ideally we would have separate instruction definitions with the
3881  // aligned register constraint.
3882  // FIXME: We do not verify inline asm operands, but custom inline asm
3883  // verification is broken anyway
3884  if (ST.needsAlignedVGPRs()) {
3885  const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
3886  const bool IsVGPR = RI.hasVGPRs(RC);
3887  const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
3888  if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
3889  const TargetRegisterClass *SubRC =
3890  RI.getSubRegClass(RC, MO.getSubReg());
3891  RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
3892  if (RC)
3893  RC = SubRC;
3894  }
3895 
3896  // Check that this is the aligned version of the class.
3897  if (!RC || !RI.isProperlyAlignedRC(*RC)) {
3898  ErrInfo = "Subtarget requires even aligned vector registers";
3899  return false;
3900  }
3901  }
3902 
3903  if (RegClass != -1) {
3904  if (Reg.isVirtual())
3905  continue;
3906 
3907  const TargetRegisterClass *RC = RI.getRegClass(RegClass);
3908  if (!RC->contains(Reg)) {
3909  ErrInfo = "Operand has incorrect register class.";
3910  return false;
3911  }
3912  }
3913  }
3914 
3915  // Verify SDWA
3916  if (isSDWA(MI)) {
3917  if (!ST.hasSDWA()) {
3918  ErrInfo = "SDWA is not supported on this target";
3919  return false;
3920  }
3921 
3922  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
3923 
3924  const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
3925 
3926  for (int OpIdx : OpIndices) {
3927  if (OpIdx == -1)
3928  continue;
3929  const MachineOperand &MO = MI.getOperand(OpIdx);
3930 
3931  if (!ST.hasSDWAScalar()) {
3932  // Only VGPRs are allowed on VI
3933  if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
3934  ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
3935  return false;
3936  }
3937  } else {
3938  // No immediates on GFX9
3939  if (!MO.isReg()) {
3940  ErrInfo =
3941  "Only reg allowed as operands in SDWA instructions on GFX9+";
3942  return false;
3943  }
3944  }
3945  }
3946 
3947  if (!ST.hasSDWAOmod()) {
3948  // No omod allowed on VI
3949  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3950  if (OMod != nullptr &&
3951  (!OMod->isImm() || OMod->getImm() != 0)) {
3952  ErrInfo = "OMod not allowed in SDWA instructions on VI";
3953  return false;
3954  }
3955  }
3956 
3957  uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
3958  if (isVOPC(BasicOpcode)) {
3959  if (!ST.hasSDWASdst() && DstIdx != -1) {
3960  // Only vcc allowed as dst on VI for VOPC
3961  const MachineOperand &Dst = MI.getOperand(DstIdx);
3962  if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
3963  ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
3964  return false;
3965  }
3966  } else if (!ST.hasSDWAOutModsVOPC()) {
3967  // No clamp allowed on GFX9 for VOPC
3968  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3969  if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
3970  ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
3971  return false;
3972  }
3973 
3974  // No omod allowed on GFX9 for VOPC
3975  const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3976  if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
3977  ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
3978  return false;
3979  }
3980  }
3981  }
3982 
3983  const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
3984  if (DstUnused && DstUnused->isImm() &&
3986  const MachineOperand &Dst = MI.getOperand(DstIdx);
3987  if (!Dst.isReg() || !Dst.isTied()) {
3988  ErrInfo = "Dst register should have tied register";
3989  return false;
3990  }
3991 
3992  const MachineOperand &TiedMO =
3993  MI.getOperand(MI.findTiedOperandIdx(DstIdx));
3994  if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
3995  ErrInfo =
3996  "Dst register should be tied to implicit use of preserved register";
3997  return false;
3998  } else if (TiedMO.getReg().isPhysical() &&
3999  Dst.getReg() != TiedMO.getReg()) {
4000  ErrInfo = "Dst register should use same physical register as preserved";
4001  return false;
4002  }
4003  }
4004  }
4005 
4006  // Verify MIMG
4007  if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
4008  // Ensure that the return type used is large enough for all the options
4009  // being used; TFE/LWE require an extra result register.
4010  const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4011  if (DMask) {
4012  uint64_t DMaskImm = DMask->getImm();
4013  uint32_t RegCount =
4014  isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
4015  const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4016  const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4017  const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4018 
4019  // Adjust for packed 16 bit values
4020  if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4021  RegCount >>= 1;
4022 
4023  // Adjust if using LWE or TFE
4024  if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4025  RegCount += 1;
4026 
4027  const uint32_t DstIdx =
4028  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4029  const MachineOperand &Dst = MI.getOperand(DstIdx);
4030  if (Dst.isReg()) {
4031  const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4032  uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4033  if (RegCount > DstSize) {
4034  ErrInfo = "MIMG instruction returns too many registers for dst "
4035  "register class";
4036  return false;
4037  }
4038  }
4039  }
4040  }
4041 
4042  // Verify VOP*. Ignore multiple sgpr operands on writelane.
4043  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
4044  && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
4045  // Only look at the true operands. Only a real operand can use the constant
4046  // bus, and we don't want to check pseudo-operands like the source modifier
4047  // flags.
4048  const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
4049 
4050  unsigned ConstantBusCount = 0;
4051  bool UsesLiteral = false;
4052  const MachineOperand *LiteralVal = nullptr;
4053 
4054  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
4055  ++ConstantBusCount;
4056 
4057  SmallVector<Register, 2> SGPRsUsed;
4058  Register SGPRUsed;
4059 
4060  for (int OpIdx : OpIndices) {
4061  if (OpIdx == -1)
4062  break;
4063  const MachineOperand &MO = MI.getOperand(OpIdx);
4064  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4065  if (MO.isReg()) {
4066  SGPRUsed = MO.getReg();
4067  if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) {
4068  return SGPRUsed != SGPR;
4069  })) {
4070  ++ConstantBusCount;
4071  SGPRsUsed.push_back(SGPRUsed);
4072  }
4073  } else {
4074  if (!UsesLiteral) {
4075  ++ConstantBusCount;
4076  UsesLiteral = true;
4077  LiteralVal = &MO;
4078  } else if (!MO.isIdenticalTo(*LiteralVal)) {
4079  assert(isVOP3(MI));
4080  ErrInfo = "VOP3 instruction uses more than one literal";
4081  return false;
4082  }
4083  }
4084  }
4085  }
4086 
4087  SGPRUsed = findImplicitSGPRRead(MI);
4088  if (SGPRUsed != AMDGPU::NoRegister) {
4089  // Implicit uses may safely overlap true operands
4090  if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4091  return !RI.regsOverlap(SGPRUsed, SGPR);
4092  })) {
4093  ++ConstantBusCount;
4094  SGPRsUsed.push_back(SGPRUsed);
4095  }
4096  }
4097 
4098  // v_writelane_b32 is an exception to the constant bus restriction:
4099  // vsrc0 can be an SGPR, a constant or m0, and the lane select an SGPR, m0 or an inline constant.
4100  if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4101  Opcode != AMDGPU::V_WRITELANE_B32) {
4102  ErrInfo = "VOP* instruction violates constant bus restriction";
4103  return false;
4104  }
4105 
4106  if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4107  ErrInfo = "VOP3 instruction uses literal";
4108  return false;
4109  }
4110  }
4111 
4112  // Special case for writelane - this can break the multiple constant bus rule,
4113  // but still can't use more than one SGPR register
4114  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4115  unsigned SGPRCount = 0;
4116  Register SGPRUsed = AMDGPU::NoRegister;
4117 
4118  for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
4119  if (OpIdx == -1)
4120  break;
4121 
4122  const MachineOperand &MO = MI.getOperand(OpIdx);
4123 
4124  if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4125  if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4126  if (MO.getReg() != SGPRUsed)
4127  ++SGPRCount;
4128  SGPRUsed = MO.getReg();
4129  }
4130  }
4131  if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4132  ErrInfo = "WRITELANE instruction violates constant bus restriction";
4133  return false;
4134  }
4135  }
4136  }
4137 
4138  // Verify misc. restrictions on specific instructions.
4139  if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4140  Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4141  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4142  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4143  const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4144  if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4145  if (!compareMachineOp(Src0, Src1) &&
4146  !compareMachineOp(Src0, Src2)) {
4147  ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4148  return false;
4149  }
4150  }
4151  if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4152  SISrcMods::ABS) ||
4153  (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4154  SISrcMods::ABS) ||
4155  (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4156  SISrcMods::ABS)) {
4157  ErrInfo = "ABS not allowed in VOP3B instructions";
4158  return false;
4159  }
4160  }
4161 
4162  if (isSOP2(MI) || isSOPC(MI)) {
4163  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4164  const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4165  unsigned Immediates = 0;
4166 
4167  if (!Src0.isReg() &&
4168  !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
4169  Immediates++;
4170  if (!Src1.isReg() &&
4171  !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
4172  Immediates++;
4173 
4174  if (Immediates > 1) {
4175  ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4176  return false;
4177  }
4178  }
4179 
4180  if (isSOPK(MI)) {
4181  auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
4182  if (Desc.isBranch()) {
4183  if (!Op->isMBB()) {
4184  ErrInfo = "invalid branch target for SOPK instruction";
4185  return false;
4186  }
4187  } else {
4188  uint64_t Imm = Op->getImm();
4189  if (sopkIsZext(MI)) {
4190  if (!isUInt<16>(Imm)) {
4191  ErrInfo = "invalid immediate for SOPK instruction";
4192  return false;
4193  }
4194  } else {
4195  if (!isInt<16>(Imm)) {
4196  ErrInfo = "invalid immediate for SOPK instruction";
4197  return false;
4198  }
4199  }
4200  }
4201  }
4202 
4203  if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
4204  Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
4205  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4206  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
4207  const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4208  Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
4209 
4210  const unsigned StaticNumOps = Desc.getNumOperands() +
4211  Desc.getNumImplicitUses();
4212  const unsigned NumImplicitOps = IsDst ? 2 : 1;
4213 
4214  // Allow additional implicit operands. This allows a fixup done by the post
4215  // RA scheduler where the main implicit operand is killed and implicit-defs
4216  // are added for sub-registers that remain live after this instruction.
4217  if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
4218  ErrInfo = "missing implicit register operands";
4219  return false;
4220  }
4221 
4222  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4223  if (IsDst) {
4224  if (!Dst->isUse()) {
4225  ErrInfo = "v_movreld_b32 vdst should be a use operand";
4226  return false;
4227  }
4228 
4229  unsigned UseOpIdx;
4230  if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
4231  UseOpIdx != StaticNumOps + 1) {
4232  ErrInfo = "movrel implicit operands should be tied";
4233  return false;
4234  }
4235  }
4236 
4237  const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4238  const MachineOperand &ImpUse
4239  = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
4240  if (!ImpUse.isReg() || !ImpUse.isUse() ||
4241  !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
4242  ErrInfo = "src0 should be subreg of implicit vector use";
4243  return false;
4244  }
4245  }
4246 
4247  // Make sure we aren't losing exec uses in the td files. This mostly requires
4248  // being careful when using let Uses to try to add other use registers.
4249  if (shouldReadExec(MI)) {
4250  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
4251  ErrInfo = "VALU instruction does not implicitly read exec mask";
4252  return false;
4253  }
4254  }
4255 
4256  if (isSMRD(MI)) {
4257  if (MI.mayStore()) {
4258  // The register offset form of scalar stores may only use m0 as the
4259  // soffset register.
4260  const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
4261  if (Soff && Soff->getReg() != AMDGPU::M0) {
4262  ErrInfo = "scalar stores must use m0 as offset register";
4263  return false;
4264  }
4265  }
4266  }
4267 
4268  if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
4269  const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4270  if (Offset->getImm() != 0) {
4271  ErrInfo = "subtarget does not support offsets in flat instructions";
4272  return false;
4273  }
4274  }
4275 
4276  if (isMIMG(MI)) {
4277  const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
4278  if (DimOp) {
4279  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
4280  AMDGPU::OpName::vaddr0);
4281  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
4282  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
4283  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4284  AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
4285  const AMDGPU::MIMGDimInfo *Dim =
4286  AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
4287 
4288  if (!Dim) {
4289  ErrInfo = "dim is out of range";
4290  return false;
4291  }
4292 
4293  bool IsA16 = false;
4294  if (ST.hasR128A16()) {
4295  const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
4296  IsA16 = R128A16->getImm() != 0;
4297  } else if (ST.hasGFX10A16()) {
4298  const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
4299  IsA16 = A16->getImm() != 0;
4300  }
4301 
4302  bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
4303 
4304  unsigned AddrWords =
4305  AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
4306 
4307  unsigned VAddrWords;
4308  if (IsNSA) {
4309  VAddrWords = SRsrcIdx - VAddr0Idx;
4310  } else {
4311  const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
4312  VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
4313  if (AddrWords > 8)
4314  AddrWords = 16;
4315  }
4316 
4317  if (VAddrWords != AddrWords) {
4318  LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
4319  << " but got " << VAddrWords << "\n");
4320  ErrInfo = "bad vaddr size";
4321  return false;
4322  }
4323  }
4324  }
4325 
4326  const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
4327  if (DppCt) {
4328  using namespace AMDGPU::DPP;
4329 
4330  unsigned DC = DppCt->getImm();
4338  ErrInfo = "Invalid dpp_ctrl value";
4339  return false;
4340  }
4341  if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
4342  ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4343  ErrInfo = "Invalid dpp_ctrl value: "
4344  "wavefront shifts are not supported on GFX10+";
4345  return false;
4346  }
4347  if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
4348  ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4349  ErrInfo = "Invalid dpp_ctrl value: "
4350  "broadcasts are not supported on GFX10+";
4351  return false;
4352  }
4354  ST.getGeneration() < AMDGPUSubtarget::GFX10) {
4357  !ST.hasGFX90AInsts()) {
4358  ErrInfo = "Invalid dpp_ctrl value: "
4359  "row_newbroadcast/row_share is not supported before "
4360  "GFX90A/GFX10";
4361  return false;
4362  } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
4363  ErrInfo = "Invalid dpp_ctrl value: "
4364  "row_share and row_xmask are not supported before GFX10";
4365  return false;
4366  }
4367  }
4368 
4369  int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4370  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4371 
4372  if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
4373  ((DstIdx >= 0 &&
4374  (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
4375  Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
4376  ((Src0Idx >= 0 &&
4377  (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
4378  Desc.OpInfo[Src0Idx].RegClass ==
4379  AMDGPU::VReg_64_Align2RegClassID)))) &&
4381  ErrInfo = "Invalid dpp_ctrl value: "
4382  "64 bit dpp only support row_newbcast";
4383  return false;
4384  }
4385  }
4386 
4387  if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
4388  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4389  uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
4390  : AMDGPU::OpName::vdata;
4391  const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
4392  const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
4393  if (Data && !Data->isReg())
4394  Data = nullptr;
4395 
4396  if (ST.hasGFX90AInsts()) {
4397  if (Dst && Data &&
4398  (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
4399  ErrInfo = "Invalid register class: "
4400  "vdata and vdst should be both VGPR or AGPR";
4401  return false;
4402  }
4403  if (Data && Data2 &&
4404  (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
4405  ErrInfo = "Invalid register class: "
4406  "both data operands should be VGPR or AGPR";
4407  return false;
4408  }
4409  } else {
4410  if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
4411  (Data && RI.isAGPR(MRI, Data->getReg())) ||
4412  (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
4413  ErrInfo = "Invalid register class: "
4414  "agpr loads and stores not supported on this GPU";
4415  return false;
4416  }
4417  }
4418  }
4419 
4420  if (ST.needsAlignedVGPRs() &&
4421  (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
4422  MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
4423  MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
4424  const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
4425  Register Reg = Op->getReg();
4426  bool Aligned = true;
4427  if (Reg.isPhysical()) {
4428  Aligned = !(RI.getHWRegIndex(Reg) & 1);
4429  } else {
4430  const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
4431  Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
4432  !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
4433  }
4434 
4435  if (!Aligned) {
4436  ErrInfo = "Subtarget requires even aligned vector registers "
4437  "for DS_GWS instructions";
4438  return false;
4439  }
4440  }
4441 
4442  return true;
4443 }
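
// Note: this hook is driven by the generic MachineVerifier (e.g. under
// -verify-machineinstrs or expensive-checks builds); returning false together
// with ErrInfo is what produces the "Bad machine code" diagnostics for the
// messages set above.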
4444 
4445 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
4446  switch (MI.getOpcode()) {
4447  default: return AMDGPU::INSTRUCTION_LIST_END;
4448  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
4449  case AMDGPU::COPY: return AMDGPU::COPY;
4450  case AMDGPU::PHI: return AMDGPU::PHI;
4451  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
4452  case AMDGPU::WQM: return AMDGPU::WQM;
4453  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
4454  case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
4455  case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
4456  case AMDGPU::S_MOV_B32: {
4457  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4458  return MI.getOperand(1).isReg() ||
4459  RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
4460  AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
4461  }
4462  case AMDGPU::S_ADD_I32:
4463  return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
4464  case AMDGPU::S_ADDC_U32:
4465  return AMDGPU::V_ADDC_U32_e32;
4466  case AMDGPU::S_SUB_I32:
4467  return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
4468  // FIXME: These are not consistently handled, and selected when the carry is
4469  // used.
4470  case AMDGPU::S_ADD_U32:
4471  return AMDGPU::V_ADD_CO_U32_e32;
4472  case AMDGPU::S_SUB_U32:
4473  return AMDGPU::V_SUB_CO_U32_e32;
4474  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
4475  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
4476  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
4477  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
4478  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
4479  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
4480  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
4481  case AMDGPU::S_XNOR_B32:
4482  return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
4483  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
4484  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
4485  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
4486  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
4487  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
4488  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
4489  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
4490  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
4491  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
4492  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
4493  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
4494  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
4495  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
4496  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
4497  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
4498  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
4499  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
4500  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
4501  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
4502  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
4503  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
4504  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
4505  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
4506  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
4507  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
4508  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
4509  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
4510  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
4511  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
4512  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
4513  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
4514  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
4515  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
4516  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
4517  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
4518  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
4519  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
4520  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
4521  }
4522  llvm_unreachable(
4523  "Unexpected scalar opcode without corresponding vector one!");
4524 }
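
// Example: when a scalar instruction ends up needing VGPR operands (see
// moveToVALU further down in this file), S_AND_B32 is rewritten using the
// V_AND_B32_e64 opcode returned here, while opcodes with no direct VALU
// equivalent map to INSTRUCTION_LIST_END and are handled specially.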
4525 
4526 static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
4527  const MachineRegisterInfo &MRI,
4528  const MCInstrDesc &TID,
4529  unsigned RCID,
4530  bool IsAllocatable) {
4531  if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4532  (TID.mayLoad() || TID.mayStore() ||
4533  (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
4534  switch (RCID) {
4535  case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
4536  case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
4537  case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
4538  case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
4539  case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
4540  default:
4541  break;
4542  }
4543  }
4544  return RCID;
4545 }
4546 
4547 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
4548  unsigned OpNum, const TargetRegisterInfo *TRI,
4549  const MachineFunction &MF)
4550  const {
4551  if (OpNum >= TID.getNumOperands())
4552  return nullptr;
4553  auto RegClass = TID.OpInfo[OpNum].RegClass;
4554  bool IsAllocatable = false;
4555  if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
4556  // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
4557  // with two data operands. Request a register class constrained to VGPR only
4558  // if both operands are present, as Machine Copy Propagation cannot check this
4559  // constraint and possibly other passes too.
4560  //
4561  // The check is limited to FLAT and DS because atomics in non-flat encoding
4562  // have their vdst and vdata tied to be the same register.
4563  const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4564  AMDGPU::OpName::vdst);
4565  const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4566  (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
4567  : AMDGPU::OpName::vdata);
4568  if (DataIdx != -1) {
4569  IsAllocatable = VDstIdx != -1 ||
4570  AMDGPU::getNamedOperandIdx(TID.Opcode,
4571  AMDGPU::OpName::data1) != -1;
4572  }
4573  }
4574  RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
4575  IsAllocatable);
4576  return RI.getRegClass(RegClass);
4577 }
4578 
4579 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
4580  unsigned OpNo) const {
4581  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4582  const MCInstrDesc &Desc = get(MI.getOpcode());
4583  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
4584  Desc.OpInfo[OpNo].RegClass == -1) {
4585  Register Reg = MI.getOperand(OpNo).getReg();
4586 
4587  if (Reg.isVirtual())
4588  return MRI.getRegClass(Reg);
4589  return RI.getPhysRegClass(Reg);
4590  }
4591 
4592  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
4593  RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
4594  return RI.getRegClass(RCID);
4595 }
4596 
4597 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
4598  MachineBasicBlock::iterator I = MI;
4599  MachineBasicBlock *MBB = MI.getParent();
4600  MachineOperand &MO = MI.getOperand(OpIdx);
4601  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4602  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
4603  const TargetRegisterClass *RC = RI.getRegClass(RCID);
4604  unsigned Size = RI.getRegSizeInBits(*RC);
4605  unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
4606  if (MO.isReg())
4607  Opcode = AMDGPU::COPY;
4608  else if (RI.isSGPRClass(RC))
4609  Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
4610 
4611  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
4612  const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
4613  if (RI.getCommonSubClass(VRC64, VRC))
4614  VRC = VRC64;
4615  else
4616  VRC = &AMDGPU::VGPR_32RegClass;
4617 
4618  Register Reg = MRI.createVirtualRegister(VRC);
4619  DebugLoc DL = MBB->findDebugLoc(I);
4620  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
4621  MO.ChangeToRegister(Reg, false);
4622 }
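
// In other words (informally): when an operand is not legal in its current
// slot, the value is first materialized or copied into a freshly created
// virtual register with a mov/copy of the appropriate width, and the original
// operand is then rewritten to read that register.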
4623 
4624 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
4625  MachineRegisterInfo &MRI,
4626  MachineOperand &SuperReg,
4627  const TargetRegisterClass *SuperRC,
4628  unsigned SubIdx,
4629  const TargetRegisterClass *SubRC)
4630  const {
4631  MachineBasicBlock *MBB = MI->getParent();
4632  DebugLoc DL = MI->getDebugLoc();
4633  Register SubReg = MRI.createVirtualRegister(SubRC);
4634 
4635  if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
4636  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4637  .addReg(SuperReg.getReg(), 0, SubIdx);
4638  return SubReg;
4639  }
4640 
4641  // Just in case the super register is itself a sub-register, copy it to a new
4642  // value so we don't need to worry about merging its subreg index with the
4643  // SubIdx passed to this function. The register coalescer should be able to
4644  // eliminate this extra copy.
4645  Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
4646 
4647  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
4648  .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
4649 
4650  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4651  .addReg(NewSuperReg, 0, SubIdx);
4652 
4653  return SubReg;
4654 }
4655 
4656 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
4657  MachineBasicBlock::iterator MII,
4658  MachineRegisterInfo &MRI,
4659  MachineOperand &Op,
4660  const TargetRegisterClass *SuperRC,
4661  unsigned SubIdx,
4662  const TargetRegisterClass *SubRC) const {
4663  if (Op.isImm()) {
4664  if (SubIdx == AMDGPU::sub0)
4665  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
4666  if (SubIdx == AMDGPU::sub1)
4667  return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
4668 
4669  llvm_unreachable("Unhandled register index for immediate");
4670  }
4671 
4672  unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
4673  SubIdx, SubRC);
4674  return MachineOperand::CreateReg(SubReg, false);
4675 }
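
// Worked example for the immediate path: for Op.getImm() == 0x0000000100000002
// the sub0 half is CreateImm(0x00000002) and the sub1 half is
// CreateImm(0x00000001), i.e. the 64-bit value is split into its low and high
// 32-bit words.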
4676 
4677 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
4678 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
4679  assert(Inst.getNumExplicitOperands() == 3);
4680  MachineOperand Op1 = Inst.getOperand(1);
4681  Inst.RemoveOperand(1);
4682  Inst.addOperand(Op1);
4683 }
4684 
4685 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
4686  const MCOperandInfo &OpInfo,
4687  const MachineOperand &MO) const {
4688  if (!MO.isReg())
4689  return false;
4690 
4691  Register Reg = MO.getReg();
4692 
4693  const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
4694  if (Reg.isPhysical())
4695  return DRC->contains(Reg);
4696 
4697  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
4698 
4699  if (MO.getSubReg()) {
4700  const MachineFunction *MF = MO.getParent()->getParent()->getParent();
4701  const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
4702  if (!SuperRC)
4703  return false;
4704 
4705  DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
4706  if (!DRC)
4707  return false;
4708  }
4709  return RC->hasSuperClassEq(DRC);
4710 }
4711 
4712 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
4713  const MCOperandInfo &OpInfo,
4714  const MachineOperand &MO) const {
4715  if (MO.isReg())
4716  return isLegalRegOperand(MRI, OpInfo, MO);
4717 
4718  // Handle non-register types that are treated like immediates.
4719  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4720  return true;
4721 }
4722 
4723 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
4724  const MachineOperand *MO) const {
4725  const MachineFunction &MF = *MI.getParent()->getParent();
4726  const MachineRegisterInfo &MRI = MF.getRegInfo();
4727  const MCInstrDesc &InstDesc = MI.getDesc();
4728  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
4729  const TargetRegisterClass *DefinedRC =
4730  OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
4731  if (!MO)
4732  MO = &MI.getOperand(OpIdx);
4733 
4734  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
4735  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
4736  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
4737  if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
4738  return false;
4739 
4740  SmallDenseSet<RegSubRegPair> SGPRsUsed;
4741  if (MO->isReg())
4742  SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
4743 
4744  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4745  if (i == OpIdx)
4746  continue;
4747  const MachineOperand &Op = MI.getOperand(i);
4748  if (Op.isReg()) {
4749  RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
4750  if (!SGPRsUsed.count(SGPR) &&
4751  usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
4752  if (--ConstantBusLimit <= 0)
4753  return false;
4754  SGPRsUsed.insert(SGPR);
4755  }
4756  } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
4757  if (--ConstantBusLimit <= 0)
4758  return false;
4759  } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
4760  isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
4761  if (!VOP3LiteralLimit--)
4762  return false;
4763  if (--ConstantBusLimit <= 0)
4764  return false;
4765  }
4766  }
4767  }
4768 
4769  if (MO->isReg()) {
4770  assert(DefinedRC);
4771  if (!isLegalRegOperand(MRI, OpInfo, *MO))
4772  return false;
4773  bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
4774  if (IsAGPR && !ST.hasMAIInsts())
4775  return false;
4776  unsigned Opc = MI.getOpcode();
4777  if (IsAGPR &&
4778  (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4779  (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
4780  return false;
4781  // Atomics should have both vdst and vdata either vgpr or agpr.
4782  const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
4783  const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
4784