//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov, the
// whole sequence is reverted.
//
// $old = ...
// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
//                            dpp_controls..., $row_mask, $bank_mask,
//                            $bound_ctrl
// $res = VALU $dpp_value [, src1]
//
// to
//
// $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
//                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
//
// Combining rules:
//
// if $row_mask and $bank_mask are fully enabled (0xF) and
//    $bound_ctrl==DPP_BOUND_ZERO or $old==0
// -> $combined_old = undef,
//    $combined_bound_ctrl = DPP_BOUND_ZERO
//
// if the VALU op is binary and
//    $bound_ctrl==DPP_BOUND_OFF and
//    $old==identity value (immediate) for the VALU op
// -> $combined_old = src1,
//    $combined_bound_ctrl = DPP_BOUND_OFF
//
// Otherwise cancel.
//
// The mov_dpp instruction should reside in the same BB as all its uses.
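//
// For example, under the first rule ($old == 0 with full row/bank masks; an
// illustrative sketch, not a test case taken from the source):
//
//   $old = V_MOV_B32_e32 0
//   $dpp_value = V_MOV_B32_dpp $old, $src0, dpp_controls...,
//                0xF, 0xF, DPP_BOUND_OFF
//   $res = V_ADD_U32_e32 $dpp_value, $src1
//
// becomes
//
//   $res = V_ADD_U32_dpp undef, $src0, $src1, dpp_controls...,
//          0xF, 0xF, DPP_BOUND_ZERO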
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");

namespace {

class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const GCNSubtarget *ST;

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd, bool CombBCZ,
                              bool IsShrinkable) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }

private:
  int getDPPOp(unsigned Op, bool IsShrinkable) const;
  bool isShrinkable(MachineInstr &MI) const;
};

} // end anonymous namespace

INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}

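// An instruction is shrinkable when its VOP3 form can be rewritten in the
// 32-bit (e32) encoding, which getDPPOp below needs in order to pick the
// 32-bit DPP opcode: an e32 equivalent must exist, any carry-out (sdst) must
// be unused, and only abs/neg source modifiers may be present.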
bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  if (!TII->isVOP3(Op)) {
    return false;
  }
  if (!TII->hasVALU32BitEncoding(Op)) {
    LLVM_DEBUG(dbgs() << "  Inst hasn't e32 equivalent\n");
    return false;
  }
  // Do not shrink True16 instructions pre-RA to avoid the restriction in
  // register allocation from only being able to use 128 VGPRs.
  if (AMDGPU::isTrue16Inst(Op))
    return false;
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    // Give up if there are any uses of the sdst in carry-out or VOPC.
    // The shrunken form of the instruction would write it to vcc instead of
    // to a virtual register. If we rewrote the uses the shrinking would be
    // possible.
    if (!MRI->use_nodbg_empty(SDst->getReg()))
      return false;
  }
  // Check whether modifiers other than abs|neg are set (opsel, for example).
  const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
    LLVM_DEBUG(dbgs() << "  Inst has non-default modifiers\n");
    return false;
  }
  return true;
}

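// Maps Op to its DPP variant: first the 32-bit DPP form (shrinking to the e32
// opcode first when IsShrinkable), then the VOP3 DPP form on subtargets that
// have it; returns -1 if neither form has a valid MC opcode.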
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
  int DPP32 = AMDGPU::getDPPOp32(Op);
  if (IsShrinkable) {
    assert(DPP32 == -1);
    int E32 = AMDGPU::getVOPe32(Op);
    DPP32 = (E32 == -1) ? -1 : AMDGPU::getDPPOp32(E32);
  }
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
    return DPP32;
  int DPP64 = -1;
  if (ST->hasVOP3DPP())
    DPP64 = AMDGPU::getDPPOp64(Op);
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
    return DPP64;
  return -1;
}

// Tracks the register operand definition and returns:
// 1. immediate operand used to initialize the register if found
// 2. nullptr if the register operand is undef
// 3. the operand itself otherwise
MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
  auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
  if (!Def)
    return nullptr;

  switch (Def->getOpcode()) {
  default:
    break;
  case AMDGPU::IMPLICIT_DEF:
    return nullptr;
  case AMDGPU::COPY:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
    if (Op1.isImm())
      return &Op1;
    break;
  }
  }
  return &OldOpnd;
}

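// Builds the combined DPP instruction in place of OrigMI, copying OrigMI's
// explicit operands and MovMI's DPP control operands; returns nullptr (and
// erases the partially built instruction) if any operand turns out to be
// illegal for the DPP opcode.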
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ,
                                           bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }
  int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
  // Prior checks happen to cover the masks-with-VOPC case, though not by
  // design, so assert it explicitly here.
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  (void)MaskAllLanes;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, OrigMI.getDebugLoc(),
                         TII->get(DPPOp))
                     .setMIFlags(OrigMI.getFlags());

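  // The operand list is built inside a single-iteration do/while so that any
  // failed legality check can break out to the shared cleanup below instead
  // of duplicating the erase-and-return path.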
  bool Fail = false;
  do {
    int NumOperands = 0;
    if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
      DPPInst.add(*Dst);
      ++NumOperands;
    }
    if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
      if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
        DPPInst.add(*SDst);
        ++NumOperands;
      }
      // If we shrunk a 64-bit vop3b to 32 bits, just ignore the sdst.
    }

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(
          CombOldVGPR,
          *MRI->getRegClass(
              TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
          *MRI));
      auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
      DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
                     CombOldVGPR.SubReg);
      ++NumOperands;
    } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                      TII->isVOPC(OrigOpE32))) {
      // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
      // because they write to SGPRs not VGPRs.
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later.
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp,
                                        AMDGPU::OpName::src0_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src0_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp,
                                        AMDGPU::OpName::src1_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    } else if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::src1_modifiers)) {
      DPPInst.addImm(0);
      ++NumOperands;
    }
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Src1) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }
    if (auto *Mod2 =
            TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
      assert(NumOperands ==
             AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
      assert(HasVOP3DPP ||
             (0LL == (Mod2->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))));
      DPPInst.addImm(Mod2->getImm());
      ++NumOperands;
    }
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    if (Src2) {
      if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
          !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
      ++NumOperands;
    }
    if (HasVOP3DPP) {
      auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
      if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
        DPPInst.addImm(ClampOpr->getImm());
      }
      auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
      if (VdstInOpr &&
          AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in)) {
        DPPInst.add(*VdstInOpr);
      }
      auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
      if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod)) {
        DPPInst.addImm(OmodOpr->getImm());
      }
      // Validate that OP_SEL is all 0 and OP_SEL_HI is all 1.
      if (auto *OpSelOpr =
              TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
        auto OpSel = OpSelOpr->getImm();
        if (OpSel != 0) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel must be zero\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel))
          DPPInst.addImm(OpSel);
      }
      if (auto *OpSelHiOpr =
              TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
        auto OpSelHi = OpSelHiOpr->getImm();
        // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
        // the bitmask for 3 op_sel_hi bits set.
        assert(Src2 && "Expected vop3p with 3 operands");
        if (OpSelHi != 7) {
          LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
          Fail = true;
          break;
        }
        if (AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::op_sel_hi))
          DPPInst.addImm(OpSelHi);
      }
      auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
      if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo)) {
        DPPInst.addImm(NegOpr->getImm());
      }
      auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
      if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
        DPPInst.addImm(NegHiOpr->getImm());
      }
    }
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}

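// Returns true if the immediate is the identity element for OrigMIOp: 0 for
// add/subrev/or/xor/umax, all-ones for and/umin, INT32_MAX for smin,
// INT32_MIN for smax, and 1 for the 24-bit multiplies. Substituting src1 for
// $old is then value-preserving on lanes whose DPP source is invalid.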
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  assert(OldOpnd->isImm());
  switch (OrigMIOp) {
  default:
    break;
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
      return true;
    break;
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
      return true;
    break;
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
      return true;
    break;
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
      return true;
    break;
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
      return true;
    break;
  }
  return false;
}

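// This overload implements the second combining rule from the file header:
// with bound_ctrl off and $old an identity immediate for OrigMI, src1 can
// serve as the combined old value. It then defers to the builder overload
// above.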
MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
  assert(CombOldVGPR.Reg);
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      return nullptr;
    }
    if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
      return nullptr;
    }
    CombOldVGPR = getRegSubRegPair(*Src1);
    auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
    const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg());
    if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) {
      LLVM_DEBUG(dbgs() << "  failed: src1 has wrong register class\n");
      return nullptr;
    }
  }
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
}

// Returns true if MI has no OpndName immediate operand, or if the operand's
// value, masked by Mask, equals Value.
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  if (!Imm)
    return true;

  assert(Imm->isImm());
  return (Imm->getImm() & Mask) == Value;
}

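// Top-level handler for one DPP mov: checks that the mov itself is legal to
// combine, derives the combined old value and bound_ctrl per the rules in the
// file header, then tries to rewrite every use; if any use cannot be
// rewritten, the newly created instructions are erased and the original
// sequence is kept.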
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move writes physreg\n");
    return false;
  }
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
    assert(DppCtrl && DppCtrl->isImm());
    if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) {
      LLVM_DEBUG(dbgs() << "  failed: 64 bit dpp move uses unsupported"
                           " control value\n");
      // Let the mov be split; the control may then become legal.
      return false;
    }
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(OldOpnd && OldOpnd->isReg());
  assert(SrcOpnd && SrcOpnd->isReg());
  if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
    LLVM_DEBUG(dbgs() << "  failed: dpp move reads physreg\n");
    return false;
  }

  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF), an immediate, or something
  // else. We could use assert(!OldOpndValue || OldOpndValue->isImm()), but
  // the third option is used to distinguish undef from non-immediate so the
  // IMPLICIT_DEF instruction can be reused later.
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

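  // Decide whether the combined instruction can use bound_ctrl zero: with all
  // lanes enabled and either bound_ctrl already zeroing invalid reads or
  // $old == 0, disabled lanes contribute 0, so the old operand can become
  // undef. Otherwise only the identity-value rule remains, which is checked
  // later in createDPPInst.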
  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() << "  failed: old!=0 with bound_ctrl:0 and not all"
                           " lanes enabled isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
             if (!OldOpndValue)
               dbgs() << "undef";
             else
               dbgs() << *OldOpndValue;
             dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  SmallVector<MachineInstr *, 4> OrigMIs, DPPMIs;
  DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RegSeqWithOpNos;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // Try to reuse the previous old reg if it is undefined (IMPLICIT_DEF).
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  SmallVector<MachineOperand *, 16> Uses;

  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Uses.push_back(&Use);
  }

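  // Drain the worklist of uses. A REG_SEQUENCE user forwards the DPP value
  // into a subregister of a wider register, so the uses of the matching
  // subregister are pushed onto the worklist and must all combine too.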
  while (!Uses.empty()) {
    MachineOperand *Use = Uses.pop_back_val();
    Rollback = true;

    auto &OrigMI = *Use->getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    assert((TII->get(OrigOp).Size != 4 || !AMDGPU::isTrue16Inst(OrigOp)) &&
           "There should not be e32 True16 instructions pre-RA");
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      Register FwdReg = OrigMI.getOperand(0).getReg();
      unsigned FwdSubReg = 0;

      if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
        LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                             " for all uses\n");
        break;
      }

      unsigned OpNo, E = OrigMI.getNumOperands();
      for (OpNo = 1; OpNo < E; OpNo += 2) {
        if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
          FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
          break;
        }
      }

      if (!FwdSubReg)
        break;

      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
          Uses.push_back(&Op);
      }
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
      continue;
    }

    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3/3P/C\n");
      break;
    }
    if (OrigMI.modifiesRegister(AMDGPU::EXEC, ST->getRegisterInfo())) {
      LLVM_DEBUG(dbgs() << "  failed: can't combine v_cmpx\n");
      break;
    }

    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
      break;
    }

    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(
          dbgs()
          << "  " << OrigMI
          << "  failed: DPP register is used more than once per instruction\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (Use == Src0) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ, IsShrinkable)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else {
      assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted: " << *NewMI);
        if (auto *DPPInst =
                createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
                              IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    }
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

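  // If the worklist was not drained, some use failed to combine: roll back by
  // erasing the new DPP instructions. Otherwise erase the originals and mark
  // any forwarding REG_SEQUENCE operands that are now undefined.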
  Rollback |= !Uses.empty();

  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  if (!Rollback) {
    for (auto &S : RegSeqWithOpNos) {
      if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
        S.first->eraseFromParent();
        continue;
      }
      while (!S.second.empty())
        S.first->getOperand(S.second.pop_back_val()).setIsUndef();
    }
  }

  return !Rollback;
}

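// Scans each block in reverse, combining every DPP mov it can. A 64-bit DPP
// mov that cannot be combined whole is split into two 32-bit halves via
// expandMovDPP64, and each half is then tried individually.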
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (!ST->hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST->getInstrInfo();

  bool Changed = false;
  for (auto &MBB : MF) {
    for (auto &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
                 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
        if (ST->has64BitDPP() && combineDPPMov(MI)) {
          Changed = true;
          ++NumDPPMovsCombined;
        } else {
          auto Split = TII->expandMovDPP64(MI);
          for (auto *M : {Split.first, Split.second}) {
            if (M && combineDPPMov(*M))
              ++NumDPPMovsCombined;
          }
          Changed = true;
        }
      }
    }
  }
  return Changed;
}