SIFoldOperands.cpp
1 //===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 /// \file
8 //===----------------------------------------------------------------------===//
9 //
10 
11 #include "AMDGPU.h"
12 #include "AMDGPUSubtarget.h"
13 #include "SIInstrInfo.h"
14 #include "SIMachineFunctionInfo.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "llvm/ADT/DepthFirstIterator.h"
17 #include "llvm/CodeGen/LiveIntervals.h"
18 #include "llvm/CodeGen/MachineFunctionPass.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/MachineRegisterInfo.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/raw_ostream.h"
23 #include "llvm/Target/TargetMachine.h"
24 
25 #define DEBUG_TYPE "si-fold-operands"
26 using namespace llvm;
27 
28 namespace {
29 
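// A FoldCandidate records one pending fold: the use instruction and operand
// number to rewrite, plus the value to fold (register, immediate, frame
// index, or global), discriminated by Kind. Commuted and ShrinkOpcode
// remember whether the use had to be commuted or shrunk to a 32-bit
// encoding to make the fold legal.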
30 struct FoldCandidate {
31  MachineInstr *UseMI;
32  union {
33  MachineOperand *OpToFold;
34  uint64_t ImmToFold;
35  int FrameIndexToFold;
36  };
37  int ShrinkOpcode;
38  unsigned char UseOpNo;
39  MachineOperand::MachineOperandType Kind;
40  bool Commuted;
41 
42  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
43  bool Commuted_ = false,
44  int ShrinkOp = -1) :
45  UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
46  Kind(FoldOp->getType()),
47  Commuted(Commuted_) {
48  if (FoldOp->isImm()) {
49  ImmToFold = FoldOp->getImm();
50  } else if (FoldOp->isFI()) {
51  FrameIndexToFold = FoldOp->getIndex();
52  } else {
53  assert(FoldOp->isReg() || FoldOp->isGlobal());
54  OpToFold = FoldOp;
55  }
56  }
57 
58  bool isFI() const {
59  return Kind == MachineOperand::MO_FrameIndex;
60  }
61 
62  bool isImm() const {
63  return Kind == MachineOperand::MO_Immediate;
64  }
65 
66  bool isReg() const {
67  return Kind == MachineOperand::MO_Register;
68  }
69 
70  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
71 
72  bool isCommuted() const {
73  return Commuted;
74  }
75 
76  bool needsShrink() const {
77  return ShrinkOpcode != -1;
78  }
79 
80  int getShrinkOpcode() const {
81  return ShrinkOpcode;
82  }
83 };
84 
85 class SIFoldOperands : public MachineFunctionPass {
86 public:
87  static char ID;
88  MachineRegisterInfo *MRI;
89  const SIInstrInfo *TII;
90  const SIRegisterInfo *TRI;
91  const GCNSubtarget *ST;
92  const SIMachineFunctionInfo *MFI;
93 
94  void foldOperand(MachineOperand &OpToFold,
95  MachineInstr *UseMI,
96  int UseOpIdx,
97  SmallVectorImpl<FoldCandidate> &FoldList,
98  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
99 
100  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
101 
102  const MachineOperand *isClamp(const MachineInstr &MI) const;
103  bool tryFoldClamp(MachineInstr &MI);
104 
105  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
106  bool tryFoldOMod(MachineInstr &MI);
107 
108 public:
109  SIFoldOperands() : MachineFunctionPass(ID) {
110  initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
111  }
112 
113  bool runOnMachineFunction(MachineFunction &MF) override;
114 
115  StringRef getPassName() const override { return "SI Fold Operands"; }
116 
117  void getAnalysisUsage(AnalysisUsage &AU) const override {
118  AU.setPreservesCFG();
119  MachineFunctionPass::getAnalysisUsage(AU);
120  }
121 };
122 
123 } // End anonymous namespace.
124 
125 INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
126  "SI Fold Operands", false, false)
127 
128 char SIFoldOperands::ID = 0;
129 
130 char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
131 
132 // Wrapper around isInlineConstant that understands special cases when
133 // instruction types are replaced during operand folding.
134 static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
135  const MachineInstr &UseMI,
136  unsigned OpNo,
137  const MachineOperand &OpToFold) {
138  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
139  return true;
140 
141  unsigned Opc = UseMI.getOpcode();
142  switch (Opc) {
143  case AMDGPU::V_MAC_F32_e64:
144  case AMDGPU::V_MAC_F16_e64:
145  case AMDGPU::V_FMAC_F32_e64: {
146  // Special case for mac. Since this is replaced with mad when folded into
147  // src2, we need to check the legality for the final instruction.
148  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
149  if (static_cast<int>(OpNo) == Src2Idx) {
150  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
151  bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
152 
153  unsigned Opc = IsFMA ?
154  AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
155  const MCInstrDesc &MadDesc = TII->get(Opc);
156  return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
157  }
158  return false;
159  }
160  default:
161  return false;
162  }
163 }
164 
165 // TODO: Add heuristic that the frame index might not fit in the addressing mode
166 // immediate offset to avoid materializing in loops.
167 static bool frameIndexMayFold(const SIInstrInfo *TII,
168  const MachineInstr &UseMI,
169  int OpNo,
170  const MachineOperand &OpToFold) {
171  return OpToFold.isFI() &&
172  (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
173  OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
174 }
175 
176 FunctionPass *llvm::createSIFoldOperandsPass() {
177  return new SIFoldOperands();
178 }
179 
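// Apply a previously collected FoldCandidate to its use instruction,
// rewriting the target operand in place. Returns false if the fold turns
// out to be impossible (e.g. op_sel already set on a packed operand, or VCC
// not dead for a shrunken carry-out instruction), in which case the caller
// may undo a commute.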
180 static bool updateOperand(FoldCandidate &Fold,
181  const SIInstrInfo &TII,
182  const TargetRegisterInfo &TRI,
183  const GCNSubtarget &ST) {
184  MachineInstr *MI = Fold.UseMI;
185  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
186  assert(Old.isReg());
187 
188  if (Fold.isImm()) {
189  if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
190  !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
191  AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
192  ST.hasInv2PiInlineImm())) {
193  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
194  // already set.
195  unsigned Opcode = MI->getOpcode();
196  int OpNo = MI->getOperandNo(&Old);
197  int ModIdx = -1;
198  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
199  ModIdx = AMDGPU::OpName::src0_modifiers;
200  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
201  ModIdx = AMDGPU::OpName::src1_modifiers;
202  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
203  ModIdx = AMDGPU::OpName::src2_modifiers;
204  assert(ModIdx != -1);
205  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
206  MachineOperand &Mod = MI->getOperand(ModIdx);
207  unsigned Val = Mod.getImm();
208  if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
209  return false;
210  // Only apply the following transformation if that operand requires
211  // a packed immediate.
212  switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
213  case AMDGPU::OPERAND_REG_IMM_V2FP16:
214  case AMDGPU::OPERAND_REG_IMM_V2INT16:
215  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
216  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
217  // If upper part is all zero we do not need op_sel_hi.
218  if (!isUInt<16>(Fold.ImmToFold)) {
219  if (!(Fold.ImmToFold & 0xffff)) {
220  Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
221  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
222  Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
223  return true;
224  }
225  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
226  Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
227  return true;
228  }
229  break;
230  default:
231  break;
232  }
233  }
234  }
235 
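  // Folding an immediate-like operand into a VOP3 add/sub with a carry-out
  // requires shrinking to the VOP2 encoding, which writes the carry to VCC
  // implicitly, so the fold is only done when VCC is dead here. The original
  // instruction is kept as an IMPLICIT_DEF so block iterators stay valid.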
236  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
237  MachineBasicBlock *MBB = MI->getParent();
238  auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
239  if (Liveness != MachineBasicBlock::LQR_Dead)
240  return false;
241 
242  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
243  int Op32 = Fold.getShrinkOpcode();
244  MachineOperand &Dst0 = MI->getOperand(0);
245  MachineOperand &Dst1 = MI->getOperand(1);
246  assert(Dst0.isDef() && Dst1.isDef());
247 
248  bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
249 
250  const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
251  Register NewReg0 = MRI.createVirtualRegister(Dst0RC);
252 
253  MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
254 
255  if (HaveNonDbgCarryUse) {
256  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
257  .addReg(AMDGPU::VCC, RegState::Kill);
258  }
259 
260  // Keep the old instruction around to avoid breaking iterators, but
261  // replace it with a dummy instruction to remove uses.
262  //
263  // FIXME: We should not invert how this pass looks at operands to avoid
264  // this. Should track set of foldable movs instead of looking for uses
265  // when looking at a use.
266  Dst0.setReg(NewReg0);
267  for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
268  MI->RemoveOperand(I);
269  MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
270 
271  if (Fold.isCommuted())
272  TII.commuteInstruction(*Inst32, false);
273  return true;
274  }
275 
276  assert(!Fold.needsShrink() && "not handled");
277 
278  if (Fold.isImm()) {
279  Old.ChangeToImmediate(Fold.ImmToFold);
280  return true;
281  }
282 
283  if (Fold.isGlobal()) {
284  Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
285  Fold.OpToFold->getTargetFlags());
286  return true;
287  }
288 
289  if (Fold.isFI()) {
290  Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
291  return true;
292  }
293 
294  MachineOperand *New = Fold.OpToFold;
295  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
296  Old.setIsUndef(New->isUndef());
297  return true;
298 }
299 
300 static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
301  const MachineInstr *MI) {
302  for (auto Candidate : FoldList) {
303  if (Candidate.UseMI == MI)
304  return true;
305  }
306  return false;
307 }
308 
309 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
310  MachineInstr *MI, unsigned OpNo,
311  MachineOperand *OpToFold,
312  const SIInstrInfo *TII) {
313  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
314  // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
315  unsigned Opc = MI->getOpcode();
316  if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
317  Opc == AMDGPU::V_FMAC_F32_e64) &&
318  (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
319  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
320  bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
321  unsigned NewOpc = IsFMA ?
322  AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
323 
324  // Check if changing this to a v_mad_{f16, f32} instruction will allow us
325  // to fold the operand.
326  MI->setDesc(TII->get(NewOpc));
327  bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
328  if (FoldAsMAD) {
329  MI->untieRegOperand(OpNo);
330  return true;
331  }
332  MI->setDesc(TII->get(Opc));
333  }
334 
335  // Special case for s_setreg_b32
336  if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
337  MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
338  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
339  return true;
340  }
341 
342  // If we are already folding into another operand of MI, then
343  // we can't commute the instruction, otherwise we risk making the
344  // other fold illegal.
345  if (isUseMIInFoldList(FoldList, MI))
346  return false;
347 
348  unsigned CommuteOpNo = OpNo;
349 
350  // Operand is not legal, so try to commute the instruction to
351  // see if this makes it possible to fold.
352  unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
353  unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
354  bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
355 
356  if (CanCommute) {
357  if (CommuteIdx0 == OpNo)
358  CommuteOpNo = CommuteIdx1;
359  else if (CommuteIdx1 == OpNo)
360  CommuteOpNo = CommuteIdx0;
361  }
362 
363 
364  // One of the operands might be an Imm operand, and OpNo may refer to it
365  // after the call to commuteInstruction() below. Such situations are avoided
366  // here explicitly as OpNo must be a register operand to be a candidate
367  // for memory folding.
368  if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
369  !MI->getOperand(CommuteIdx1).isReg()))
370  return false;
371 
372  if (!CanCommute ||
373  !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
374  return false;
375 
376  if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
377  if ((Opc == AMDGPU::V_ADD_I32_e64 ||
378  Opc == AMDGPU::V_SUB_I32_e64 ||
379  Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
380  (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
381  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
382 
383  // Verify the other operand is a VGPR, otherwise we would violate the
384  // constant bus restriction.
385  unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
386  MachineOperand &OtherOp = MI->getOperand(OtherIdx);
387  if (!OtherOp.isReg() ||
388  !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
389  return false;
390 
391  assert(MI->getOperand(1).isDef());
392 
393  // Make sure to get the 32-bit version of the commuted opcode.
394  unsigned MaybeCommutedOpc = MI->getOpcode();
395  int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
396 
397  FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
398  Op32));
399  return true;
400  }
401 
402  TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
403  return false;
404  }
405 
406  FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
407  return true;
408  }
409 
410  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
411  return true;
412 }
413 
414 // If the use operand doesn't care about the value, this may be an operand only
415 // used for register indexing, in which case it is unsafe to fold.
416 static bool isUseSafeToFold(const SIInstrInfo *TII,
417  const MachineInstr &MI,
418  const MachineOperand &UseMO) {
419  return !UseMO.isUndef() && !TII->isSDWA(MI);
420  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
421 }
422 
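// Try to fold an operand into an accumulator (AGPR) operand, either as an
// inline immediate directly, or by looking through a REG_SEQUENCE whose
// elements are all the same materialized immediate; only such splat
// constants are folded.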
423 static bool tryToFoldACImm(const SIInstrInfo *TII,
424  const MachineOperand &OpToFold,
425  MachineInstr *UseMI,
426  unsigned UseOpIdx,
427  SmallVectorImpl<FoldCandidate> &FoldList) {
428  const MCInstrDesc &Desc = UseMI->getDesc();
429  const MCOperandInfo *OpInfo = Desc.OpInfo;
430  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
431  return false;
432 
433  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
434  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
435  OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
436  return false;
437 
438  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
439  TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
440  UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
441  return true;
442  }
443 
444  if (!OpToFold.isReg())
445  return false;
446 
447  Register UseReg = OpToFold.getReg();
448  if (!Register::isVirtualRegister(UseReg))
449  return false;
450 
451  if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
452  return FC.UseMI == UseMI; }) != FoldList.end())
453  return false;
454 
455  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
456  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
457  if (!Def || !Def->isRegSequence())
458  return false;
459 
460  int64_t Imm;
461  MachineOperand *Op = nullptr;
462  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
463  const MachineOperand &Sub = Def->getOperand(I);
464  if (!Sub.isReg() || Sub.getSubReg())
465  return false;
466  MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
467  while (SubDef && !SubDef->isMoveImmediate() &&
468  !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
469  SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
470  if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
471  return false;
472  Op = &SubDef->getOperand(1);
473  auto SubImm = Op->getImm();
474  if (I == 1) {
475  if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
476  return false;
477 
478  Imm = SubImm;
479  continue;
480  }
481  if (Imm != SubImm)
482  return false; // Can only fold splat constants
483  }
484 
485  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
486  return false;
487 
488  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
489  return true;
490 }
491 
492 void SIFoldOperands::foldOperand(
493  MachineOperand &OpToFold,
494  MachineInstr *UseMI,
495  int UseOpIdx,
496  SmallVectorImpl<FoldCandidate> &FoldList,
497  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
498  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
499 
500  if (!isUseSafeToFold(TII, *UseMI, UseOp))
501  return;
502 
503  // FIXME: Fold operands with subregs.
504  if (UseOp.isReg() && OpToFold.isReg()) {
505  if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
506  return;
507 
508  // Don't fold subregister extracts into tied operands, unless it is a full
509  // copy, since a subregister use tied to a full register def doesn't really
510  // make sense. e.g. don't fold:
511  //
512  // %1 = COPY %0:sub1
513  // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
514  //
515  // into
516  // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
517  if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
518  return;
519  }
520 
521  // Special case for REG_SEQUENCE: We can't fold literals into
522  // REG_SEQUENCE instructions, so we have to fold them into the
523  // uses of REG_SEQUENCE.
524  if (UseMI->isRegSequence()) {
525  Register RegSeqDstReg = UseMI->getOperand(0).getReg();
526  unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
527 
528  MachineRegisterInfo::use_iterator Next;
529  for (MachineRegisterInfo::use_iterator
530  RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
531  RSUse != RSE; RSUse = Next) {
532  Next = std::next(RSUse);
533 
534  MachineInstr *RSUseMI = RSUse->getParent();
535 
536  if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
537  RSUse.getOperandNo(), FoldList))
538  continue;
539 
540  if (RSUse->getSubReg() != RegSeqDstSubReg)
541  continue;
542 
543  foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
544  CopiesToReplace);
545  }
546 
547  return;
548  }
549 
550  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
551  return;
552 
553  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
554  // Sanity check that this is a stack access.
555  // FIXME: Should probably use stack pseudos before frame lowering.
556  MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
557  if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
558  SOff->getReg() != MFI->getStackPtrOffsetReg()))
559  return;
560 
561  if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
562  MFI->getScratchRSrcReg())
563  return;
564 
565  // A frame index will resolve to a positive constant, so it should always be
566  // safe to fold the addressing mode, even pre-GFX9.
567  UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
568  SOff->setReg(MFI->getStackPtrOffsetReg());
569  return;
570  }
571 
572  bool FoldingImmLike =
573  OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
574 
575  if (FoldingImmLike && UseMI->isCopy()) {
576  Register DestReg = UseMI->getOperand(0).getReg();
577  const TargetRegisterClass *DestRC = Register::isVirtualRegister(DestReg)
578  ? MRI->getRegClass(DestReg)
579  : TRI->getPhysRegClass(DestReg);
580 
581  Register SrcReg = UseMI->getOperand(1).getReg();
582  if (Register::isVirtualRegister(DestReg) &&
583  Register::isVirtualRegister(SrcReg)) {
584  const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
585  if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
586  MachineRegisterInfo::use_iterator NextUse;
587  SmallVector<FoldCandidate, 4> CopyUses;
588  for (MachineRegisterInfo::use_iterator
589  Use = MRI->use_begin(DestReg), E = MRI->use_end();
590  Use != E; Use = NextUse) {
591  NextUse = std::next(Use);
592  FoldCandidate FC = FoldCandidate(Use->getParent(),
593  Use.getOperandNo(), &UseMI->getOperand(1));
594  CopyUses.push_back(FC);
595  }
596  for (auto & F : CopyUses) {
597  foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
598  FoldList, CopiesToReplace);
599  }
600  }
601  }
602 
603  if (DestRC == &AMDGPU::AGPR_32RegClass &&
604  TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
605  UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
606  UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
607  CopiesToReplace.push_back(UseMI);
608  return;
609  }
610 
611  // In order to fold immediates into copies, we need to change the
612  // copy to a MOV.
613 
614  unsigned MovOp = TII->getMovOpcode(DestRC);
615  if (MovOp == AMDGPU::COPY)
616  return;
617 
618  UseMI->setDesc(TII->get(MovOp));
619  MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
620  MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
621  while (ImpOpI != ImpOpE) {
622  MachineInstr::mop_iterator Tmp = ImpOpI;
623  ImpOpI++;
624  UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
625  }
626  CopiesToReplace.push_back(UseMI);
627  } else {
628  if (UseMI->isCopy() && OpToFold.isReg() &&
629  Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
630  TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
631  TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
632  !UseMI->getOperand(1).getSubReg()) {
633  unsigned Size = TII->getOpSize(*UseMI, 1);
634  UseMI->getOperand(1).setReg(OpToFold.getReg());
635  UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
636  UseMI->getOperand(1).setIsKill(false);
637  CopiesToReplace.push_back(UseMI);
638  OpToFold.setIsKill(false);
639  if (Size != 4)
640  return;
641  if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
642  TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
643  UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
644  else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
645  TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
646  UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
647  return;
648  }
649 
650  unsigned UseOpc = UseMI->getOpcode();
651  if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
652  (UseOpc == AMDGPU::V_READLANE_B32 &&
653  (int)UseOpIdx ==
654  AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
655  // %vgpr = V_MOV_B32 imm
656  // %sgpr = V_READFIRSTLANE_B32 %vgpr
657  // =>
658  // %sgpr = S_MOV_B32 imm
659  if (FoldingImmLike) {
660  if (execMayBeModifiedBeforeUse(*MRI,
661  UseMI->getOperand(UseOpIdx).getReg(),
662  *OpToFold.getParent(),
663  *UseMI))
664  return;
665 
666  UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
667 
668  // FIXME: ChangeToImmediate should clear subreg
669  UseMI->getOperand(1).setSubReg(0);
670  if (OpToFold.isImm())
671  UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
672  else
673  UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
674  UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
675  return;
676  }
677 
678  if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
679  if (execMayBeModifiedBeforeUse(*MRI,
680  UseMI->getOperand(UseOpIdx).getReg(),
681  *OpToFold.getParent(),
682  *UseMI))
683  return;
684 
685  // %vgpr = COPY %sgpr0
686  // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
687  // =>
688  // %sgpr1 = COPY %sgpr0
689  UseMI->setDesc(TII->get(AMDGPU::COPY));
690  UseMI->getOperand(1).setReg(OpToFold.getReg());
691  UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
692  UseMI->getOperand(1).setIsKill(false);
693  UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
694  return;
695  }
696  }
697 
698  const MCInstrDesc &UseDesc = UseMI->getDesc();
699 
700  // Don't fold into target independent nodes. Target independent opcodes
701  // don't have defined register classes.
702  if (UseDesc.isVariadic() ||
703  UseOp.isImplicit() ||
704  UseDesc.OpInfo[UseOpIdx].RegClass == -1)
705  return;
706  }
707 
708  if (!FoldingImmLike) {
709  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
710 
711  // FIXME: We could try to change the instruction from 64-bit to 32-bit
712  // to enable more folding opportunities. The shrink operands pass
713  // already does this.
714  return;
715  }
716 
717 
718  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
719  const TargetRegisterClass *FoldRC =
720  TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
721 
722  // Split 64-bit constants into 32-bits for folding.
723  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
724  Register UseReg = UseOp.getReg();
725  const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
726 
727  if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
728  return;
729 
730  APInt Imm(64, OpToFold.getImm());
731  if (UseOp.getSubReg() == AMDGPU::sub0) {
732  Imm = Imm.getLoBits(32);
733  } else {
734  assert(UseOp.getSubReg() == AMDGPU::sub1);
735  Imm = Imm.getHiBits(32);
736  }
737 
738  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
739  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
740  return;
741  }
742 
743 
744 
745  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
746 }
747 
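// Constant-fold a two-operand 32-bit ALU instruction when both sources are
// known immediates; used by tryConstantFoldOp below.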
748 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
749  uint32_t LHS, uint32_t RHS) {
750  switch (Opcode) {
751  case AMDGPU::V_AND_B32_e64:
752  case AMDGPU::V_AND_B32_e32:
753  case AMDGPU::S_AND_B32:
754  Result = LHS & RHS;
755  return true;
756  case AMDGPU::V_OR_B32_e64:
757  case AMDGPU::V_OR_B32_e32:
758  case AMDGPU::S_OR_B32:
759  Result = LHS | RHS;
760  return true;
761  case AMDGPU::V_XOR_B32_e64:
762  case AMDGPU::V_XOR_B32_e32:
763  case AMDGPU::S_XOR_B32:
764  Result = LHS ^ RHS;
765  return true;
766  case AMDGPU::V_LSHL_B32_e64:
767  case AMDGPU::V_LSHL_B32_e32:
768  case AMDGPU::S_LSHL_B32:
769  // The instruction ignores the high bits for out of bounds shifts.
770  Result = LHS << (RHS & 31);
771  return true;
772  case AMDGPU::V_LSHLREV_B32_e64:
773  case AMDGPU::V_LSHLREV_B32_e32:
774  Result = RHS << (LHS & 31);
775  return true;
776  case AMDGPU::V_LSHR_B32_e64:
777  case AMDGPU::V_LSHR_B32_e32:
778  case AMDGPU::S_LSHR_B32:
779  Result = LHS >> (RHS & 31);
780  return true;
781  case AMDGPU::V_LSHRREV_B32_e64:
782  case AMDGPU::V_LSHRREV_B32_e32:
783  Result = RHS >> (LHS & 31);
784  return true;
785  case AMDGPU::V_ASHR_I32_e64:
786  case AMDGPU::V_ASHR_I32_e32:
787  case AMDGPU::S_ASHR_I32:
788  Result = static_cast<int32_t>(LHS) >> (RHS & 31);
789  return true;
790  case AMDGPU::V_ASHRREV_I32_e64:
791  case AMDGPU::V_ASHRREV_I32_e32:
792  Result = static_cast<int32_t>(RHS) >> (LHS & 31);
793  return true;
794  default:
795  return false;
796  }
797 }
798 
799 static unsigned getMovOpc(bool IsScalar) {
800  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
801 }
802 
803 /// Remove any leftover implicit operands from mutating the instruction. e.g.
804 /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
805 /// anymore.
806 static void stripExtraCopyOperands(MachineInstr &MI) {
807  const MCInstrDesc &Desc = MI.getDesc();
808  unsigned NumOps = Desc.getNumOperands() +
809  Desc.getNumImplicitUses() +
810  Desc.getNumImplicitDefs();
811 
812  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
813  MI.RemoveOperand(I);
814 }
815 
816 static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
817  MI.setDesc(NewDesc);
818  stripExtraCopyOperands(MI);
819 }
820 
821 static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
822  MachineOperand &Op) {
823  if (Op.isReg()) {
824  // If this has a subregister, it obviously is a register source.
825  if (Op.getSubReg() != AMDGPU::NoSubRegister ||
826  !Register::isVirtualRegister(Op.getReg()))
827  return &Op;
828 
829  MachineInstr *Def = MRI.getVRegDef(Op.getReg());
830  if (Def && Def->isMoveImmediate()) {
831  MachineOperand &ImmSrc = Def->getOperand(1);
832  if (ImmSrc.isImm())
833  return &ImmSrc;
834  }
835  }
836 
837  return &Op;
838 }
839 
840 // Try to simplify operations with a constant that may appear after instruction
841 // selection.
842 // TODO: See if a frame index with a fixed offset can fold.
843 static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
844  const SIInstrInfo *TII,
845  MachineInstr *MI,
846  MachineOperand *ImmOp) {
847  unsigned Opc = MI->getOpcode();
848  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
849  Opc == AMDGPU::S_NOT_B32) {
850  MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
851  mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
852  return true;
853  }
854 
855  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
856  if (Src1Idx == -1)
857  return false;
858 
859  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
860  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
861  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
862 
863  if (!Src0->isImm() && !Src1->isImm())
864  return false;
865 
866  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
867  if (Src0->isImm() && Src0->getImm() == 0) {
868  // v_lshl_or_b32 0, X, Y -> copy Y
869  // v_lshl_or_b32 0, X, K -> v_mov_b32 K
870  bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
871  MI->RemoveOperand(Src1Idx);
872  MI->RemoveOperand(Src0Idx);
873 
874  MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
875  return true;
876  }
877  }
878 
879  // and k0, k1 -> v_mov_b32 (k0 & k1)
880  // or k0, k1 -> v_mov_b32 (k0 | k1)
881  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
882  if (Src0->isImm() && Src1->isImm()) {
883  int32_t NewImm;
884  if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
885  return false;
886 
887  const SIRegisterInfo &TRI = TII->getRegisterInfo();
888  bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
889 
890  // Be careful to change the right operand, src0 may belong to a different
891  // instruction.
892  MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
893  MI->RemoveOperand(Src1Idx);
894  mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
895  return true;
896  }
897 
898  if (!MI->isCommutable())
899  return false;
900 
901  if (Src0->isImm() && !Src1->isImm()) {
902  std::swap(Src0, Src1);
903  std::swap(Src0Idx, Src1Idx);
904  }
905 
906  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
907  if (Opc == AMDGPU::V_OR_B32_e64 ||
908  Opc == AMDGPU::V_OR_B32_e32 ||
909  Opc == AMDGPU::S_OR_B32) {
910  if (Src1Val == 0) {
911  // y = or x, 0 => y = copy x
912  MI->RemoveOperand(Src1Idx);
913  mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
914  } else if (Src1Val == -1) {
915  // y = or x, -1 => y = v_mov_b32 -1
916  MI->RemoveOperand(Src1Idx);
917  mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
918  } else
919  return false;
920 
921  return true;
922  }
923 
924  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
925  MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
926  MI->getOpcode() == AMDGPU::S_AND_B32) {
927  if (Src1Val == 0) {
928  // y = and x, 0 => y = v_mov_b32 0
929  MI->RemoveOperand(Src0Idx);
930  mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
931  } else if (Src1Val == -1) {
932  // y = and x, -1 => y = copy x
933  MI->RemoveOperand(Src1Idx);
934  mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
935  stripExtraCopyOperands(*MI);
936  } else
937  return false;
938 
939  return true;
940  }
941 
942  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
943  MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
944  MI->getOpcode() == AMDGPU::S_XOR_B32) {
945  if (Src1Val == 0) {
946  // y = xor x, 0 => y = copy x
947  MI->RemoveOperand(Src1Idx);
948  mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
949  return true;
950  }
951  }
952 
953  return false;
954 }
955 
956 // Try to fold an instruction into a simpler one
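// (e.g. a V_CNDMASK_B32 whose two sources are identical and unmodified is
// rewritten into a COPY or V_MOV_B32).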
957 static bool tryFoldInst(const SIInstrInfo *TII,
958  MachineInstr *MI) {
959  unsigned Opc = MI->getOpcode();
960 
961  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
962  Opc == AMDGPU::V_CNDMASK_B32_e64 ||
963  Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
964  const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
965  const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
966  int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
967  int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
968  if (Src1->isIdenticalTo(*Src0) &&
969  (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
970  (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
971  LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
972  auto &NewDesc =
973  TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
974  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
975  if (Src2Idx != -1)
976  MI->RemoveOperand(Src2Idx);
977  MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
978  if (Src1ModIdx != -1)
979  MI->RemoveOperand(Src1ModIdx);
980  if (Src0ModIdx != -1)
981  MI->RemoveOperand(Src0ModIdx);
982  mutateCopyOp(*MI, NewDesc);
983  LLVM_DEBUG(dbgs() << *MI << '\n');
984  return true;
985  }
986  }
987 
988  return false;
989 }
990 
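// Fold OpToFold (the source of a foldable copy/move defining MI's result)
// into every use of that result: immediates are constant-folded or folded
// as inline operands where legal, and the remaining candidates collected in
// FoldList are applied via updateOperand.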
991 void SIFoldOperands::foldInstOperand(MachineInstr &MI,
992  MachineOperand &OpToFold) const {
993  // We need to mutate the operands of new mov instructions to add implicit
994  // uses of EXEC, but adding them invalidates the use_iterator, so defer
995  // this.
996  SmallVector<MachineInstr *, 4> CopiesToReplace;
997  SmallVector<FoldCandidate, 4> FoldList;
998  MachineOperand &Dst = MI.getOperand(0);
999 
1000  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1001  if (FoldingImm) {
1002  unsigned NumLiteralUses = 0;
1003  MachineOperand *NonInlineUse = nullptr;
1004  int NonInlineUseOpNo = -1;
1005 
1006  MachineRegisterInfo::use_iterator NextUse;
1007  for (MachineRegisterInfo::use_iterator
1008  Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
1009  Use != E; Use = NextUse) {
1010  NextUse = std::next(Use);
1011  MachineInstr *UseMI = Use->getParent();
1012  unsigned OpNo = Use.getOperandNo();
1013 
1014  // Folding the immediate may reveal operations that can be constant
1015  // folded or replaced with a copy. This can happen for example after
1016  // frame indices are lowered to constants or from splitting 64-bit
1017  // constants.
1018  //
1019  // We may also encounter cases where one or both operands are
1020  // immediates materialized into a register, which would ordinarily not
1021  // be folded due to multiple uses or operand constraints.
1022 
1023  if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
1024  LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
1025 
1026  // Some constant folding cases change the same immediate's use to a new
1027  // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
1028  // again. The same constant folded instruction could also have a second
1029  // use operand.
1030  NextUse = MRI->use_begin(Dst.getReg());
1031  FoldList.clear();
1032  continue;
1033  }
1034 
1035  // Try to fold any inline immediate uses, and then only fold other
1036  // constants if they have one use.
1037  //
1038  // The legality of the inline immediate must be checked based on the use
1039  // operand, not the defining instruction, because 32-bit instructions
1040  // with 32-bit inline immediate sources may be used to materialize
1041  // constants used in 16-bit operands.
1042  //
1043  // e.g. it is unsafe to fold:
1044  // s_mov_b32 s0, 1.0 // materializes 0x3f800000
1045  // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
1046 
1047  // Folding immediates with more than one use will increase program size.
1048  // FIXME: This will also reduce register usage, which may be better
1049  // in some cases. A better heuristic is needed.
1050  if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
1051  foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
1052  } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
1053  foldOperand(OpToFold, UseMI, OpNo, FoldList,
1054  CopiesToReplace);
1055  } else {
1056  if (++NumLiteralUses == 1) {
1057  NonInlineUse = &*Use;
1058  NonInlineUseOpNo = OpNo;
1059  }
1060  }
1061  }
1062 
1063  if (NumLiteralUses == 1) {
1064  MachineInstr *UseMI = NonInlineUse->getParent();
1065  foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
1066  }
1067  } else {
1068  // Folding register.
1069  SmallVector<MachineRegisterInfo::use_iterator, 4> UsesToProcess;
1070  for (MachineRegisterInfo::use_iterator
1071  Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
1072  Use != E; ++Use) {
1073  UsesToProcess.push_back(Use);
1074  }
1075  for (auto U : UsesToProcess) {
1076  MachineInstr *UseMI = U->getParent();
1077 
1078  foldOperand(OpToFold, UseMI, U.getOperandNo(),
1079  FoldList, CopiesToReplace);
1080  }
1081  }
1082 
1083  MachineFunction *MF = MI.getParent()->getParent();
1084  // Make sure we add EXEC uses to any new v_mov instructions created.
1085  for (MachineInstr *Copy : CopiesToReplace)
1086  Copy->addImplicitDefUseOperands(*MF);
1087 
1088  for (FoldCandidate &Fold : FoldList) {
1089  if (updateOperand(Fold, *TII, *TRI, *ST)) {
1090  // Clear kill flags.
1091  if (Fold.isReg()) {
1092  assert(Fold.OpToFold && Fold.OpToFold->isReg());
1093  // FIXME: Probably shouldn't bother trying to fold if not an
1094  // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1095  // copies.
1096  MRI->clearKillFlags(Fold.OpToFold->getReg());
1097  }
1098  LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1099  << static_cast<int>(Fold.UseOpNo) << " of "
1100  << *Fold.UseMI << '\n');
1101  tryFoldInst(TII, Fold.UseMI);
1102  } else if (Fold.isCommuted()) {
1103  // Restoring instruction's original operand order if fold has failed.
1104  TII->commuteInstruction(*Fold.UseMI, false);
1105  }
1106  }
1107 }
1108 
1109 // Clamp patterns are canonically selected to v_max_* instructions, so only
1110 // handle them.
1111 const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1112  unsigned Op = MI.getOpcode();
1113  switch (Op) {
1114  case AMDGPU::V_MAX_F32_e64:
1115  case AMDGPU::V_MAX_F16_e64:
1116  case AMDGPU::V_MAX_F64:
1117  case AMDGPU::V_PK_MAX_F16: {
1118  if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1119  return nullptr;
1120 
1121  // Make sure sources are identical.
1122  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1123  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1124  if (!Src0->isReg() || !Src1->isReg() ||
1125  Src0->getReg() != Src1->getReg() ||
1126  Src0->getSubReg() != Src1->getSubReg() ||
1127  Src0->getSubReg() != AMDGPU::NoSubRegister)
1128  return nullptr;
1129 
1130  // Can't fold up if we have modifiers.
1131  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1132  return nullptr;
1133 
1134  unsigned Src0Mods
1135  = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1136  unsigned Src1Mods
1137  = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1138 
1139  // Having a 0 op_sel_hi would require swizzling the output in the source
1140  // instruction, which we can't do.
1141  unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1142  : 0u;
1143  if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1144  return nullptr;
1145  return Src0;
1146  }
1147  default:
1148  return nullptr;
1149  }
1150 }
1151 
1152 // We obviously have multiple uses in a clamp since the register is used twice
1153 // in the same instruction.
1154 static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
1155  int Count = 0;
1156  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
1157  I != E; ++I) {
1158  if (++Count > 1)
1159  return false;
1160  }
1161 
1162  return true;
1163 }
1164 
1165 // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1166 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1167  const MachineOperand *ClampSrc = isClamp(MI);
1168  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
1169  return false;
1170 
1171  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1172 
1173  // The type of clamp must be compatible.
1174  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1175  return false;
1176 
1177  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1178  if (!DefClamp)
1179  return false;
1180 
1181  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
1182  << '\n');
1183 
1184  // Clamp is applied after omod, so it is OK if omod is set.
1185  DefClamp->setImm(1);
1186  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1187  MI.eraseFromParent();
1188  return true;
1189 }
1190 
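// Map a multiplier immediate (0.5, 2.0 or 4.0, as an f32 or f16 bit
// pattern) onto the corresponding output-modifier encoding.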
1191 static int getOModValue(unsigned Opc, int64_t Val) {
1192  switch (Opc) {
1193  case AMDGPU::V_MUL_F32_e64: {
1194  switch (static_cast<uint32_t>(Val)) {
1195  case 0x3f000000: // 0.5
1196  return SIOutMods::DIV2;
1197  case 0x40000000: // 2.0
1198  return SIOutMods::MUL2;
1199  case 0x40800000: // 4.0
1200  return SIOutMods::MUL4;
1201  default:
1202  return SIOutMods::NONE;
1203  }
1204  }
1205  case AMDGPU::V_MUL_F16_e64: {
1206  switch (static_cast<uint16_t>(Val)) {
1207  case 0x3800: // 0.5
1208  return SIOutMods::DIV2;
1209  case 0x4000: // 2.0
1210  return SIOutMods::MUL2;
1211  case 0x4400: // 4.0
1212  return SIOutMods::MUL4;
1213  default:
1214  return SIOutMods::NONE;
1215  }
1216  }
1217  default:
1218  llvm_unreachable("invalid mul opcode");
1219  }
1220 }
1221 
1222 // FIXME: Does this really not support denormals with f16?
1223 // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1224 // handled, so will anything other than that break?
1225 std::pair<const MachineOperand *, int>
1226 SIFoldOperands::isOMod(const MachineInstr &MI) const {
1227  unsigned Op = MI.getOpcode();
1228  switch (Op) {
1229  case AMDGPU::V_MUL_F32_e64:
1230  case AMDGPU::V_MUL_F16_e64: {
1231  // If output denormals are enabled, omod is ignored.
1232  if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
1233  (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
1234  return std::make_pair(nullptr, SIOutMods::NONE);
1235 
1236  const MachineOperand *RegOp = nullptr;
1237  const MachineOperand *ImmOp = nullptr;
1238  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1239  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1240  if (Src0->isImm()) {
1241  ImmOp = Src0;
1242  RegOp = Src1;
1243  } else if (Src1->isImm()) {
1244  ImmOp = Src1;
1245  RegOp = Src0;
1246  } else
1247  return std::make_pair(nullptr, SIOutMods::NONE);
1248 
1249  int OMod = getOModValue(Op, ImmOp->getImm());
1250  if (OMod == SIOutMods::NONE ||
1251  TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1252  TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1253  TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1254  TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1255  return std::make_pair(nullptr, SIOutMods::NONE);
1256 
1257  return std::make_pair(RegOp, OMod);
1258  }
1259  case AMDGPU::V_ADD_F32_e64:
1260  case AMDGPU::V_ADD_F16_e64: {
1261  // If output denormals are enabled, omod is ignored.
1262  if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
1263  (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
1264  return std::make_pair(nullptr, SIOutMods::NONE);
1265 
1266  // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1267  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1268  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1269 
1270  if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1271  Src0->getSubReg() == Src1->getSubReg() &&
1272  !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1273  !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1274  !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1275  !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1276  return std::make_pair(Src0, SIOutMods::MUL2);
1277 
1278  return std::make_pair(nullptr, SIOutMods::NONE);
1279  }
1280  default:
1281  return std::make_pair(nullptr, SIOutMods::NONE);
1282  }
1283 }
1284 
1285 // FIXME: Does this need to check IEEE bit on function?
1286 bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1287  const MachineOperand *RegOp;
1288  int OMod;
1289  std::tie(RegOp, OMod) = isOMod(MI);
1290  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1291  RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1292  !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
1293  return false;
1294 
1295  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1296  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1297  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1298  return false;
1299 
1300  // Clamp is applied after omod. If the source already has clamp set, don't
1301  // fold it.
1302  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1303  return false;
1304 
1305  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
1306 
1307  DefOMod->setImm(OMod);
1308  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1309  MI.eraseFromParent();
1310  return true;
1311 }
1312 
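// Walk the function in depth-first block order; for every foldable copy or
// move-immediate, try to fold its source into the uses of its result, and
// independently try to fold clamp and omod modifiers into the instructions
// that define their sources.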
1313 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
1314  if (skipFunction(MF.getFunction()))
1315  return false;
1316 
1317  MRI = &MF.getRegInfo();
1318  ST = &MF.getSubtarget<GCNSubtarget>();
1319  TII = ST->getInstrInfo();
1320  TRI = &TII->getRegisterInfo();
1321  MFI = MF.getInfo<SIMachineFunctionInfo>();
1322 
1323  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
1324  // correctly handle signed zeros.
1325  //
1326  // FIXME: Also need to check strictfp
1327  bool IsIEEEMode = MFI->getMode().IEEE;
1328  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
1329 
1330  for (MachineBasicBlock *MBB : depth_first(&MF)) {
1331  MachineBasicBlock::iterator I, Next;
1332  for (I = MBB->begin(); I != MBB->end(); I = Next) {
1333  Next = std::next(I);
1334  MachineInstr &MI = *I;
1335 
1336  tryFoldInst(TII, &MI);
1337 
1338  if (!TII->isFoldableCopy(MI)) {
1339  // TODO: Omod might be OK if there is NSZ only on the source
1340  // instruction, and not the omod multiply.
1341  if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
1342  !tryFoldOMod(MI))
1343  tryFoldClamp(MI);
1344  continue;
1345  }
1346 
1347  MachineOperand &OpToFold = MI.getOperand(1);
1348  bool FoldingImm =
1349  OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1350 
1351  // FIXME: We could also be folding things like TargetIndexes.
1352  if (!FoldingImm && !OpToFold.isReg())
1353  continue;
1354 
1355  if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg()))
1356  continue;
1357 
1358  // Prevent folding operands backwards in the function. For example,
1359  // the COPY opcode must not be replaced by 1 in this example:
1360  //
1361  // %3 = COPY %vgpr0; VGPR_32:%3
1362  // ...
1363  // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1364  MachineOperand &Dst = MI.getOperand(0);
1365  if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg()))
1366  continue;
1367 
1368  foldInstOperand(MI, OpToFold);
1369  }
1370  }
1371  return false;
1372 }