//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
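/// As an illustrative sketch only (wave64; the register names are
/// placeholders, not what the pass actually allocates), a pixel shader that
/// samples a texture and then stores the result is rewritten to:
///
///   S_MOV_B64 LiveMask, EXEC            ; prolog: remember the live lanes
///   S_WQM_B64 EXEC, EXEC                ; enter WQM for the sample
///   IMAGE_SAMPLE ...                    ; needs WQM (derivatives)
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask    ; drop helper lanes for the store
///   BUFFER_STORE_DWORD ...              ; side effect, must run Exact
///   S_MOV_B64 EXEC, Tmp                 ; restore WQM afterwards
///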
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};
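
// The State* values are bit flags so that a plain 'char' can carry a set of
// states: e.g. processBlock() below starts each instruction with
// Needs = StateWQM | StateExact ("either is acceptable") and narrows it,
// and membership in either strict mode is a single mask test:
//
//   bool IsStrict = (Needs & StateStrict) != 0; // StrictWWM or StrictWQM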

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {

  static const std::pair<char, const char *> Mapping[] = {
      std::make_pair(StateWQM, "WQM"),
      std::make_pair(StateStrictWWM, "StrictWWM"),
      std::make_pair(StateStrictWQM, "StrictWQM"),
      std::make_pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
#endif

struct InstrInfo {
  char Needs = 0;    // States this instruction must run in.
  char Disabled = 0; // States this instruction must never run in.
  char OutNeeds = 0; // States required after this instruction.
};

struct BlockInfo {
  char Needs = 0;        // States required somewhere in this block.
  char InNeeds = 0;      // States required on entry to this block.
  char OutNeeds = 0;     // States required on exit from this block.
  char InitialState = 0; // State chosen at the top of this block.
  bool NeedsLowering = false; // Block contains kills/demotes to lower.
};
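
// Illustrative note: Needs/InNeeds/OutNeeds form a small backward dataflow
// problem. If a block contains an image sample, it ends up with
// Needs |= StateWQM and InNeeds |= StateWQM, and propagateBlock() then folds
// InNeeds into every predecessor's OutNeeds so each predecessor exits in WQM.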

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  unsigned AndOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned OrSaveExecOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (!Visited.count(Key)) {
      Visited.insert(Key);
      // On first visit to a phi, start processing at its first predecessor.
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, add the phi to the stack.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->operands()) {
          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any lanes of the use.
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi.
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
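
// An illustrative walk of the loop above: for a use of a virtual register
// whose reaching value is a phi on a loop header, the phi's predecessors are
// explored depth-first, with the phi parked on PhiStack until every incoming
// chain has either covered all of UseLanes or hit a (value, lanes-so-far)
// pair already recorded in Visited.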

void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
         ++RegUnit) {
      LiveRange &LR = LIS->getRegUnit(*RegUnit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (!Value)
        continue;

      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;
    markOperand(MI, Use, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        if (!ST->hasExtendedImageInsts())
          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and it
        // also needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WQM) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
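
// The worklist above runs to a fixed point: markInstruction() re-queues any
// instruction whose needs grew, propagateInstruction() pushes flags into the
// defs of its operands and into the previous instruction's OutNeeds, and
// propagateBlock() pushes block-level needs across CFG edges. For example, a
// single image sample inside a loop is enough to make every block of the
// loop, and everything reaching it, enter in WQM.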

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}

MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(AMDGPU::VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
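
// Worked example of the inversion above: a kill whose "live" condition is
// SETOLT(src0, src1), i.e. keep lanes where src0 < src1 (ordered), is lowered
// to V_CMP_NGT_F32(src1, src0) = !(src1 > src0). That is true exactly for the
// killed lanes, including NaN inputs, where the ordered "live" comparison is
// false and the lane must therefore be killed.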

MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote: deactivate quads with only helper lanes.
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kills deactivate lanes
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
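
// E.g. a dynamic demote (SI_DEMOTE_I1 %cond, 1) reached in WQM becomes,
// roughly (sketch only; the real code also updates LiveIntervals as above):
//   LiveMask = LiveMask & ~%cond   ; S_ANDN2: drop the killed lanes
//   SI_EARLY_TERMINATE_SCC0        ; end the wave if no lanes remain
//   WQMMask  = S_WQM(LiveMask)     ; re-derive helper-lane coverage
//   EXEC     = EXEC & WQMMask      ; keep quads with a live lane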

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  char State = BI.InitialState;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  while (II != IE) {
    auto Next = std::next(II);
    MachineInstr &MI = *II;

    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);

    II = Next;
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateStrictWWM;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
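
// Putting processBlock() together: State starts as BI.InitialState, each
// instruction narrows Needs, and on a mismatch the transition is inserted at
// a point chosen by prepareInsertion() between FirstWQM/FirstStrict and the
// current instruction. E.g. for a block of the form
//   image_sample (WQM) ... buffer_store_dword (Exact) ... image_sample (WQM)
// this carves an Exact window around the store and re-enters WQM after it.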

void SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->RemoveOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has
      // an undef input, so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    default:
      continue;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  LiveMaskReg = Exec;

  // If the shader is simple, it does not need any state changes or any
  // complex lowering.
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty())
    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));

  return true;
}
llvm::LaneBitmask
Definition: LaneBitmask.h:40
llvm::ISD::SETUGE
@ SETUGE
Definition: ISDOpcodes.h:1368
llvm::MachineBasicBlock::succ_size
unsigned succ_size() const
Definition: MachineBasicBlock.h:344
llvm::ISD::SETLE
@ SETLE
Definition: ISDOpcodes.h:1379
llvm::ISD::SETO
@ SETO
Definition: ISDOpcodes.h:1364
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:316
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:102
MachineInstr.h
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:491
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
Reg
unsigned Reg
Definition: MachineSink.cpp:1566
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::TailPredication::Disabled
@ Disabled
Definition: ARMTargetTransformInfo.h:43
UseMI
MachineInstrBuilder & UseMI
Definition: AArch64ExpandPseudoInsts.cpp:102
llvm::ISD::SETGT
@ SETGT
Definition: ISDOpcodes.h:1376
llvm::ISD::SETNE
@ SETNE
Definition: ISDOpcodes.h:1380
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition: MachineRegisterInfo.cpp:158
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:52
llvm::MCRegister::from
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:66
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:224
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::ISD::SETEQ
@ SETEQ
Definition: ISDOpcodes.h:1375
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:119
MapVector.h
llvm::LiveRange::Segment
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:162
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::createSIWholeQuadModePass
FunctionPass * createSIWholeQuadModePass()
Definition: SIWholeQuadMode.cpp:267
MachineBasicBlock.h
llvm::ISD::SETULE
@ SETULE
Definition: ISDOpcodes.h:1370
Instructions
Code Generation Notes for reduce the size of the ISel and reduce repetition in the implementation In a small number of this can cause even when no optimisation has taken place Instructions
Definition: MSA.txt:11
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
llvm::MachineFunctionProperties::Property::IsSSA
@ IsSSA
llvm::Function::hasFnAttribute
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:355
llvm::MachineFunctionProperties
Properties which a MachineFunction may have at a given point in time.
Definition: MachineFunction.h:111
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
splitBlock
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT)
Definition: SILateBranchLowering.cpp:98
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::RegState::Define
@ Define
Register definition.
Definition: MachineInstrBuilder.h:44
llvm::ISD::SETOEQ
@ SETOEQ
Definition: ISDOpcodes.h:1358
llvm::LiveQueryResult
Result of a LiveRange query.
Definition: LiveInterval.h:90
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1567
llvm::ISD::SETUEQ
@ SETUEQ
Definition: ISDOpcodes.h:1366
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:102
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MachineFunction::front
const MachineBasicBlock & front() const
Definition: MachineFunction.h:816
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:632
llvm::MachineBasicBlock::remove
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
Definition: MachineBasicBlock.h:930
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:146
llvm::TargetRegisterInfo::getSubRegIndexLaneMask
LaneBitmask getSubRegIndexLaneMask(unsigned SubIdx) const
Return a bitmask representing the parts of a register that are covered by SubIdx.
Definition: TargetRegisterInfo.h:375
GCNSubtarget.h
llvm::ISD::SETGE
@ SETGE
Definition: ISDOpcodes.h:1377
llvm::LaneBitmask::getNone
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:83
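A short sketch of how lane masks are typically combined (TRI and SubIdx are assumed to be in scope, as elsewhere on this page):
  LaneBitmask Mask = LaneBitmask::getNone();
  Mask |= TRI->getSubRegIndexLaneMask(SubIdx); // lanes covered by the subregister
  if (Mask.any()) {
    // At least one lane bit is set.
  }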
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::SIInstrFlags::WQM
@ WQM
Definition: SIDefines.h:63
llvm::AMDGPU::PALMD::Key
Key
PAL metadata keys.
Definition: AMDGPUMetadata.h:481
llvm::AMDGPU::getVOPe32
LLVM_READONLY int getVOPe32(uint16_t Opcode)
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:49
llvm::MachineFunctionProperties::set
MachineFunctionProperties & set(Property P)
Definition: MachineFunction.h:169
llvm::MCID::Flag
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:146
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::SlotIndexes
SlotIndexes pass.
Definition: SlotIndexes.h:314
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:29
llvm::ISD::SETOLT
@ SETOLT
Definition: ISDOpcodes.h:1361
llvm::LiveQueryResult::valueIn
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
Definition: LiveInterval.h:105
llvm::SlotIndex
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:83
llvm::ISD::SETOLE
@ SETOLE
Definition: ISDOpcodes.h:1362
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:95
llvm::ISD::SETUGT
@ SETUGT
Definition: ISDOpcodes.h:1367
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::MachineRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
Definition: MachineRegisterInfo.h:634
llvm::ISD::SETUNE
@ SETUNE
Definition: ISDOpcodes.h:1371
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:622
llvm::MachineOperand::isUndef
bool isUndef() const
Definition: MachineOperand.h:395
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
llvm::LiveRange::getVNInfoBefore
VNInfo * getVNInfoBefore(SlotIndex Idx) const
getVNInfoBefore - Return the VNInfo that is live up to but not necessarily including Idx, or NULL.
Definition: LiveInterval.h:421
AMDGPUMCTargetDesc.h
llvm::MachineBasicBlock::pred_end
pred_iterator pred_end()
Definition: MachineBasicBlock.h:318
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:321
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:64
LiveIntervals.h
llvm::LiveRange
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::DenseMap
Definition: DenseMap.h:714
llvm::MachineFunction::dump
void dump() const
dump - Print the current MachineFunction to cerr, useful for debugger use.
Definition: MachineFunction.cpp:536
llvm::LaneBitmask::any
constexpr bool any() const
Definition: LaneBitmask.h:53
llvm::MachineBasicBlock::getFirstNonPHI
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: MachineBasicBlock.cpp:200
llvm::LiveRange::Query
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
Definition: LiveInterval.h:533
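A sketch of a liveness query in the style this pass uses (LIS is an assumed LiveIntervals pointer, MI a MachineInstr, and Reg a virtual register):
  SlotIndex Idx = LIS->getInstructionIndex(MI);
  const LiveRange &LR = LIS->getInterval(Reg);
  LiveQueryResult Q = LR.Query(Idx);
  if (VNInfo *VNI = Q.valueIn()) {
    // Reg carries the value numbered by VNI into MI.
  }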
MachineFunctionPass.h
llvm::MachineFunction::getName
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Definition: MachineFunction.cpp:541
llvm::ISD::SETOGT
@ SETOGT
Definition: ISDOpcodes.h:1359
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::MachineBasicBlock::succ_begin
succ_iterator succ_begin()
Definition: MachineBasicBlock.h:332
llvm::ISD::SETULT
@ SETULT
Definition: ISDOpcodes.h:1369
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
MachinePostDominators.h
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:360
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:349
llvm::MachineFunction
Definition: MachineFunction.h:230
llvm::AMDGPU::CPol::SCC
@ SCC
Definition: SIDefines.h:285
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1554
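A sketch of the range form (NeedsWQM is a hypothetical predicate; MBB is assumed in scope):
  bool AnySuccNeedsWQM = llvm::any_of(MBB.successors(),
      [&](const MachineBasicBlock *Succ) { return NeedsWQM(Succ); });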
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:355
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:58
llvm::MachineBasicBlock::rbegin
reverse_iterator rbegin()
Definition: MachineBasicBlock.h:272
AMDGPU.h
MBBI
MachineBasicBlock MachineBasicBlock::iterator MBBI
Definition: AArch64SLSHardening.cpp:75
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:489
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::DominatorTreeBase
Core dominator tree base class.
Definition: LoopInfo.h:65
llvm::SmallSet::insert
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:180
llvm::SIWholeQuadModeID
char & SIWholeQuadModeID
Definition: SIWholeQuadMode.cpp:265
llvm::MachinePostDominatorTree
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-dominator tree.
Definition: MachinePostDominators.h:27
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
CallingConv.h
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::ISD::SETLT
@ SETLT
Definition: ISDOpcodes.h:1378
llvm::MachineRegisterInfo::getMaxLaneMaskForVReg
LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual register Reg.
Definition: MachineRegisterInfo.cpp:493
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:588
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::ISD::SETUO
@ SETUO
Definition: ISDOpcodes.h:1365
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false)
llvm::LiveIntervals
Definition: LiveIntervals.h:54
llvm::VNInfo
VNInfo - Value Number Information.
Definition: LiveInterval.h:53
llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:290
DEBUG_TYPE
#define DEBUG_TYPE
Definition: SIWholeQuadMode.cpp:87
llvm::SIInstrInfo
Definition: SIInstrInfo.h:38
llvm::MCRegUnitIterator
Definition: MCRegisterInfo.h:677
llvm::ISD::SETOGE
@ SETOGE
Definition: ISDOpcodes.h:1360
PostOrderIterator.h
llvm::MachineBasicBlock::begin
iterator begin()
Definition: MachineBasicBlock.h:268
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:328
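For illustration, a sketch of the exact-mode save/restore pattern described in the overview at the top of this file, written with BuildMI (TII, SaveReg, LiveMaskReg, and the iterators Before/After are assumptions):
  // S_AND_SAVEEXEC_B64 SaveReg, LiveMaskReg -- save EXEC, then AND it with the live mask
  BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveReg)
      .addReg(LiveMaskReg);
  // ... Exact instructions ...
  // S_MOV_B64 EXEC, SaveReg -- restore the saved exec mask
  BuildMI(MBB, After, DebugLoc(), TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(SaveReg, RegState::Kill);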
llvm::MCRegisterInfo::DiffListIterator::isValid
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
Definition: MCRegisterInfo.h:224
llvm::LiveRange::getSegmentContaining
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:400
llvm::MachineBasicBlock::empty
bool empty() const
Definition: MachineBasicBlock.h:240
llvm::MachineInstr::setDesc
void setDesc(const MCInstrDesc &tid)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
Definition: MachineInstr.h:1741
llvm::ISD::SETONE
@ SETONE
Definition: ISDOpcodes.h:1363
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
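A sketch of the usual getAnalysisUsage override combining addRequired and addPreserved; the exact set of analyses shown is an assumption, though it matches what a liveness-driven MachineFunctionPass typically requests:
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();           // must run before this pass
    AU.addPreserved<SlotIndexes>();            // still valid afterwards
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU); // required by the base class
  }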
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
raw_ostream.h
llvm::MachineDominatorTree
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: MachineDominators.h:45
llvm::MachineInstrBundleIterator< MachineInstr >
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:270
llvm::LaneBitmask::getAll
static constexpr LaneBitmask getAll()
Definition: LaneBitmask.h:84
SubReg
unsigned SubReg
Definition: AArch64AdvSIMDScalarPass.cpp:104
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
MachineDominators.h
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:908
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38