//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may affect control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM is
/// enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
///   S_MOV_B64 Tmp, EXEC
///   S_WQM_B64 EXEC, EXEC
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
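/// As a rough sketch (wave64, with a hypothetical pixel-shader snippet), a
/// program of the form
///
///   %p = image_sample ...      ; needs WQM for implicit derivatives
///   buffer_store %p, ...       ; side effect, must run Exact
///
/// is rewritten along the lines of
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///   %p = image_sample ...
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   buffer_store %p, ...
///   S_MOV_B64 EXEC, Tmp
///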
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};
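
// Note: the state constants above are bitmask flags; the analysis below
// combines them into sets, e.g. a Needs value of (StateWQM | StateExact)
// means that either state is acceptable for an instruction.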

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {

  static const std::pair<char, const char *> Mapping[] = {
      std::make_pair(StateWQM, "WQM"),
      std::make_pair(StateStrictWWM, "StrictWWM"),
      std::make_pair(StateStrictWQM, "StrictWQM"),
      std::make_pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;

      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
#endif

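// Per-instruction analysis result. Needs is the set of states the instruction
// requires, Disabled the states it must never execute in, and OutNeeds the
// states required immediately after it (propagated backwards from its uses).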
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

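// Per-block analysis result. InNeeds/OutNeeds are the states required at block
// entry/exit, InitialState records the state chosen at the block's start, and
// NeedsLowering marks blocks containing kill/demote pseudos to be lowered.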
struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
  char InitialState = 0;
  bool NeedsLowering = false;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  unsigned AndOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned OrSaveExecOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);

public:
  static char ID;

  SIWholeQuadMode() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
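  // For example, if the use reads %0 but the reaching defs are
  // "%0.sub0 = ..." and "%0.sub1 = ...", both defs are visited and marked,
  // since DefinedLanes only covers UseLanes once the two are combined.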
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On first visit to a phi then start processing first predecessor
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, add the phi to the stack
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->operands()) {
          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any lanes of the use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}

void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
         ++RegUnit) {
      LiveRange &LR = LIS->getRegUnit(*RegUnit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (!Value)
        continue;

      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;
    markOperand(MI, Use, Flag, Worklist);
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        if (!ST->hasExtendedImageInsts())
          continue;
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (!HasImplicitDerivatives)
          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad; they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and
        // furthermore it needs to be executed in WQM or Exact so that its
        // copy doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as implicit strict-wqm, its sources
          // need to be shuffled in strict wqm, but the export itself needs to
          // run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.push_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
        continue;
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD) {
        // Mark these STRICT_WQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
        InstrInfo &II = Instructions[&MI];
        II.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Take a copy to prevent dangling references.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}

MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes). This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.
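  //
  // For example, a kill with SETOLT (lanes live when Op0 < Op1) becomes
  // V_CMP_NGT_F32 VCC, Op1, Op0, so VCC holds !(Op0 < Op1), i.e. exactly the
  // killed lanes.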

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}

MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote - deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill - deactivate lanes no longer in live mask
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}

// Replace (or supplement) instructions accessing live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  char State = BI.InitialState;

  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after instruction scan to simplify iteration.
  if (!SplitPoints.empty()) {
    MachineBasicBlock *BB = &MBB;
    for (MachineInstr *MI : SplitPoints) {
      BB = splitBlock(BB, MI);
    }
  }
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;

  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;

  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 Exec)
             .addReg(SavedOrig);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 Exec)
             .addReg(SavedOrig);
  }
  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}

void SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec (like all VALU movs
      // should do).
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->removeOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineBasicBlock *MBB = MI->getParent();
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
    default:
      continue;
    }
    if (SplitPoint)
      splitBlock(MBB, SplitPoint);
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump());

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  LiveMaskReg = Exec;

  // The shader is simple and does not need any state changes or any complex
  // lowering.
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return true;
}
llvm::LaneBitmask
Definition: LaneBitmask.h:40
llvm::ISD::SETUGE
@ SETUGE
Definition: ISDOpcodes.h:1437
llvm::MachineBasicBlock::succ_size
unsigned succ_size() const
Definition: MachineBasicBlock.h:381
llvm::ISD::SETLE
@ SETLE
Definition: ISDOpcodes.h:1448
llvm::ISD::SETO
@ SETO
Definition: ISDOpcodes.h:1433
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:353
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:108
MachineInstr.h
LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:492
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::TailPredication::Disabled
@ Disabled
Definition: ARMTargetTransformInfo.h:43
UseMI
MachineInstrBuilder & UseMI
Definition: AArch64ExpandPseudoInsts.cpp:105
llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:53
llvm::ISD::SETGT
@ SETGT
Definition: ISDOpcodes.h:1445
llvm::ISD::SETNE
@ SETNE
Definition: ISDOpcodes.h:1449
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition: MachineRegisterInfo.cpp:156
Insert
Vector Rotate Left Mask Mask Insert
Definition: README_P9.txt:112
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:50
llvm::MCRegister::from
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:67
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:224
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1182
llvm::ISD::SETEQ
@ SETEQ
Definition: ISDOpcodes.h:1444
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:116
llvm::X86Disassembler::Reg
Reg
All possible values of the reg field in the ModR/M byte.
Definition: X86DisassemblerDecoder.h:462
MapVector.h
llvm::LiveRange::Segment
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:162
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::createSIWholeQuadModePass
FunctionPass * createSIWholeQuadModePass()
Definition: SIWholeQuadMode.cpp:267
MachineBasicBlock.h
llvm::ISD::SETULE
@ SETULE
Definition: ISDOpcodes.h:1439
Instructions
Code Generation Notes for reduce the size of the ISel and reduce repetition in the implementation In a small number of this can cause even when no optimisation has taken place Instructions
Definition: MSA.txt:11
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:136
llvm::MachineFunctionProperties::Property::IsSSA
@ IsSSA
llvm::MachineFunctionProperties
Properties which a MachineFunction may have at a given point in time.
Definition: MachineFunction.h:127
llvm::MapVector
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
splitBlock
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT)
Definition: SILateBranchLowering.cpp:104
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::ISD::SETOEQ
@ SETOEQ
Definition: ISDOpcodes.h:1427
llvm::LiveQueryResult
Result of a LiveRange query.
Definition: LiveInterval.h:90
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1628
llvm::ISD::SETUEQ
@ SETUEQ
Definition: ISDOpcodes.h:1435
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:167
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::SIInstrFlags::WQM
@ WQM
Definition: SIDefines.h:77
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MachineFunction::front
const MachineBasicBlock & front() const
Definition: MachineFunction.h:865
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:666
llvm::MachineBasicBlock::remove
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
Definition: MachineBasicBlock.h:989
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:146
llvm::TargetRegisterInfo::getSubRegIndexLaneMask
LaneBitmask getSubRegIndexLaneMask(unsigned SubIdx) const
Return a bitmask representing the parts of a register that are covered by SubIdx.
Definition: TargetRegisterInfo.h:381
GCNSubtarget.h
llvm::ISD::SETGE
@ SETGE
Definition: ISDOpcodes.h:1446
llvm::LaneBitmask::getNone
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:83
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::AMDGPU::PALMD::Key
Key
PAL metadata keys.
Definition: AMDGPUMetadata.h:486
llvm::AMDGPU::getVOPe32
LLVM_READONLY int getVOPe32(uint16_t Opcode)
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
llvm::MachineFunctionProperties::set
MachineFunctionProperties & set(Property P)
Definition: MachineFunction.h:196
llvm::MCID::Flag
Flag
These should be considered private to the implementation of the MCInstrDesc class.
Definition: MCInstrDesc.h:147
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::SlotIndexes
SlotIndexes pass.
Definition: SlotIndexes.h:319
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:30
llvm::ISD::SETOLT
@ SETOLT
Definition: ISDOpcodes.h:1430
llvm::LiveQueryResult::valueIn
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
Definition: LiveInterval.h:105
llvm::SlotIndex
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:82
llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
llvm::ISD::SETOLE
@ SETOLE
Definition: ISDOpcodes.h:1431
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
llvm::ISD::SETUGT
@ SETUGT
Definition: ISDOpcodes.h:1436
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::ARM_PROC::IE
@ IE
Definition: ARMBaseInfo.h:27
llvm::ISD::SETUNE
@ SETUNE
Definition: ISDOpcodes.h:1440
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:656
llvm::Function::hasFnAttribute
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:628
llvm::MachineOperand::isUndef
bool isUndef() const
Definition: MachineOperand.h:394
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:165
llvm::LiveRange::getVNInfoBefore
VNInfo * getVNInfoBefore(SlotIndex Idx) const
getVNInfoBefore - Return the VNInfo that is live up to but not necessarily including Idx,...
Definition: LiveInterval.h:429
AMDGPUMCTargetDesc.h
llvm::MachineBasicBlock::pred_end
pred_iterator pred_end()
Definition: MachineBasicBlock.h:355
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:320
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
LiveIntervals.h
llvm::Function::getCallingConv
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:238
llvm::LiveRange
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::DenseMap
Definition: DenseMap.h:714
llvm::codeview::FrameCookieKind::Copy
@ Copy
llvm::MachineFunction::dump
void dump() const
dump - Print the current MachineFunction to cerr, useful for debugger use.
Definition: MachineFunction.cpp:562
llvm::LaneBitmask::any
constexpr bool any() const
Definition: LaneBitmask.h:53
llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:596
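A sketch of the erase-while-iterating pattern this enables (stripKills and the KILL predicate are illustrative): the iterator is advanced before the body runs, so deleting the current instruction cannot invalidate the traversal.

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstr.h"

  // Remove KILL pseudo-instructions from a block while walking it.
  static void stripKills(llvm::MachineBasicBlock &MBB) {
    for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
      if (MI.isKill())
        MI.eraseFromParent();
  }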
llvm::MachineBasicBlock::getFirstNonPHI
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: MachineBasicBlock.cpp:196
llvm::LiveRange::Query
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
Definition: LiveInterval.h:541
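A sketch of a liveness query, assuming a LiveIntervals analysis is at hand (liveInAt is an illustrative helper): map the instruction to its SlotIndex, then ask the register's live range what value, if any, is live-in there.

  #include "llvm/CodeGen/LiveIntervals.h"
  #include "llvm/CodeGen/MachineInstr.h"
  #include "llvm/CodeGen/Register.h"

  // True if virtual register Reg carries a live value into MI.
  static bool liveInAt(llvm::LiveIntervals &LIS, llvm::Register Reg,
                       const llvm::MachineInstr &MI) {
    llvm::SlotIndex Idx = LIS.getInstructionIndex(MI);
    return LIS.getInterval(Reg).Query(Idx).valueIn() != nullptr;
  }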
MachineFunctionPass.h
llvm::RegState::Define
@ Define
Register definition.
Definition: MachineInstrBuilder.h:44
llvm::MachineFunction::getName
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Definition: MachineFunction.cpp:567
llvm::ISD::SETOGT
@ SETOGT
Definition: ISDOpcodes.h:1428
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::MachineBasicBlock::succ_begin
succ_iterator succ_begin()
Definition: MachineBasicBlock.h:369
llvm::ISD::SETULT
@ SETULT
Definition: ISDOpcodes.h:1438
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::IndexedInstrProf::HashT::Last
@ Last
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
MachinePostDominators.h
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:359
llvm::MachineBasicBlock::predecessors
iterator_range< pred_iterator > predecessors()
Definition: MachineBasicBlock.h:386
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::AMDGPU::CPol::SCC
@ SCC
Definition: SIDefines.h:307
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1597
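A one-line sketch (hasSelfLoop is an illustrative name): the range form pairs naturally with block accessors such as successors().

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"

  // Does this block list itself among its successors?
  static bool hasSelfLoop(llvm::MachineBasicBlock &MBB) {
    return llvm::any_of(MBB.successors(),
                        [&MBB](const llvm::MachineBasicBlock *S) {
                          return S == &MBB;
                        });
  }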
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:392
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
llvm::MachineBasicBlock::rbegin
reverse_iterator rbegin()
Definition: MachineBasicBlock.h:309
AMDGPU.h
MBBI
MachineBasicBlock MachineBasicBlock::iterator MBBI
Definition: AArch64SLSHardening.cpp:75
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:516
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::DominatorTreeBase
Core dominator tree base class.
Definition: LoopInfo.h:65
llvm::SIWholeQuadModeID
char & SIWholeQuadModeID
Definition: SIWholeQuadMode.cpp:265
llvm::MachinePostDominatorTree
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
Definition: MachinePostDominators.h:27
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
CallingConv.h
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::ISD::SETLT
@ SETLT
Definition: ISDOpcodes.h:1447
llvm::MachineRegisterInfo::getMaxLaneMaskForVReg
LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
Definition: MachineRegisterInfo.cpp:495
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition: MachineFunction.h:622
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::ISD::SETUO
@ SETUO
Definition: ISDOpcodes.h:1434
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false)
llvm::LiveIntervals
Definition: LiveIntervals.h:53
llvm::VNInfo
VNInfo - Value Number Information.
Definition: LiveInterval.h:53
llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:178
llvm::ReversePostOrderTraversal
Definition: PostOrderIterator.h:291
DEBUG_TYPE
#define DEBUG_TYPE
Definition: SIWholeQuadMode.cpp:87
llvm::MachineInstr::setDesc
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
Definition: MachineInstr.h:1763
llvm::SIInstrInfo
Definition: SIInstrInfo.h:44
llvm::MCRegUnitIterator
Definition: MCRegisterInfo.h:680
llvm::ISD::SETOGE
@ SETOGE
Definition: ISDOpcodes.h:1429
PostOrderIterator.h
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:357
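A hedged sketch of the builder chain (emitCopy is illustrative; the mov opcode is a parameter because the right choice is target- and wave-size-specific): BuildMI creates the instruction at the insertion point, and chained adders like addReg append operands.

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/Register.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  #include "llvm/IR/DebugLoc.h"

  // Insert "Dst = MovOpc Src" before I, mirroring how exec masks are
  // saved and restored around exact/strict regions.
  static void emitCopy(llvm::MachineBasicBlock &MBB,
                       llvm::MachineBasicBlock::iterator I,
                       const llvm::DebugLoc &DL,
                       const llvm::TargetInstrInfo &TII, unsigned MovOpc,
                       llvm::Register Dst, llvm::Register Src) {
    llvm::BuildMI(MBB, I, DL, TII.get(MovOpc), Dst).addReg(Src);
  }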
llvm::MCRegisterInfo::DiffListIterator::isValid
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
Definition: MCRegisterInfo.h:224
llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195
llvm::LiveRange::getSegmentContaining
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:408
llvm::MachineBasicBlock::empty
bool empty() const
Definition: MachineBasicBlock.h:277
llvm::ISD::SETONE
@ SETONE
Definition: ISDOpcodes.h:1432
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
raw_ostream.h
llvm::MachineDominatorTree
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
Definition: MachineDominators.h:51
llvm::MachineInstrBundleIterator
MachineBasicBlock iterator that automatically skips over MIs that are inside bundles (i....
Definition: MachineInstrBundleIterator.h:108
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:307
llvm::LaneBitmask::getAll
static constexpr LaneBitmask getAll()
Definition: LaneBitmask.h:84
SubReg
unsigned SubReg
Definition: AArch64AdvSIMDScalarPass.cpp:104
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
MachineDominators.h
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:924
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38