AMDGPUIGroupLP.cpp
1 //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file This file defines a set of schedule DAG mutations that can be used to
10 // override default scheduler behavior to enforce specific scheduling patterns.
11 // They should be used in cases where runtime performance considerations, such as
12 // inter-wavefront interactions, mean that compile-time heuristics cannot
13 // predict the optimal instruction ordering, or in kernels where optimum
14 // instruction scheduling is important enough to warrant manual intervention.
15 //
16 //===----------------------------------------------------------------------===//
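
// Editor's note (not part of the upstream file): these mutations are normally
// driven from the kernel itself through the AMDGPU scheduling intrinsics that
// are handled further down. A hedged sketch of the IR a kernel might emit,
// using the SchedGroupMask bit values defined below:
//
//   ; only SALU instructions (mask 0x4) may be scheduled across this point
//   call void @llvm.amdgcn.sched.barrier(i32 4)
//   ; one DS_READ (mask 0x100) then one MFMA (mask 0x8), both in sync group 0
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
//   ; or ask for a canned strategy (0 == MFMASmallGemmOptID)
//   call void @llvm.amdgcn.iglp.opt(i32 0)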
17 
18 #include "AMDGPUIGroupLP.h"
19 #include "AMDGPUTargetMachine.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/BitmaskEnum.h"
24 #include "llvm/ADT/DenseMap.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "igrouplp"
31 
32 namespace {
33 
34 static cl::opt<bool> EnableExactSolver(
35  "amdgpu-igrouplp-exact-solver", cl::Hidden,
36  cl::desc("Whether to use the exponential time solver to fit "
37  "the instructions to the pipeline as closely as "
38  "possible."),
39  cl::init(false));
40 
41 static cl::opt<unsigned> CutoffForExact(
42  "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43  cl::desc("The maximum number of scheduling group conflicts "
44  "which we attempt to solve with the exponential time "
45  "exact solver. Problem sizes greater than this will"
46  "be solved by the less accurate greedy algorithm. Selecting "
47  "solver by size is superseded by manually selecting "
48  "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
49 
50 static cl::opt<uint64_t> MaxBranchesExplored(
51  "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52  cl::desc("The amount of branches that we are willing to explore with"
53  "the exact algorithm before giving up."));
54 
55 static cl::opt<bool> UseCostHeur(
56  "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57  cl::desc("Whether to use the cost heuristic to make choices as we "
58  "traverse the search space using the exact solver. Defaulted "
59  "to on, and if turned off, we will use the node order -- "
60  "attempting to put the later nodes in the later sched groups. "
61  "Experimentally, results are mixed, so this should be set on a "
62  "case-by-case basis."));
63 
64 // Components of the mask that determines which instruction types may be
65 // classified into a SchedGroup.
66 enum class SchedGroupMask {
67  NONE = 0u,
68  ALU = 1u << 0,
69  VALU = 1u << 1,
70  SALU = 1u << 2,
71  MFMA = 1u << 3,
72  VMEM = 1u << 4,
73  VMEM_READ = 1u << 5,
74  VMEM_WRITE = 1u << 6,
75  DS = 1u << 7,
76  DS_READ = 1u << 8,
77  DS_WRITE = 1u << 9,
78  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79  DS_READ | DS_WRITE,
80  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81 };
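
// Illustration added for exposition (not upstream): the masks combine with the
// bitwise operators enabled by LLVM_MARK_AS_BITMASK_ENUM, so a group accepting
// either kind of LDS access can be written as
//   SchedGroupMask M = SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE;
// i.e. the bit pattern (1u << 8) | (1u << 9) == 0x300, and membership is
// checked with (M & SchedGroupMask::DS_READ) != SchedGroupMask::NONE.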
82 
83 typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
84 
85 // Classify instructions into groups to enable fine tuned control over the
86 // scheduler. These groups may be more specific than current SchedModel
87 // instruction classes.
88 class SchedGroup {
89 private:
90  // Mask that defines which instruction types can be classified into this
91  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
92  // and SCHED_GROUP_BARRIER.
93  SchedGroupMask SGMask;
94 
95  // Maximum number of SUnits that can be added to this group.
96  Optional<unsigned> MaxSize;
97 
98  // SchedGroups will only synchronize with other SchedGroups that have the same
99  // SyncID.
100  int SyncID = 0;
101 
102  // SGID is used to map instructions to candidate SchedGroups
103  unsigned SGID;
104 
105  // Count of the number of created SchedGroups, used to initialize SGID.
106  static unsigned NumSchedGroups;
107 
108  ScheduleDAGInstrs *DAG;
109 
110  const SIInstrInfo *TII;
111 
112  // Try to add an edge from SU A to SU B.
113  bool tryAddEdge(SUnit *A, SUnit *B);
114 
115  // Use SGMask to determine whether we can classify MI as a member of this
116  // SchedGroup object.
117  bool canAddMI(const MachineInstr &MI) const;
118 
119 public:
120  // Collection of SUnits that are classified as members of this group.
121  SmallVector<SUnit *, 32> Collection;
122 
123  // Returns true if SU can be added to this SchedGroup.
124  bool canAddSU(SUnit &SU) const;
125 
126  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
127  // MakePred is true, SU will be a predecessor of the SUnits in this
128  // SchedGroup, otherwise SU will be a successor.
129  void link(SUnit &SU, bool MakePred = false);
130 
131  // Add DAG dependencies and track which edges are added, and the count of
132  // missed edges
133  int link(SUnit &SU, bool MakePred,
134  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
135 
136  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
137  // Use the predicate to determine whether SU should be a predecessor (P =
138  // true) or a successor (P = false) of this SchedGroup.
139  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
140 
141  // Add DAG dependencies such that SUnits in this group shall be ordered
142  // before SUnits in OtherGroup.
143  void link(SchedGroup &OtherGroup);
144 
145  // Returns true if no more instructions may be added to this group.
146  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
147 
148  // Add SU to the SchedGroup.
149  void add(SUnit &SU) {
150  LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
151  << format_hex((int)SGMask, 10, true) << " adding "
152  << *SU.getInstr());
153  Collection.push_back(&SU);
154  }
155 
156  // Remove last element in the SchedGroup
157  void pop() { Collection.pop_back(); }
158 
159  // Identify and add all relevant SUs from the DAG to this SchedGroup.
160  void initSchedGroup();
161 
162  // Add instructions to the SchedGroup bottom up starting from RIter.
163  // PipelineInstrs is a set of instructions that should not be added to the
164  // SchedGroup even when the other conditions for adding it are satisfied.
165  // RIter will be added to the SchedGroup as well, and dependencies will be
166  // added so that RIter will always be scheduled at the end of the group.
167  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
168  SUnitsToCandidateSGsMap &SyncedInstrs);
169 
170  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
171 
172  int getSyncID() { return SyncID; }
173 
174  int getSGID() { return SGID; }
175 
176  SchedGroupMask getMask() { return SGMask; }
177 
178  SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
179  ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
180  : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
181  SGID = NumSchedGroups++;
182  }
183 
184  SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
185  ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
186  : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
187  SGID = NumSchedGroups++;
188  }
189 };
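
// Illustration only (not upstream code): a SCHED_GROUP_BARRIER with mask 0x8,
// size 1 and sync id 0 corresponds roughly to
//   SchedGroup SG(SchedGroupMask::MFMA, 1, /*SyncID=*/0, DAG, TII);
//   SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
// which pins exactly one MFMA to this pipeline stage; the real lowering is in
// initSchedGroupBarrierPipelineStage() below.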
190 
191 // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
192 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
193  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
194  SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
195  SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
196 
197  while (!SU.Preds.empty())
198  for (auto &P : SU.Preds)
199  SU.removePred(P);
200 
201  while (!SU.Succs.empty())
202  for (auto &S : SU.Succs)
203  for (auto &SP : S.getSUnit()->Preds)
204  if (SP.getSUnit() == &SU)
205  S.getSUnit()->removePred(SP);
206 }
207 
208 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
209 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
210 
211 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
212 // in non-trivial cases. For example, if the requested pipeline is
213 // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
214 // in the DAG, then we will have an instruction that can not be trivially
215 // assigned to a SchedGroup. The PipelineSolver class implements two algorithms
216 // to find a good solution to the pipeline -- a greedy algorithm and an exact
217 // algorithm. The exact algorithm has an exponential time complexity and should
218 // only be used for small sized problems or medium sized problems where an exact
219 // solution is highly desired.
220 class PipelineSolver {
221  ScheduleDAGMI *DAG;
222 
223  // Instructions that can be assigned to multiple SchedGroups
224  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
225  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
226  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
227  // The current working pipeline
228  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
229  // The pipeline that has the best solution found so far
230  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
231 
232  // Whether or not we actually have any SyncedInstrs to try to solve.
233  bool NeedsSolver = false;
234 
235  // Compute an estimate of the size of search tree -- the true size is
236  // the product of each conflictedInst.Matches.size() across all SyncPipelines
237  unsigned computeProblemSize();
238 
239  // The cost penalty of not assigning a SU to a SchedGroup
240  int MissPenalty = 0;
241 
242  // Costs in terms of the number of edges we are unable to add
243  int BestCost = -1;
244  int CurrCost = 0;
245 
246  // Index pointing to the conflicting instruction that is currently being
247  // fitted
248  int CurrConflInstNo = 0;
249  // Index to the pipeline that is currently being fitted
250  int CurrSyncGroupIdx = 0;
251  // The first non trivial pipeline
252  int BeginSyncGroupIdx = 0;
253 
254  // How many branches we have explored
255  uint64_t BranchesExplored = 0;
256 
257  // Update indices to fit next conflicting instruction
258  void advancePosition();
259  // Recede indices to attempt to find better fit for previous conflicting
260  // instruction
261  void retreatPosition();
262 
263  // The exponential time algorithm which finds the provably best fit
264  bool solveExact();
265  // The polynomial time algorithm which attempts to find a good fit
266  bool solveGreedy();
267  // Whether or not the current solution is optimal
268  bool checkOptimal();
269  // Populate the ready list, prioritizing fewest missed edges first
270  void populateReadyList(SUToCandSGsPair &CurrSU,
271  SmallVectorImpl<std::pair<int, int>> &ReadyList,
272  SmallVectorImpl<SchedGroup> &SyncPipeline);
273  // Add edges corresponding to the SchedGroups as assigned by solver
274  void makePipeline();
275  // Add the edges from the SU to the other SchedGroups in pipeline, and
276  // return the number of edges missed.
277  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
278  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
279  // Remove the edges passed via AddedEdges
280  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
281  // Convert the passed in maps to arrays for bidirectional iterators
282  void convertSyncMapsToArrays();
283 
284  void reset();
285 
286 public:
287  // Invoke the solver to map instructions to instruction groups. A heuristic and
288  // a command-line option determine whether to use the exact or the greedy algorithm.
289  void solve();
290 
291  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
292  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
293  ScheduleDAGMI *DAG)
294  : DAG(DAG), SyncedInstrs(SyncedInstrs),
295  SyncedSchedGroups(SyncedSchedGroups) {
296 
297  for (auto &PipelineInstrs : SyncedInstrs) {
298  if (PipelineInstrs.second.size() > 0) {
299  NeedsSolver = true;
300  break;
301  }
302  }
303 
304  if (!NeedsSolver)
305  return;
306 
307  convertSyncMapsToArrays();
308 
309  CurrPipeline = BestPipeline;
310 
311  while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
312  PipelineInstrs[BeginSyncGroupIdx].size() == 0)
313  ++BeginSyncGroupIdx;
314 
315  if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
316  return;
317  }
318 };
319 
320 void PipelineSolver::reset() {
321 
322  for (auto &SyncPipeline : CurrPipeline) {
323  for (auto &SG : SyncPipeline) {
324  SmallVector<SUnit *, 32> TempCollection = SG.Collection;
325  SG.Collection.clear();
326  auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
327  return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
328  });
329  if (SchedBarr != TempCollection.end())
330  SG.Collection.push_back(*SchedBarr);
331  }
332  }
333 
334  CurrSyncGroupIdx = BeginSyncGroupIdx;
335  CurrConflInstNo = 0;
336  CurrCost = 0;
337 }
338 
339 void PipelineSolver::convertSyncMapsToArrays() {
340  for (auto &SyncPipe : SyncedSchedGroups) {
341  BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
342  }
343 
344  int PipelineIDx = SyncedInstrs.size() - 1;
345  PipelineInstrs.resize(SyncedInstrs.size());
346  for (auto &SyncInstrMap : SyncedInstrs) {
347  for (auto &SUsToCandSGs : SyncInstrMap.second) {
348  if (PipelineInstrs[PipelineIDx].size() == 0) {
349  PipelineInstrs[PipelineIDx].push_back(
350  std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
351  continue;
352  }
353  auto SortPosition = PipelineInstrs[PipelineIDx].begin();
354  // Insert them in sorted order -- this allows for good parsing order in
355  // the greedy algorithm
356  while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
357  SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
358  ++SortPosition;
359  PipelineInstrs[PipelineIDx].insert(
360  SortPosition,
361  std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
362  }
363  --PipelineIDx;
364  }
365 }
366 
367 void PipelineSolver::makePipeline() {
368  // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
369  for (auto &SyncPipeline : BestPipeline) {
370  for (auto &SG : SyncPipeline) {
371  SUnit *SGBarr = nullptr;
372  for (auto &SU : SG.Collection) {
373  if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
374  SGBarr = SU;
375  }
376  // Command line requested IGroupLP doesn't have SGBarr
377  if (!SGBarr)
378  continue;
379  resetEdges(*SGBarr, DAG);
380  SG.link(*SGBarr, false);
381  }
382  }
383 
384  for (auto &SyncPipeline : BestPipeline) {
385  auto I = SyncPipeline.rbegin();
386  auto E = SyncPipeline.rend();
387  for (; I != E; ++I) {
388  auto &GroupA = *I;
389  for (auto J = std::next(I); J != E; ++J) {
390  auto &GroupB = *J;
391  GroupA.link(GroupB);
392  }
393  }
394  }
395 }
396 
397 int PipelineSolver::addEdges(
398  SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
399  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
400  int AddedCost = 0;
401  bool MakePred = false;
402 
403  // The groups in the pipeline are in reverse order. Thus,
404  // by traversing them from last to first, we are traversing
405  // them in the order as they were introduced in the code. After we
406  // pass the group the SU is being assigned to, it should be
407  // linked as a predecessor of the subsequent SchedGroups
408  auto GroupNo = (int)SyncPipeline.size() - 1;
409  for (; GroupNo >= 0; GroupNo--) {
410  if (SyncPipeline[GroupNo].getSGID() == SGID) {
411  MakePred = true;
412  continue;
413  }
414  auto Group = &SyncPipeline[GroupNo];
415  AddedCost += Group->link(*SU, MakePred, AddedEdges);
416  assert(AddedCost >= 0);
417  }
418 
419  return AddedCost;
420 }
421 
422 void PipelineSolver::removeEdges(
423  const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
424  // Only remove the edges that we have added when testing
425  // the fit.
426  for (auto &PredSuccPair : EdgesToRemove) {
427  SUnit *Pred = PredSuccPair.first;
428  SUnit *Succ = PredSuccPair.second;
429 
430  auto Match = llvm::find_if(
431  Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
432  if (Match != Succ->Preds.end()) {
433  assert(Match->isArtificial());
434  Succ->removePred(*Match);
435  }
436  }
437 }
438 
439 void PipelineSolver::advancePosition() {
440  ++CurrConflInstNo;
441 
442  if (static_cast<size_t>(CurrConflInstNo) >=
443  PipelineInstrs[CurrSyncGroupIdx].size()) {
444  CurrConflInstNo = 0;
445  ++CurrSyncGroupIdx;
446  // Advance to next non-trivial pipeline
447  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
448  PipelineInstrs[CurrSyncGroupIdx].size() == 0)
449  ++CurrSyncGroupIdx;
450  }
451 }
452 
453 void PipelineSolver::retreatPosition() {
454  assert(CurrConflInstNo >= 0);
455  assert(CurrSyncGroupIdx >= 0);
456 
457  if (CurrConflInstNo > 0) {
458  --CurrConflInstNo;
459  return;
460  }
461 
462  if (CurrConflInstNo == 0) {
463  // If we return to the starting position, we have explored
464  // the entire tree
465  if (CurrSyncGroupIdx == BeginSyncGroupIdx)
466  return;
467 
468  --CurrSyncGroupIdx;
469  // Go to previous non-trivial pipeline
470  while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
471  --CurrSyncGroupIdx;
472 
473  CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
474  }
475 }
476 
477 bool PipelineSolver::checkOptimal() {
478  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
479  if (BestCost == -1 || CurrCost < BestCost) {
480  BestPipeline = CurrPipeline;
481  BestCost = CurrCost;
482  LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
483  }
484  assert(BestCost >= 0);
485  }
486 
487  bool DoneExploring = false;
488  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
489  DoneExploring = true;
490 
491  return (DoneExploring || BestCost == 0);
492 }
493 
494 void PipelineSolver::populateReadyList(
495  SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
496  SmallVectorImpl<SchedGroup> &SyncPipeline) {
497  assert(CurrSU.second.size() >= 1);
498  auto I = CurrSU.second.rbegin();
499  auto E = CurrSU.second.rend();
500  for (; I != E; ++I) {
501  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
502  int CandSGID = *I;
503  SchedGroup *Match;
504  for (auto &SG : SyncPipeline) {
505  if (SG.getSGID() == CandSGID)
506  Match = &SG;
507  }
508 
509  if (UseCostHeur) {
510  if (Match->isFull()) {
511  ReadyList.push_back(std::make_pair(*I, MissPenalty));
512  continue;
513  }
514 
515  int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
516  ReadyList.push_back(std::make_pair(*I, TempCost));
517  removeEdges(AddedEdges);
518  } else
519  ReadyList.push_back(std::make_pair(*I, -1));
520  }
521 
522  if (UseCostHeur) {
523  std::sort(ReadyList.begin(), ReadyList.end(),
524  [](std::pair<int, int> A, std::pair<int, int> B) {
525  return A.second < B.second;
526  });
527  }
528 
529  assert(ReadyList.size() == CurrSU.second.size());
530 }
531 
532 bool PipelineSolver::solveExact() {
533  if (checkOptimal())
534  return true;
535 
536  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
537  return false;
538 
539  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
540  assert(static_cast<size_t>(CurrConflInstNo) <
541  PipelineInstrs[CurrSyncGroupIdx].size());
542  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
543  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
544  << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
545 
546  // SchedGroup -> Cost pairs
547  SmallVector<std::pair<int, int>, 4> ReadyList;
548  // Prioritize the candidate sched groups in terms of lowest cost first
549  populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
550 
551  auto I = ReadyList.begin();
552  auto E = ReadyList.end();
553  for (; I != E; ++I) {
554  // If we are trying SGs in least cost order, and the current SG is cost
555  // infeasible, then all subsequent SGs will also be cost infeasible, so we
556  // can prune.
557  if (BestCost != -1 && (CurrCost + I->second > BestCost))
558  return false;
559 
560  int CandSGID = I->first;
561  int AddedCost = 0;
562  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
563  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
564  SchedGroup *Match;
565  for (auto &SG : SyncPipeline) {
566  if (SG.getSGID() == CandSGID)
567  Match = &SG;
568  }
569 
570  if (Match->isFull())
571  continue;
572 
573  LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
574  << (int)Match->getMask() << " and ID " << CandSGID
575  << "\n");
576  Match->add(*CurrSU.first);
577  AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
578  LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
579  CurrCost += AddedCost;
580  advancePosition();
581  ++BranchesExplored;
582  bool FinishedExploring = false;
583  // If the Cost after adding edges is greater than a known solution,
584  // backtrack
585  if (CurrCost < BestCost || BestCost == -1) {
586  if (solveExact()) {
587  FinishedExploring = BestCost != 0;
588  if (!FinishedExploring)
589  return true;
590  }
591  }
592 
593  retreatPosition();
594  CurrCost -= AddedCost;
595  removeEdges(AddedEdges);
596  Match->pop();
597  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
598  if (FinishedExploring)
599  return true;
600  }
601 
602  // Try the pipeline where the current instruction is omitted
603  // Potentially if we omit a problematic instruction from the pipeline,
604  // all the other instructions can nicely fit.
605  CurrCost += MissPenalty;
606  advancePosition();
607 
608  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
609 
610  bool FinishedExploring = false;
611  if (CurrCost < BestCost || BestCost == -1) {
612  if (solveExact()) {
613  FinishedExploring = BestCost != 0;
614  if (!FinishedExploring)
615  return true;
616  }
617  }
618 
619  retreatPosition();
620  CurrCost -= MissPenalty;
621  return FinishedExploring;
622 }
623 
624 bool PipelineSolver::solveGreedy() {
625  BestCost = 0;
626  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
627 
628  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
629  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
630  int BestNodeCost = -1;
631  int TempCost;
632  SchedGroup *BestGroup = nullptr;
633  int BestGroupID = -1;
634  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
635  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
636  << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
637 
638  // Since we have added the potential SchedGroups from bottom up, but
639  // traversed the DAG from top down, parse over the groups from last to
640  // first. If we fail to do this for the greedy algorithm, the solution will
641  // likely not be good in more complex cases.
642  auto I = CurrSU.second.rbegin();
643  auto E = CurrSU.second.rend();
644  for (; I != E; ++I) {
645  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
646  int CandSGID = *I;
647  SchedGroup *Match;
648  for (auto &SG : SyncPipeline) {
649  if (SG.getSGID() == CandSGID)
650  Match = &SG;
651  }
652 
653  LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
654  << (int)Match->getMask() << "\n");
655 
656  if (Match->isFull()) {
657  LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
658  continue;
659  }
660  TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
661  LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
662  if (TempCost < BestNodeCost || BestNodeCost == -1) {
663  BestGroup = Match;
664  BestNodeCost = TempCost;
665  BestGroupID = CandSGID;
666  }
667  removeEdges(AddedEdges);
668  if (BestNodeCost == 0)
669  break;
670  }
671 
672  if (BestGroupID != -1) {
673  BestGroup->add(*CurrSU.first);
674  addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
675  LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
676  << (int)BestGroup->getMask() << "\n");
677  BestCost += TempCost;
678  } else
679  BestCost += MissPenalty;
680 
681  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
682  advancePosition();
683  }
684  BestPipeline = CurrPipeline;
685  removeEdges(AddedEdges);
686  return false;
687 }
688 
689 unsigned PipelineSolver::computeProblemSize() {
690  unsigned ProblemSize = 0;
691  for (auto &PipeConflicts : PipelineInstrs) {
692  ProblemSize += PipeConflicts.size();
693  }
694 
695  return ProblemSize;
696 }
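
// Worked example (added commentary, not upstream): with two synced pipelines
// holding 2 and 3 conflicted instructions, computeProblemSize() returns
// 2 + 3 = 5, which is what amdgpu-igrouplp-exact-solver-cutoff is compared
// against; the tree walked by solveExact() instead grows with the product of
// candidate-group counts per instruction (e.g. 2*2*3*3*3 branches), which is
// why the exact solver is gated behind a small cutoff by default.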
697 
698 void PipelineSolver::solve() {
699  if (!NeedsSolver)
700  return;
701 
702  unsigned ProblemSize = computeProblemSize();
703  assert(ProblemSize > 0);
704 
705  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
706  MissPenalty = (ProblemSize / 2) + 1;
707 
708  LLVM_DEBUG(DAG->dump());
709  if (EnableExactSolver || BelowCutoff) {
710  LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
711  solveGreedy();
712  reset();
713  LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
714  if (BestCost > 0) {
715  LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
716  solveExact();
717  LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
718  }
719  } else { // Use the Greedy Algorithm by default
720  LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
721  solveGreedy();
722  }
723 
724  makePipeline();
725 }
726 
727 enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
728 
729 // Implement an IGLP scheduling strategy.
730 class IGLPStrategy {
731 protected:
732  ScheduleDAGInstrs *DAG;
733 
734  const SIInstrInfo *TII;
735 
736 public:
737  // Add SchedGroups to \p Pipeline to implement this Strategy.
738  virtual void applyIGLPStrategy(
739  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
740  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
741 
742  // Returns true if this strategy should be applied to a ScheduleDAG.
743  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
744 
745  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
746  : DAG(DAG), TII(TII) {}
747 
748  virtual ~IGLPStrategy() = default;
749 };
750 
751 class MFMASmallGemmOpt final : public IGLPStrategy {
752 public:
753  void applyIGLPStrategy(
754  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
755  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
756 
757  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
758 
759  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
760  : IGLPStrategy(DAG, TII) {}
761 };
762 
763 void MFMASmallGemmOpt::applyIGLPStrategy(
764  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
765  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
766  // Count the number of MFMA instructions.
767  unsigned MFMACount = 0;
768  for (const MachineInstr &I : *DAG)
769  if (TII->isMFMA(I))
770  ++MFMACount;
771 
772  const unsigned PipelineSyncID = 0;
773  SchedGroup *SG = nullptr;
774  for (unsigned I = 0; I < MFMACount; ++I) {
775  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
776  SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
777  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
778 
779  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
780  SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
781  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
782 
783  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
784  SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
785  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
786 
787  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
788  SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
789  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
790 
791  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
792  SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
793  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
794  }
795 
796  for (unsigned I = 0; I < MFMACount; ++I) {
797  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
798  SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
799  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
800 
801  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
802  SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
803  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
804 
805  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
806  SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
807  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
808 
809  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
810  SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
811  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
812  }
813 }
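
// Added commentary (not upstream): for MFMACount == 2 the strategy above asks
// the solver for a repeating single-instruction pattern in sync group 0,
// roughly
//   DS_READ, VMEM_READ, MFMA, VMEM_WRITE, DS_WRITE   (x2)
//   DS_READ, VMEM_READ, VMEM_WRITE, DS_WRITE         (x2)
// interleaving LDS and global-memory traffic around every MFMA.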
814 
815 static std::unique_ptr<IGLPStrategy>
816 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
817  const SIInstrInfo *TII) {
818  switch (ID) {
819  case MFMASmallGemmOptID:
820  return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
821  }
822 
823  llvm_unreachable("Unknown IGLPStrategyID");
824 }
825 
826 class IGroupLPDAGMutation : public ScheduleDAGMutation {
827 private:
828  const SIInstrInfo *TII;
829 
830  ScheduleDAGMI *DAG;
831 
832  // Organize lists of SchedGroups by their SyncID. SchedGroups /
833  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
834  // between them.
835  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
836 
837  // Used to track instructions that can be mapped to multiple sched groups
838  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
839 
840  // Add DAG edges that enforce SCHED_BARRIER ordering.
841  void addSchedBarrierEdges(SUnit &SU);
842 
843  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
844  // not be reordered across the SCHED_BARRIER. This is used for the base
845  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
846  // SCHED_BARRIER will always block all instructions that can be classified
847  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
848  // and may only synchronize with some SchedGroups. Returns the inverse of
849  // Mask. SCHED_BARRIER's mask describes which instruction types should be
850  // allowed to be scheduled across it. Invert the mask to get the
851  // SchedGroupMask of instructions that should be barred.
852  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
853 
854  // Create SchedGroups for a SCHED_GROUP_BARRIER.
855  void initSchedGroupBarrierPipelineStage(
856  std::vector<SUnit>::reverse_iterator RIter);
857 
858  void initIGLPOpt(SUnit &SU);
859 
860 public:
861  void apply(ScheduleDAGInstrs *DAGInstrs) override;
862 
863  IGroupLPDAGMutation() = default;
864 };
865 
866 unsigned SchedGroup::NumSchedGroups = 0;
867 
868 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
869  if (A != B && DAG->canAddEdge(B, A)) {
870  DAG->addEdge(B, SDep(A, SDep::Artificial));
871  return true;
872  }
873  return false;
874 }
875 
876 bool SchedGroup::canAddMI(const MachineInstr &MI) const {
877  bool Result = false;
878  if (MI.isMetaInstruction())
879  Result = false;
880 
881  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
882  (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
883  Result = true;
884 
885  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
886  TII->isVALU(MI) && !TII->isMFMA(MI))
887  Result = true;
888 
889  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
890  TII->isSALU(MI))
891  Result = true;
892 
893  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
894  TII->isMFMA(MI))
895  Result = true;
896 
897  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
898  (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
899  Result = true;
900 
901  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
902  MI.mayLoad() &&
903  (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
904  Result = true;
905 
906  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
907  MI.mayStore() &&
908  (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
909  Result = true;
910 
911  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
912  TII->isDS(MI))
913  Result = true;
914 
915  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
916  MI.mayLoad() && TII->isDS(MI))
917  Result = true;
918 
919  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
920  MI.mayStore() && TII->isDS(MI))
921  Result = true;
922 
923  LLVM_DEBUG(
924  dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
925  << (Result ? " could classify " : " unable to classify ") << MI);
926 
927  return Result;
928 }
929 
930 int SchedGroup::link(SUnit &SU, bool MakePred,
931  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
932  int MissedEdges = 0;
933  for (auto *A : Collection) {
934  SUnit *B = &SU;
935  if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
936  continue;
937  if (MakePred)
938  std::swap(A, B);
939 
940  if (DAG->IsReachable(B, A))
941  continue;
942  // tryAddEdge returns false if there is a dependency that makes adding
943  // the A->B edge impossible, otherwise it returns true;
944  bool Added = tryAddEdge(A, B);
945  if (Added)
946  AddedEdges.push_back(std::make_pair(A, B));
947  else
948  ++MissedEdges;
949  }
950 
951  return MissedEdges;
952 }
953 
954 void SchedGroup::link(SUnit &SU, bool MakePred) {
955  for (auto *A : Collection) {
956  SUnit *B = &SU;
957  if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
958  continue;
959  if (MakePred)
960  std::swap(A, B);
961 
962  tryAddEdge(A, B);
963  }
964 }
965 
966 void SchedGroup::link(SUnit &SU,
967  function_ref<bool(const SUnit *A, const SUnit *B)> P) {
968  for (auto *A : Collection) {
969  SUnit *B = &SU;
970  if (P(A, B))
971  std::swap(A, B);
972 
973  tryAddEdge(A, B);
974  }
975 }
976 
977 void SchedGroup::link(SchedGroup &OtherGroup) {
978  for (auto *B : OtherGroup.Collection)
979  link(*B);
980 }
981 
982 bool SchedGroup::canAddSU(SUnit &SU) const {
983  MachineInstr &MI = *SU.getInstr();
984  if (MI.getOpcode() != TargetOpcode::BUNDLE)
985  return canAddMI(MI);
986 
987  // Special case for bundled MIs.
988  const MachineBasicBlock *MBB = MI.getParent();
989  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
990  while (E != MBB->end() && E->isBundledWithPred())
991  ++E;
992 
993  // Return true if all of the bundled MIs can be added to this group.
994  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
995 }
996 
997 void SchedGroup::initSchedGroup() {
998  for (auto &SU : DAG->SUnits) {
999  if (isFull())
1000  break;
1001 
1002  if (canAddSU(SU))
1003  add(SU);
1004  }
1005 }
1006 
1007 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
1008  SUnitsToCandidateSGsMap &SyncedInstrs) {
1009  SUnit &InitSU = *RIter;
1010  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
1011  auto &SU = *RIter;
1012  if (isFull())
1013  break;
1014 
1015  if (canAddSU(SU))
1016  SyncedInstrs[&SU].push_back(SGID);
1017  }
1018 
1019  add(InitSU);
1020  assert(MaxSize);
1021  (*MaxSize)++;
1022 }
1023 
1024 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
1025  auto I = DAG->SUnits.rbegin();
1026  auto E = DAG->SUnits.rend();
1027  for (; I != E; ++I) {
1028  auto &SU = *I;
1029  if (isFull())
1030  break;
1031 
1032  if (canAddSU(SU))
1033  SyncedInstrs[&SU].push_back(SGID);
1034  }
1035 }
1036 
1037 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
1038  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1039  if (!TSchedModel || DAGInstrs->SUnits.empty())
1040  return;
1041 
1042  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
1043  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1044  TII = ST.getInstrInfo();
1045  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1046  SyncedSchedGroups.clear();
1047  SyncedInstrs.clear();
1048  bool foundSB = false;
1049  bool foundIGLP = false;
1050  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
1051  unsigned Opc = R->getInstr()->getOpcode();
1052  // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1053  if (Opc == AMDGPU::SCHED_BARRIER) {
1054  addSchedBarrierEdges(*R);
1055  foundSB = true;
1056  } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1057  initSchedGroupBarrierPipelineStage(R);
1058  foundSB = true;
1059  } else if (Opc == AMDGPU::IGLP_OPT) {
1060  resetEdges(*R, DAG);
1061  if (!foundSB && !foundIGLP)
1062  initIGLPOpt(*R);
1063  foundIGLP = true;
1064  }
1065  }
1066 
1067  if (foundSB || foundIGLP) {
1068  PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
1069  // PipelineSolver performs the mutation by adding the edges it
1070  // determined as the best
1071  PS.solve();
1072  }
1073 }
1074 
1075 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
1076  MachineInstr &MI = *SchedBarrier.getInstr();
1077  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
1078  // Remove all existing edges from the SCHED_BARRIER that were added due to the
1079  // instruction having side effects.
1080  resetEdges(SchedBarrier, DAG);
1081  auto InvertedMask =
1082  invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1083  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
1084  SG.initSchedGroup();
1085  // Preserve original instruction ordering relative to the SCHED_BARRIER.
1086  SG.link(
1087  SchedBarrier,
1088  (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1089  const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
1090 }
1091 
1092 SchedGroupMask
1093 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1094  // Invert mask and erase bits for types of instructions that are implied to be
1095  // allowed past the SCHED_BARRIER.
1096  SchedGroupMask InvertedMask = ~Mask;
1097 
1098  // ALU implies VALU, SALU, MFMA.
1099  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1100  InvertedMask &=
1101  ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1102  // VALU, SALU, MFMA implies ALU.
1103  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1104  (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1105  (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1106  InvertedMask &= ~SchedGroupMask::ALU;
1107 
1108  // VMEM implies VMEM_READ, VMEM_WRITE.
1109  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1110  InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1111  // VMEM_READ, VMEM_WRITE implies VMEM.
1112  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1113  (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1114  InvertedMask &= ~SchedGroupMask::VMEM;
1115 
1116  // DS implies DS_READ, DS_WRITE.
1117  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1118  InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1119  // DS_READ, DS_WRITE implies DS.
1120  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1121  (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1122  InvertedMask &= ~SchedGroupMask::DS;
1123 
1124  return InvertedMask;
1125 }
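
// Worked example (added for exposition): a SCHED_BARRIER with mask 0x8 says
// "MFMA may cross this barrier". ~0x8 initially selects every other type, but
// since an allowed MFMA is also an ALU instruction the ALU bit is cleared too,
// so the blocking group built by addSchedBarrierEdges() ends up containing
// VALU, SALU, VMEM, VMEM_READ, VMEM_WRITE, DS, DS_READ and DS_WRITE.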
1126 
1127 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1128  std::vector<SUnit>::reverse_iterator RIter) {
1129  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1130  // to the instruction having side effects.
1131  resetEdges(*RIter, DAG);
1132  MachineInstr &SGB = *RIter->getInstr();
1133  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1134  int32_t SGMask = SGB.getOperand(0).getImm();
1135  int32_t Size = SGB.getOperand(1).getImm();
1136  int32_t SyncID = SGB.getOperand(2).getImm();
1137 
1138  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1139  Size, SyncID, DAG, TII);
1140 
1141  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
1142 }
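
// Added illustration (assumed MIR spelling, not taken from this file): a stage
// produced by the sched.group.barrier intrinsic appears as something like
//   SCHED_GROUP_BARRIER 32, 10, 0
// which the code above reads as mask = VMEM_READ (1u << 5), at most 10
// instructions, sync group 0, and turns into one SchedGroup keyed by that
// sync id.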
1143 
1144 void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1145  IGLPStrategyID StrategyID =
1146  (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1147  auto S = createIGLPStrategy(StrategyID, DAG, TII);
1148  if (S->shouldApplyStrategy(DAG))
1149  S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1150 }
1151 
1152 } // namespace
1153 
1154 namespace llvm {
1155 
1156 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1157  return std::make_unique<IGroupLPDAGMutation>();
1158 }
1159 
1160 } // end namespace llvm