AMDGPUIGroupLP.cpp
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations, such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIInstrInfo.h"
22#include "SIMachineFunctionInfo.h"
23#include "llvm/ADT/BitmaskEnum.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/TargetOpcodes.h"
27
28using namespace llvm;
29
30#define DEBUG_TYPE "igrouplp"
31
32namespace {
33
34static cl::opt<bool> EnableExactSolver(
35 "amdgpu-igrouplp-exact-solver", cl::Hidden,
36 cl::desc("Whether to use the exponential time solver to fit "
37 "the instructions to the pipeline as closely as "
38 "possible."),
39 cl::init(false));
40
41static cl::opt<unsigned> CutoffForExact(
42 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43 cl::desc("The maximum number of scheduling group conflicts "
44 "which we attempt to solve with the exponential time "
45 "exact solver. Problem sizes greater than this will"
46 "be solved by the less accurate greedy algorithm. Selecting "
47 "solver by size is superseded by manually selecting "
48 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
49
50static cl::opt<uint64_t> MaxBranchesExplored(
51 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52 cl::desc("The amount of branches that we are willing to explore with"
53 "the exact algorithm before giving up."));
54
55static cl::opt<bool> UseCostHeur(
56 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57 cl::desc("Whether to use the cost heuristic to make choices as we "
58 "traverse the search space using the exact solver. Defaulted "
59 "to on, and if turned off, we will use the node order -- "
60 "attempting to put the later nodes in the later sched groups. "
61 "Experimentally, results are mixed, so this should be set on a "
62 "case-by-case basis."));
63
64// Components of the mask that determines which instruction types may be
65// classified into a SchedGroup.
66enum class SchedGroupMask {
67 NONE = 0u,
68 ALU = 1u << 0,
69 VALU = 1u << 1,
70 SALU = 1u << 2,
71 MFMA = 1u << 3,
72 VMEM = 1u << 4,
73 VMEM_READ = 1u << 5,
74 VMEM_WRITE = 1u << 6,
75 DS = 1u << 7,
76 DS_READ = 1u << 8,
77 DS_WRITE = 1u << 9,
78 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79 DS_READ | DS_WRITE,
80 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81};
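// For illustration: the mask bits compose with plain bit-or. A mask of
// (SchedGroupMask::VALU | SchedGroupMask::SALU) == 0x6, for example, admits
// vector and scalar ALU instructions but excludes MFMA, VMEM and DS
// instructions. SCHED_BARRIER / SCHED_GROUP_BARRIER immediates use this same
// encoding (see canAddMI and invertSchedBarrierMask below).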
82
83class SchedGroup;
84
85// InstructionRule class is used to enact a filter which determines whether or
86// not an SU maps to a given SchedGroup. It contains complementary data
87// structures (e.g. Cache) to help those filters.
88class InstructionRule {
89protected:
90 const SIInstrInfo *TII;
91 unsigned SGID;
92 // A cache made available to the Filter to store SUnits for subsequent
93 // invocations of the Filter
94 std::optional<SmallVector<SUnit *, 4>> Cache;
95
96public:
97 virtual bool
98 apply(const SUnit *, const ArrayRef<SUnit *>,
99 SmallVectorImpl<SchedGroup> &) {
100 return true;
101 };
102
103 InstructionRule(const SIInstrInfo *TII, unsigned SGID,
104 bool NeedsCache = false)
105 : TII(TII), SGID(SGID) {
106 if (NeedsCache) {
107 Cache = SmallVector<SUnit *, 4>();
108 }
109 }
110
111 virtual ~InstructionRule() = default;
112};
113
114typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
115
116// Classify instructions into groups to enable fine tuned control over the
117// scheduler. These groups may be more specific than current SchedModel
118// instruction classes.
119class SchedGroup {
120private:
121 // Mask that defines which instruction types can be classified into this
122 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
123 // and SCHED_GROUP_BARRIER.
124 SchedGroupMask SGMask;
125
126 // Maximum number of SUnits that can be added to this group.
127 std::optional<unsigned> MaxSize;
128
129 // SchedGroups will only synchronize with other SchedGroups that have the same
130 // SyncID.
131 int SyncID = 0;
132
133 // SGID is used to map instructions to candidate SchedGroups
134 unsigned SGID;
135
136 // The different rules each instruction in this SchedGroup must conform to
137 SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
138
139 // Count of the number of created SchedGroups, used to initialize SGID.
140 static unsigned NumSchedGroups;
141
142 const SIInstrInfo *TII;
143
144 // Try to add an edge from SU A to SU B.
145 bool tryAddEdge(SUnit *A, SUnit *B);
146
147 // Use SGMask to determine whether we can classify MI as a member of this
148 // SchedGroup object.
149 bool canAddMI(const MachineInstr &MI) const;
150
151public:
152 // Collection of SUnits that are classified as members of this group.
153 SmallVector<SUnit *, 32> Collection;
154
155 ScheduleDAGInstrs *DAG;
156
157 // Returns true if SU can be added to this SchedGroup.
158 bool canAddSU(SUnit &SU) const;
159
160 // Add DAG dependencies between all SUnits in this SchedGroup and this SU. If
161 // MakePred is true, SU will be a predecessor of the SUnits in this
162 // SchedGroup, otherwise SU will be a successor.
163 void link(SUnit &SU, bool MakePred = false);
164
165 // Add DAG dependencies and track which edges are added, and the count of
166 // missed edges
167 int link(SUnit &SU, bool MakePred,
168 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
169
170 // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
171 // Use the predicate to determine whether SU should be a predecessor (P =
172 // true) or a successor (P = false) of this SchedGroup.
173 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
174
175 // Add DAG dependencies such that SUnits in this group shall be ordered
176 // before SUnits in OtherGroup.
177 void link(SchedGroup &OtherGroup);
178
179 // Returns true if no more instructions may be added to this group.
180 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
181
182 // Append a constraint that SUs must meet in order to fit into this
183 // SchedGroup. Since many rules involve the relationship between a SchedGroup
184 // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
185 // time (rather than SchedGroup init time.)
186 void addRule(std::shared_ptr<InstructionRule> NewRule) {
187 Rules.push_back(NewRule);
188 }
189
190 // Returns true if the SU matches all rules
191 bool allowedByRules(const SUnit *SU,
192 SmallVectorImpl<SchedGroup> &SyncPipe) const {
193 if (Rules.empty())
194 return true;
195 for (size_t I = 0; I < Rules.size(); I++) {
196 auto TheRule = Rules[I].get();
197 if (!TheRule->apply(SU, Collection, SyncPipe)) {
198 return false;
199 }
200 }
201 return true;
202 }
203
204 // Add SU to the SchedGroup.
205 void add(SUnit &SU) {
206 LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
207 << format_hex((int)SGMask, 10, true) << " adding "
208 << *SU.getInstr());
209 Collection.push_back(&SU);
210 }
211
212 // Remove last element in the SchedGroup
213 void pop() { Collection.pop_back(); }
214
215 // Identify and add all relevant SUs from the DAG to this SchedGroup.
216 void initSchedGroup();
217
218 // Add instructions to the SchedGroup bottom up starting from RIter.
219 // PipelineInstrs is a set of instructions that should not be added to the
220 // SchedGroup even when the other conditions for adding it are satisfied.
221 // RIter will be added to the SchedGroup as well, and dependencies will be
222 // added so that RIter will always be scheduled at the end of the group.
223 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
224 SUnitsToCandidateSGsMap &SyncedInstrs);
225
226 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
227
228 int getSyncID() { return SyncID; }
229
230 int getSGID() { return SGID; }
231
232 SchedGroupMask getMask() { return SGMask; }
233
234 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
235 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
236 : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
237 SGID = NumSchedGroups++;
238 }
239
240 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
241 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
242 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
243 SGID = NumSchedGroups++;
244 }
245};
246
247// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
248static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
249 assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
250 SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
251 SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
252
253 while (!SU.Preds.empty())
254 for (auto &P : SU.Preds)
255 SU.removePred(P);
256
257 while (!SU.Succs.empty())
258 for (auto &S : SU.Succs)
259 for (auto &SP : S.getSUnit()->Preds)
260 if (SP.getSUnit() == &SU)
261 S.getSUnit()->removePred(SP);
262}
263
264typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
265typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
266
267// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
268// in non-trivial cases. For example, if the requested pipeline is
269// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
270// in the DAG, then we will have an instruction that can not be trivially
271// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
272// to find a good solution to the pipeline -- a greedy algorithm and an exact
273// algorithm. The exact algorithm has an exponential time complexity and should
274// only be used for small sized problems or medium sized problems where an exact
275// solution is highly desired.
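// A note on terminology used below: "cost" is counted in missed pipeline
// edges, i.e. ordering edges that SchedGroup::link / addEdges could not add.
// A BestCost of 0 therefore means every requested ordering was enforced; the
// exact solver stops as soon as such a fit is found (see checkOptimal), and
// the greedy solver stops scanning candidates for an SU once a zero-cost
// SchedGroup is found.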
276class PipelineSolver {
277 ScheduleDAGMI *DAG;
278
279 // Instructions that can be assigned to multiple SchedGroups
280 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
281 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
282 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
283 // The current working pipeline
284 SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
285 // The pipeline that has the best solution found so far
286 SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
287
288 // Whether or not we actually have any SyncedInstrs to try to solve.
289 bool NeedsSolver = false;
290
291 // Compute an estimate of the size of search tree -- the true size is
292 // the product of each conflictedInst.Matches.size() across all SyncPipelines
293 unsigned computeProblemSize();
294
295 // The cost penalty of not assigning a SU to a SchedGroup
296 int MissPenalty = 0;
297
298 // Costs in terms of the number of edges we are unable to add
299 int BestCost = -1;
300 int CurrCost = 0;
301
302 // Index pointing to the conflicting instruction that is currently being
303 // fitted
304 int CurrConflInstNo = 0;
305 // Index to the pipeline that is currently being fitted
306 int CurrSyncGroupIdx = 0;
307 // The first non trivial pipeline
308 int BeginSyncGroupIdx = 0;
309
310 // How many branches we have explored
311 uint64_t BranchesExplored = 0;
312
313 // The direction in which we process the candidate SchedGroups per SU
314 bool IsBottomUp = 1;
315
316 // Update indices to fit next conflicting instruction
317 void advancePosition();
318 // Recede indices to attempt to find better fit for previous conflicting
319 // instruction
320 void retreatPosition();
321
322 // The exponential time algorithm which finds the provably best fit
323 bool solveExact();
324 // The polynomial time algorithm which attempts to find a good fit
325 bool solveGreedy();
326 // Find the best SchedGroup for the current SU using the heuristic given all
327 // current information. One step in the greedy algorithm. Templated against
328 // the SchedGroup iterator (either reverse or forward).
329 template <typename T>
330 void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
331 T E);
332 // Whether or not the current solution is optimal
333 bool checkOptimal();
334 // Populate the ready list, prioritizing fewest missed edges first
335 // Templated against the SchedGroup iterator (either reverse or forward).
336 template <typename T>
337 void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
338 T E);
339 // Add edges corresponding to the SchedGroups as assigned by solver
340 void makePipeline();
341 // Link the SchedGroups in the best found pipeline.
342 // Templated against the SchedGroup iterator (either reverse or forward).
343 template <typename T> void linkSchedGroups(T I, T E);
344 // Add the edges from the SU to the other SchedGroups in pipeline, and
345 // return the number of edges missed.
346 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
347 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
348 // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
349 // returns the cost (in terms of missed pipeline edges), and tracks the edges
350 // added in \p AddedEdges
351 template <typename T>
352 int linkSUnit(SUnit *SU, int SGID,
353 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
354 // Remove the edges passed via \p AddedEdges
355 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
356 // Convert the passed in maps to arrays for bidirectional iterators
357 void convertSyncMapsToArrays();
358
359 void reset();
360
361public:
362 // Invoke the solver to map instructions to instruction groups. The size
363 // heuristic and command-line options determine whether to use the exact or greedy algorithm.
364 void solve();
365
366 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
367 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
368 ScheduleDAGMI *DAG, bool IsBottomUp = 1)
369 : DAG(DAG), SyncedInstrs(SyncedInstrs),
370 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
371
372 for (auto &PipelineInstrs : SyncedInstrs) {
373 if (PipelineInstrs.second.size() > 0) {
374 NeedsSolver = true;
375 break;
376 }
377 }
378
379 if (!NeedsSolver)
380 return;
381
382 convertSyncMapsToArrays();
383
384 CurrPipeline = BestPipeline;
385
386 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
387 PipelineInstrs[BeginSyncGroupIdx].size() == 0)
388 ++BeginSyncGroupIdx;
389
390 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
391 return;
392 }
393};
394
395void PipelineSolver::reset() {
396
397 for (auto &SyncPipeline : CurrPipeline) {
398 for (auto &SG : SyncPipeline) {
399 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
400 SG.Collection.clear();
401 auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
402 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
403 });
404 if (SchedBarr != TempCollection.end())
405 SG.Collection.push_back(*SchedBarr);
406 }
407 }
408
409 CurrSyncGroupIdx = BeginSyncGroupIdx;
410 CurrConflInstNo = 0;
411 CurrCost = 0;
412}
413
414void PipelineSolver::convertSyncMapsToArrays() {
415 for (auto &SyncPipe : SyncedSchedGroups) {
416 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
417 }
418
419 int PipelineIDx = SyncedInstrs.size() - 1;
420 PipelineInstrs.resize(SyncedInstrs.size());
421 for (auto &SyncInstrMap : SyncedInstrs) {
422 for (auto &SUsToCandSGs : SyncInstrMap.second) {
423 if (PipelineInstrs[PipelineIDx].size() == 0) {
424 PipelineInstrs[PipelineIDx].push_back(
425 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
426 continue;
427 }
428 auto SortPosition = PipelineInstrs[PipelineIDx].begin();
429 // Insert them in sorted order -- this allows for good parsing order in
430 // the greedy algorithm
431 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
432 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
433 ++SortPosition;
434 PipelineInstrs[PipelineIDx].insert(
435 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
436 }
437 --PipelineIDx;
438 }
439}
440
441template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
442 for (; I != E; ++I) {
443 auto &GroupA = *I;
444 for (auto J = std::next(I); J != E; ++J) {
445 auto &GroupB = *J;
446 GroupA.link(GroupB);
447 }
448 }
449}
450
451void PipelineSolver::makePipeline() {
452 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
453 for (auto &SyncPipeline : BestPipeline) {
454 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
455 for (auto &SG : SyncPipeline) {
456 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
457 << " has: \n");
458 SUnit *SGBarr = nullptr;
459 for (auto &SU : SG.Collection) {
460 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
461 SGBarr = SU;
462 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
463 }
464 // Command line requested IGroupLP doesn't have SGBarr
465 if (!SGBarr)
466 continue;
467 resetEdges(*SGBarr, DAG);
468 SG.link(*SGBarr, false);
469 }
470 }
471
472 for (auto &SyncPipeline : BestPipeline) {
473 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
474 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
475 }
476}
477
478template <typename T>
479int PipelineSolver::linkSUnit(
480 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
481 T I, T E) {
482 bool MakePred = false;
483 int AddedCost = 0;
484 for (; I < E; ++I) {
485 if (I->getSGID() == SGID) {
486 MakePred = true;
487 continue;
488 }
489 auto Group = *I;
490 AddedCost += Group.link(*SU, MakePred, AddedEdges);
491 assert(AddedCost >= 0);
492 }
493 return AddedCost;
494}
495
496int PipelineSolver::addEdges(
497 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
498 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
499
500 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
501 // instructions that are the ultimate successors in the resultant mutation.
502 // Therefore, in such a configuration, the SchedGroups occurring before the
503 // candidate SGID are successors of the candidate SchedGroup, thus the current
504 // SU should be linked as a predecessor to SUs in those SchedGroups. The
505 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
506 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
507 // IsBottomUp (in reverse).
508 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
509 SyncPipeline.rend())
510 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
511 SyncPipeline.end());
512}
513
514void PipelineSolver::removeEdges(
515 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
516 // Only remove the edges that we have added when testing
517 // the fit.
518 for (auto &PredSuccPair : EdgesToRemove) {
519 SUnit *Pred = PredSuccPair.first;
520 SUnit *Succ = PredSuccPair.second;
521
522 auto Match = llvm::find_if(
523 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
524 if (Match != Succ->Preds.end()) {
525 assert(Match->isArtificial());
526 Succ->removePred(*Match);
527 }
528 }
529}
530
531void PipelineSolver::advancePosition() {
532 ++CurrConflInstNo;
533
534 if (static_cast<size_t>(CurrConflInstNo) >=
535 PipelineInstrs[CurrSyncGroupIdx].size()) {
536 CurrConflInstNo = 0;
537 ++CurrSyncGroupIdx;
538 // Advance to next non-trivial pipeline
539 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
540 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
541 ++CurrSyncGroupIdx;
542 }
543}
544
545void PipelineSolver::retreatPosition() {
546 assert(CurrConflInstNo >= 0);
547 assert(CurrSyncGroupIdx >= 0);
548
549 if (CurrConflInstNo > 0) {
550 --CurrConflInstNo;
551 return;
552 }
553
554 if (CurrConflInstNo == 0) {
555 // If we return to the starting position, we have explored
556 // the entire tree
557 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
558 return;
559
560 --CurrSyncGroupIdx;
561 // Go to previous non-trivial pipeline
562 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
563 --CurrSyncGroupIdx;
564
565 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
566 }
567}
568
569bool PipelineSolver::checkOptimal() {
570 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
571 if (BestCost == -1 || CurrCost < BestCost) {
572 BestPipeline = CurrPipeline;
573 BestCost = CurrCost;
574 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
575 }
576 assert(BestCost >= 0);
577 }
578
579 bool DoneExploring = false;
580 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
581 DoneExploring = true;
582
583 return (DoneExploring || BestCost == 0);
584}
585
586template <typename T>
587void PipelineSolver::populateReadyList(
588 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
589 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
590 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
591 assert(CurrSU.second.size() >= 1);
592
593 for (; I != E; ++I) {
594 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
595 int CandSGID = *I;
596 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
597 return SG.getSGID() == CandSGID;
598 });
599 assert(Match);
600
601 if (UseCostHeur) {
602 if (Match->isFull()) {
603 ReadyList.push_back(std::pair(*I, MissPenalty));
604 continue;
605 }
606
607 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
608 ReadyList.push_back(std::pair(*I, TempCost));
609 removeEdges(AddedEdges);
610 } else
611 ReadyList.push_back(std::pair(*I, -1));
612 }
613
614 if (UseCostHeur) {
615 std::sort(ReadyList.begin(), ReadyList.end(),
616 [](std::pair<int, int> A, std::pair<int, int> B) {
617 return A.second < B.second;
618 });
619 }
620
621 assert(ReadyList.size() == CurrSU.second.size());
622}
623
624bool PipelineSolver::solveExact() {
625 if (checkOptimal())
626 return true;
627
628 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
629 return false;
630
631 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
632 assert(static_cast<size_t>(CurrConflInstNo) <
633 PipelineInstrs[CurrSyncGroupIdx].size());
634 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
635 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
636 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
637
638 // SchedGroup -> Cost pairs
639 SmallVector<std::pair<int, int>, 4> ReadyList;
640 // Prioritize the candidate sched groups in terms of lowest cost first
641 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
642 CurrSU.second.rend())
643 : populateReadyList(ReadyList, CurrSU.second.begin(),
644 CurrSU.second.end());
645
646 auto I = ReadyList.begin();
647 auto E = ReadyList.end();
648 for (; I != E; ++I) {
649 // If we are trying SGs in least cost order, and the current SG is cost
650 // infeasible, then all subsequent SGs will also be cost infeasible, so we
651 // can prune.
652 if (BestCost != -1 && (CurrCost + I->second > BestCost))
653 return false;
654
655 int CandSGID = I->first;
656 int AddedCost = 0;
657 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
658 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
659 SchedGroup *Match;
660 for (auto &SG : SyncPipeline) {
661 if (SG.getSGID() == CandSGID)
662 Match = &SG;
663 }
664
665 if (Match->isFull())
666 continue;
667
668 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
669 continue;
670
671 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
672 << (int)Match->getMask() << " and ID " << CandSGID
673 << "\n");
674 Match->add(*CurrSU.first);
675 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
676 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
677 CurrCost += AddedCost;
678 advancePosition();
679 ++BranchesExplored;
680 bool FinishedExploring = false;
681 // If the Cost after adding edges is greater than a known solution,
682 // backtrack
683 if (CurrCost < BestCost || BestCost == -1) {
684 if (solveExact()) {
685 FinishedExploring = BestCost != 0;
686 if (!FinishedExploring)
687 return true;
688 }
689 }
690
691 retreatPosition();
692 CurrCost -= AddedCost;
693 removeEdges(AddedEdges);
694 Match->pop();
695 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
696 if (FinishedExploring)
697 return true;
698 }
699
700 // Try the pipeline where the current instruction is omitted
701 // Potentially if we omit a problematic instruction from the pipeline,
702 // all the other instructions can nicely fit.
703 CurrCost += MissPenalty;
704 advancePosition();
705
706 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
707
708 bool FinishedExploring = false;
709 if (CurrCost < BestCost || BestCost == -1) {
710 if (solveExact()) {
711 FinishedExploring = BestCost != 0;
712 if (!FinishedExploring)
713 return true;
714 }
715 }
716
717 retreatPosition();
718 CurrCost -= MissPenalty;
719 return FinishedExploring;
720}
721
722template <typename T>
723void PipelineSolver::greedyFind(
724 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
725 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
726 int BestNodeCost = -1;
727 int TempCost;
728 SchedGroup *BestGroup = nullptr;
729 int BestGroupID = -1;
730 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
731 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
732 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
733
734 // Since we have added the potential SchedGroups from bottom up, but
735 // traversed the DAG from top down, parse over the groups from last to
736 // first. If we fail to do this for the greedy algorithm, the solution will
737 // likely not be good in more complex cases.
738 for (; I != E; ++I) {
739 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
740 int CandSGID = *I;
741 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
742 return SG.getSGID() == CandSGID;
743 });
744 assert(Match);
745
746 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
747 << (int)Match->getMask() << "\n");
748
749 if (Match->isFull()) {
750 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
751 continue;
752 }
753 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
754 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
755 continue;
756 }
757 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
758 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
759 if (TempCost < BestNodeCost || BestNodeCost == -1) {
760 BestGroup = Match;
761 BestNodeCost = TempCost;
762 BestGroupID = CandSGID;
763 }
764 removeEdges(AddedEdges);
765 if (BestNodeCost == 0)
766 break;
767 }
768
769 if (BestGroupID != -1) {
770 BestGroup->add(*CurrSU.first);
771 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
772 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
773 << (int)BestGroup->getMask() << "\n");
774 BestCost += TempCost;
775 } else
776 BestCost += MissPenalty;
777
778 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
779}
780
781bool PipelineSolver::solveGreedy() {
782 BestCost = 0;
783 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
784
785 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
786 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
787 IsBottomUp
788 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
789 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
790 advancePosition();
791 }
792 BestPipeline = CurrPipeline;
793 removeEdges(AddedEdges);
794 return false;
795}
796
797unsigned PipelineSolver::computeProblemSize() {
798 unsigned ProblemSize = 0;
799 for (auto &PipeConflicts : PipelineInstrs) {
800 ProblemSize += PipeConflicts.size();
801 }
802
803 return ProblemSize;
804}
805
806void PipelineSolver::solve() {
807 if (!NeedsSolver)
808 return;
809
810 unsigned ProblemSize = computeProblemSize();
811 assert(ProblemSize > 0);
812
813 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
814 MissPenalty = (ProblemSize / 2) + 1;
815
816 LLVM_DEBUG(DAG->dump());
817 if (EnableExactSolver || BelowCutoff) {
818 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
819 solveGreedy();
820 reset();
821 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
822 if (BestCost > 0) {
823 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
824 solveExact();
825 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
826 }
827 } else { // Use the Greedy Algorithm by default
828 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
829 solveGreedy();
830 }
831
832 makePipeline();
833 LLVM_DEBUG(dbgs() << "After applying mutation\n");
834 LLVM_DEBUG(DAG->dump());
835}
836
837enum IGLPStrategyID : int {
838 MFMASmallGemmOptID = 0,
839 MFMASmallGemmSingleWaveOptID = 1,
840};
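// These IDs correspond to the immediate operand of an IGLP_OPT pseudo, which
// initIGLPOpt() below reads back via getOperand(0).getImm(). Assuming the
// usual clang builtin name, a kernel would request a strategy with e.g.
// __builtin_amdgcn_iglp_opt(0) for MFMASmallGemmOptID or
// __builtin_amdgcn_iglp_opt(1) for MFMASmallGemmSingleWaveOptID.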
841
842// Implement an IGLP scheduling strategy.
843class IGLPStrategy {
844protected:
845 ScheduleDAGInstrs *DAG;
846
847 const SIInstrInfo *TII;
848
849public:
850 // Add SchedGroups to \p Pipeline to implement this Strategy.
851 virtual void applyIGLPStrategy(
852 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
853 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
854
855 // Returns true if this strategy should be applied to a ScheduleDAG.
856 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
857
858 bool IsBottomUp = 1;
859
860 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
861 : DAG(DAG), TII(TII) {}
862
863 virtual ~IGLPStrategy() = default;
864};
865
866class MFMASmallGemmOpt final : public IGLPStrategy {
867private:
868public:
869 void applyIGLPStrategy(
870 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
871 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
872
873 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
874
875 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
876 : IGLPStrategy(DAG, TII) {
877 IsBottomUp = 1;
878 }
879};
880
881void MFMASmallGemmOpt::applyIGLPStrategy(
882 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
883 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
884 // Count the number of MFMA instructions.
885 unsigned MFMACount = 0;
886 for (const MachineInstr &I : *DAG)
887 if (TII->isMFMAorWMMA(I))
888 ++MFMACount;
889
890 const unsigned PipelineSyncID = 0;
891 SchedGroup *SG = nullptr;
892 for (unsigned I = 0; I < MFMACount * 3; ++I) {
893 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
894 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
895 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
896
897 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
898 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
899 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
900 }
901}
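// Net effect of the strategy above: within sync group 0 the requested
// pipeline is the pattern { DS x2, MFMA x1 } repeated 3 * MFMACount times,
// interleaving LDS traffic between consecutive MFMAs.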
902
903class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
904private:
905 // Whether the DS_READ is a predecessor of the first four MFMAs in the region
906 class EnablesInitialMFMA final : public InstructionRule {
907 public:
908 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
909 SmallVectorImpl<SchedGroup> &SyncPipe) override {
910 if (!SyncPipe.size())
911 return false;
912 int MFMAsFound = 0;
913 if (!Cache->size()) {
914 for (auto &Elt : SyncPipe[0].DAG->SUnits) {
915 if (TII->isMFMAorWMMA(*Elt.getInstr())) {
916 ++MFMAsFound;
917 if (MFMAsFound > 4)
918 break;
919 Cache->push_back(&Elt);
920 }
921 }
922 }
923
924 assert(Cache->size());
925 auto DAG = SyncPipe[0].DAG;
926 for (auto &Elt : *Cache) {
927 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
928 return true;
929 }
930 return false;
931 }
932
933 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
934 bool NeedsCache = false)
935 : InstructionRule(TII, SGID, NeedsCache) {}
936 };
937
938 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
939 class IsPermForDSW final : public InstructionRule {
940 public:
941 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
942 SmallVectorImpl<SchedGroup> &SyncPipe) override {
943 auto MI = SU->getInstr();
944 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
945 return false;
946
947 bool FitsInGroup = false;
948 // Does the VALU have a DS_WRITE successor
949 if (!Collection.size()) {
950 for (auto &Succ : SU->Succs) {
951 SUnit *SuccUnit = Succ.getSUnit();
952 if (TII->isDS(*SuccUnit->getInstr()) &&
953 SuccUnit->getInstr()->mayStore()) {
954 Cache->push_back(SuccUnit);
955 FitsInGroup = true;
956 }
957 }
958 return FitsInGroup;
959 }
960
961 assert(Cache->size());
962
963 // Does the VALU have a DS_WRITE successor that is the same as other
964 // VALU already in the group. The V_PERMs will all share 1 DS_W succ
965 return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) {
966 return std::any_of(SU->Succs.begin(), SU->Succs.end(),
967 [&Elt](const SDep &ThisSucc) {
968 return ThisSucc.getSUnit() == Elt;
969 });
970 });
971 }
972
973 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
974 : InstructionRule(TII, SGID, NeedsCache) {}
975 };
976
977 // Whether the SU is a successor of any element in previous SchedGroup
978 class IsSuccOfPrevGroup final : public InstructionRule {
979 public:
980 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
981 SmallVectorImpl<SchedGroup> &SyncPipe) override {
982 SchedGroup *OtherGroup = nullptr;
983 for (auto &PipeSG : SyncPipe) {
984 if ((unsigned)PipeSG.getSGID() == SGID - 1) {
985 OtherGroup = &PipeSG;
986 }
987 }
988
989 if (!OtherGroup)
990 return false;
991 if (!OtherGroup->Collection.size())
992 return true;
993
994 // Does the previous VALU have this DS_Write as a successor
995 return (std::any_of(OtherGroup->Collection.begin(),
996 OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
997 return std::any_of(Elt->Succs.begin(),
998 Elt->Succs.end(),
999 [&SU](SDep &Succ) {
1000 return Succ.getSUnit() == SU;
1001 });
1002 }));
1003 }
1004 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
1005 bool NeedsCache = false)
1006 : InstructionRule(TII, SGID, NeedsCache) {}
1007 };
1008
1009 // Whether the combined load width of group is 128 bits
1010 class VMEMSize final : public InstructionRule {
1011 public:
1012 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1013 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1014 auto MI = SU->getInstr();
1015 if (MI->getOpcode() == TargetOpcode::BUNDLE)
1016 return false;
1017 if (!Collection.size())
1018 return true;
1019
1020 int NumBits = 0;
1021
1022 auto TRI = TII->getRegisterInfo();
1023 auto &MRI = MI->getParent()->getParent()->getRegInfo();
1024 for (auto &Elt : Collection) {
1025 auto Op = Elt->getInstr()->getOperand(0);
1026 auto Size =
1027 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
1028 NumBits += Size;
1029 }
1030
1031 if (NumBits < 128) {
1032 assert(TII->isVMEM(*MI) && MI->mayLoad());
1033 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
1034 MRI, MI->getOperand(0))) <=
1035 128)
1036 return true;
1037 }
1038
1039 return false;
1040 }
1041
1042 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1043 : InstructionRule(TII, SGID, NeedsCache) {}
1044 };
1045
1046 // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
1047 // that is \p Distance steps away
1048 class SharesPredWithPrevNthGroup final : public InstructionRule {
1049 private:
1050 unsigned Distance = 1;
1051
1052 public:
1053 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1054 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1055 SchedGroup *OtherGroup = nullptr;
1056 if (!SyncPipe.size())
1057 return false;
1058
1059 if (!Cache->size()) {
1060
1061 for (auto &PipeSG : SyncPipe) {
1062 if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
1063 OtherGroup = &PipeSG;
1064 }
1065 }
1066
1067 if (!OtherGroup)
1068 return false;
1069 if (!OtherGroup->Collection.size())
1070 return true;
1071
1072 for (auto &OtherEle : OtherGroup->Collection) {
1073 for (auto &Pred : OtherEle->Preds) {
1074 if (Pred.getSUnit()->getInstr()->getOpcode() ==
1075 AMDGPU::V_PERM_B32_e64)
1076 Cache->push_back(Pred.getSUnit());
1077 }
1078 }
1079 }
1080
1081 assert(Cache->size());
1082 auto DAG = SyncPipe[0].DAG;
1083 // Does the previous DS_WRITE share a V_PERM predecessor with this
1084 // VMEM_READ
1085 return (
1086 std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
1087 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
1088 }));
1089 }
1090 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1091 unsigned SGID, bool NeedsCache = false)
1092 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1093 };
1094
1095public:
1096 void applyIGLPStrategy(
1097 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1098 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
1099
1100 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
1101
1102 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1103 : IGLPStrategy(DAG, TII) {
1104 IsBottomUp = 0;
1105 }
1106};
1107
1108void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
1109 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1110 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
1111 unsigned MFMACount = 0;
1112 unsigned DSWCount = 0;
1113 unsigned DSWWithPermCount = 0;
1114 unsigned DSWWithSharedVMEMCount = 0;
1115 unsigned DSRCount = 0;
1116 SmallVector<SUnit *, 6> DSWithPerms;
1117 for (auto &SU : DAG->SUnits) {
1118 auto I = SU.getInstr();
1119 if (TII->isMFMAorWMMA(*I))
1120 ++MFMACount;
1121 else if (TII->isDS(*I)) {
1122 if (I->mayLoad())
1123 ++DSRCount;
1124 else if (I->mayStore()) {
1125 ++DSWCount;
1126 for (auto Pred : SU.Preds) {
1127 if (Pred.getSUnit()->getInstr()->getOpcode() ==
1128 AMDGPU::V_PERM_B32_e64) {
1129 DSWithPerms.push_back(&SU);
1130 break;
1131 }
1132 }
1133 }
1134 }
1135 }
1136 DSWWithPermCount = DSWithPerms.size();
1137 auto I = DSWithPerms.begin();
1138 auto E = DSWithPerms.end();
1139
1140 // Get the count of DS_WRITES with V_PERM predecessors which
1141 // have loop carried dependencies (WAR) on the same VMEM_READs.
1142 // We consider partial overlap as a miss -- in other words,
1143 // for a given DS_W, we only consider another DS_W as matching
1144 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
1145 // for every V_PERM pred of this DS_W.
1146 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
1147 SmallVector<SUnit *, 6> Counted;
1148 for (; I != E; I++) {
1149 SUnit *Cand = nullptr;
1150 bool MissedAny = false;
1151 for (auto &Pred : (*I)->Preds) {
1152 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
1153 continue;
1154
1155 if (Cand && llvm::is_contained(Counted, Cand))
1156 break;
1157
1158 for (auto &Succ : Pred.getSUnit()->Succs) {
1159 auto MI = Succ.getSUnit()->getInstr();
1160 if (!TII->isVMEM(*MI) || !MI->mayLoad())
1161 continue;
1162
1163 if (MissedAny || !VMEMLookup.size()) {
1164 MissedAny = true;
1165 VMEMLookup[MI] = *I;
1166 continue;
1167 }
1168
1169 if (!VMEMLookup.contains(MI)) {
1170 MissedAny = true;
1171 VMEMLookup[MI] = *I;
1172 continue;
1173 }
1174
1175 Cand = VMEMLookup[MI];
1176 if (llvm::is_contained(Counted, Cand)) {
1177 MissedAny = true;
1178 break;
1179 }
1180 }
1181 }
1182 if (!MissedAny && Cand) {
1183 DSWWithSharedVMEMCount += 2;
1184 Counted.push_back(Cand);
1185 Counted.push_back(*I);
1186 }
1187 }
1188
1189 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
1190 SchedGroup *SG;
1191 unsigned PipelineSyncID = 0;
1192 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
1193 if (DSWWithPermCount) {
1194 for (unsigned I = 0; I < MFMACount; I++) {
1195 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1196 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1197 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1198
1199 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1200 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
1201 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1202 }
1203 }
1204
1205 PipelineSyncID = 1;
1206 // Phase 1: Break up DS_READ and MFMA clusters.
1207 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
1208 // prefetch
1209
1210 // Make ready initial MFMA
1211 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1212 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
1213 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
1214 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1215
1216 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1217 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1218 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1219
1220 // Interleave MFMA with DS_READ prefetch
1221 for (unsigned I = 0; I < DSRCount - 4; ++I) {
1222 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1223 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
1224 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1225
1226 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1227 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1228 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1229 }
1230
1231 // Phase 2a: Loop carried dependency with V_PERM
1232 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
1233 // depend on. Interleave MFMA to keep XDL unit busy throughout.
1234 for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
1235 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1236 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
1237 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
1238 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1239
1240 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1241 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
1242 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
1243 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1244
1245 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1246 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
1247 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
1248 1, TII, SG->getSGID(), true));
1249 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
1250 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1251
1252 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1253 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1254 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1255
1256 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1257 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
1258 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
1259 3, TII, SG->getSGID(), true));
1260 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
1261 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1262
1263 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1264 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1265 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1266 }
1267
1268 // Phase 2b: Loop carried dependency without V_PERM
1269 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
1270 // Interleave MFMA to keep XDL unit busy throughout.
1271 for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
1272 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1273 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
1274 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1275
1276 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1277 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
1278 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
1279 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1280
1281 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1282 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1283 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1284 }
1285
1286 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
1287 // ultimately used by two DS_WRITE
1288 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
1289 // depend on. Interleave MFMA to keep XDL unit busy throughout.
1290
1291 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
1292 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1293 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
1294 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
1295 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1296
1297 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1298 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
1299 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
1300 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1301
1302 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1303 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1304 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1305
1306 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1307 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
1308 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
1309 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1310
1311 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1312 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
1313 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
1314 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1315
1316 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1317 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1318 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1319
1320 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1321 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
1322 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
1323 2, TII, SG->getSGID(), true));
1324 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
1325 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1326
1327 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1328 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1329 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1330
1331 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1332 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
1333 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
1334 4, TII, SG->getSGID(), true));
1335 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
1336 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1337
1338 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1339 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1340 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1341 }
1342}
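// Net effect of the strategy above: two synchronized pipelines are built.
// Sync ID 0 alternates { MFMA x1, VALU x2 } whenever V_PERM-fed DS_WRITEs
// exist, while sync ID 1 first pairs DS_READs with MFMAs (Phase 1) and then
// cycles V_PERM / DS_WRITE / VMEM_READ groups with MFMAs for the three
// loop-carried cases (Phases 2a-2c).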
1343
1344static std::unique_ptr<IGLPStrategy>
1345createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
1346 const SIInstrInfo *TII) {
1347 switch (ID) {
1348 case MFMASmallGemmOptID:
1349 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
1350 case MFMASmallGemmSingleWaveOptID:
1351 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
1352 }
1353
1354 llvm_unreachable("Unknown IGLPStrategyID");
1355}
1356
1357class IGroupLPDAGMutation : public ScheduleDAGMutation {
1358private:
1359 const SIInstrInfo *TII;
1360
1361 ScheduleDAGMI *DAG;
1362
1363 // Organize lists of SchedGroups by their SyncID. SchedGroups /
1364 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
1365 // between them.
1366 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
1367
1368 // Used to track instructions that can be mapped to multiple sched groups
1369 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
1370
1371 // Add DAG edges that enforce SCHED_BARRIER ordering.
1372 void addSchedBarrierEdges(SUnit &SU);
1373
1374 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
1375 // not be reordered across the SCHED_BARRIER. This is used for the base
1376 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
1377 // SCHED_BARRIER will always block all instructions that can be classified
1378 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
1379 // and may only synchronize with some SchedGroups. Returns the inverse of
1380 // Mask. SCHED_BARRIER's mask describes which instruction types should be
1381 // allowed to be scheduled across it. Invert the mask to get the
1382 // SchedGroupMask of instructions that should be barred.
1383 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
1384
1385 // Create SchedGroups for a SCHED_GROUP_BARRIER.
1386 void initSchedGroupBarrierPipelineStage(
1387 std::vector<SUnit>::reverse_iterator RIter);
1388
1389 void initIGLPOpt(SUnit &SU);
1390
1391public:
1392 void apply(ScheduleDAGInstrs *DAGInstrs) override;
1393
1394 // The order in which the PipelineSolver should process the candidate
1395 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
1396 // created SchedGroup first, and will consider that as the ultimate
1397 // predecessor group when linking. TOP_DOWN instead links and processes the
1398 // first created SchedGroup first.
1399 bool IsBottomUp = 1;
1400
1401 IGroupLPDAGMutation() = default;
1402};
1403
1404unsigned SchedGroup::NumSchedGroups = 0;
1405
1406bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
1407 if (A != B && DAG->canAddEdge(B, A)) {
1408 DAG->addEdge(B, SDep(A, SDep::Artificial));
1409 return true;
1410 }
1411 return false;
1412}
1413
1414bool SchedGroup::canAddMI(const MachineInstr &MI) const {
1415 bool Result = false;
1416 if (MI.isMetaInstruction())
1417 Result = false;
1418
1419 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
1420 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
1421 Result = true;
1422
1423 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
1424 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
1425 Result = true;
1426
1427 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
1428 TII->isSALU(MI))
1429 Result = true;
1430
1431 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
1432 TII->isMFMAorWMMA(MI))
1433 Result = true;
1434
1435 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
1436 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1437 Result = true;
1438
1439 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
1440 MI.mayLoad() &&
1441 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1442 Result = true;
1443
1444 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
1445 MI.mayStore() &&
1446 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
1447 Result = true;
1448
1449 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
1450 TII->isDS(MI))
1451 Result = true;
1452
1453 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
1454 MI.mayLoad() && TII->isDS(MI))
1455 Result = true;
1456
1457 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
1458 MI.mayStore() && TII->isDS(MI))
1459 Result = true;
1460
1461 LLVM_DEBUG(
1462 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
1463 << (Result ? " could classify " : " unable to classify ") << MI);
1464
1465 return Result;
1466}
1467
1468int SchedGroup::link(SUnit &SU, bool MakePred,
1469 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
1470 int MissedEdges = 0;
1471 for (auto *A : Collection) {
1472 SUnit *B = &SU;
1473 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
1474 continue;
1475 if (MakePred)
1476 std::swap(A, B);
1477
1478 if (DAG->IsReachable(B, A))
1479 continue;
1480
1481 // tryAddEdge returns false if there is a dependency that makes adding
1482 // the A->B edge impossible, otherwise it returns true;
1483 bool Added = tryAddEdge(A, B);
1484 if (Added)
1485 AddedEdges.push_back(std::pair(A, B));
1486 else
1487 ++MissedEdges;
1488 }
1489
1490 return MissedEdges;
1491}
1492
1493void SchedGroup::link(SUnit &SU, bool MakePred) {
1494 for (auto *A : Collection) {
1495 SUnit *B = &SU;
1496 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
1497 continue;
1498 if (MakePred)
1499 std::swap(A, B);
1500
1501 tryAddEdge(A, B);
1502 }
1503}
1504
1505void SchedGroup::link(SUnit &SU,
1506 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
1507 for (auto *A : Collection) {
1508 SUnit *B = &SU;
1509 if (P(A, B))
1510 std::swap(A, B);
1511
1512 tryAddEdge(A, B);
1513 }
1514}
1515
1516void SchedGroup::link(SchedGroup &OtherGroup) {
1517 for (auto *B : OtherGroup.Collection)
1518 link(*B);
1519}
1520
1521bool SchedGroup::canAddSU(SUnit &SU) const {
1522 MachineInstr &MI = *SU.getInstr();
1523 if (MI.getOpcode() != TargetOpcode::BUNDLE)
1524 return canAddMI(MI);
1525
1526 // Special case for bundled MIs.
1527 const MachineBasicBlock *MBB = MI.getParent();
1528 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
1529 while (E != MBB->end() && E->isBundledWithPred())
1530 ++E;
1531
1532 // Return true if all of the bundled MIs can be added to this group.
1533 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
1534}
1535
1536void SchedGroup::initSchedGroup() {
1537 for (auto &SU : DAG->SUnits) {
1538 if (isFull())
1539 break;
1540
1541 if (canAddSU(SU))
1542 add(SU);
1543 }
1544}
1545
1546void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
1547 SUnitsToCandidateSGsMap &SyncedInstrs) {
1548 SUnit &InitSU = *RIter;
1549 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
1550 auto &SU = *RIter;
1551 if (isFull())
1552 break;
1553
1554 if (canAddSU(SU))
1555 SyncedInstrs[&SU].push_back(SGID);
1556 }
1557
1558 add(InitSU);
1559 assert(MaxSize);
1560 (*MaxSize)++;
1561}
1562
1563void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
1564 auto I = DAG->SUnits.rbegin();
1565 auto E = DAG->SUnits.rend();
1566 for (; I != E; ++I) {
1567 auto &SU = *I;
1568 if (isFull())
1569 break;
1570
1571 if (canAddSU(SU))
1572 SyncedInstrs[&SU].push_back(SGID);
1573 }
1574}
1575
1576void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
1577 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1578 if (!TSchedModel || DAGInstrs->SUnits.empty())
1579 return;
1580
1581 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
1582 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1583 TII = ST.getInstrInfo();
1584 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1585 SyncedSchedGroups.clear();
1586 SyncedInstrs.clear();
1587 bool foundSB = false;
1588 bool foundIGLP = false;
1589 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
1590 unsigned Opc = R->getInstr()->getOpcode();
1591 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1592 if (Opc == AMDGPU::SCHED_BARRIER) {
1593 addSchedBarrierEdges(*R);
1594 foundSB = true;
1595 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1596 initSchedGroupBarrierPipelineStage(R);
1597 foundSB = true;
1598 } else if (Opc == AMDGPU::IGLP_OPT) {
1599 resetEdges(*R, DAG);
1600 if (!foundSB && !foundIGLP)
1601 initIGLPOpt(*R);
1602 foundIGLP = true;
1603 }
1604 }
1605
1606 if (foundSB || foundIGLP) {
1607 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
1608 // PipelineSolver performs the mutation by adding the edges it
1609 // determined as the best
1610 PS.solve();
1611 }
1612}
1613
1614void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
1615 MachineInstr &MI = *SchedBarrier.getInstr();
1616 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
1617 // Remove all existing edges from the SCHED_BARRIER that were added due to the
1618 // instruction having side effects.
1619 resetEdges(SchedBarrier, DAG);
1620 auto InvertedMask =
1621 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1622 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
1623 SG.initSchedGroup();
1624 // Preserve original instruction ordering relative to the SCHED_BARRIER.
1625 SG.link(
1626 SchedBarrier,
1627 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1628 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
1629}
1630
1631SchedGroupMask
1632IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1633 // Invert mask and erase bits for types of instructions that are implied to be
1634 // allowed past the SCHED_BARRIER.
1635 SchedGroupMask InvertedMask = ~Mask;
1636
1637 // ALU implies VALU, SALU, MFMA.
1638 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1639 InvertedMask &=
1640 ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1641 // VALU, SALU, MFMA implies ALU.
1642 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1643 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1644 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1645 InvertedMask &= ~SchedGroupMask::ALU;
1646
1647 // VMEM implies VMEM_READ, VMEM_WRITE.
1648 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1649 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1650 // VMEM_READ, VMEM_WRITE implies VMEM.
1651 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1652 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1653 InvertedMask &= ~SchedGroupMask::VMEM;
1654
1655 // DS implies DS_READ, DS_WRITE.
1656 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1657 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1658 // DS_READ, DS_WRITE implies DS.
1659 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1660 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1661 InvertedMask &= ~SchedGroupMask::DS;
1662
1663 return InvertedMask;
1664}
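// Worked example: for Mask == SchedGroupMask::VALU (only VALU may cross the
// barrier), ~Mask leaves every bit except VALU set. The cleared VALU bit
// triggers the "VALU, SALU, MFMA implies ALU" case above, so the ALU bit is
// erased as well; otherwise the blocking group built from the inverted mask
// could still capture VALU instructions through their ALU classification.
// The result is every category except ALU and VALU.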
1665
1666void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1667 std::vector<SUnit>::reverse_iterator RIter) {
1668 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1669 // to the instruction having side effects.
1670 resetEdges(*RIter, DAG);
1671 MachineInstr &SGB = *RIter->getInstr();
1672 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1673 int32_t SGMask = SGB.getOperand(0).getImm();
1674 int32_t Size = SGB.getOperand(1).getImm();
1675 int32_t SyncID = SGB.getOperand(2).getImm();
1676
1677 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1678 Size, SyncID, DAG, TII);
1679
1680 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
1681}
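// For illustration, a pipeline of two groups sharing sync id 0 could be
// requested with consecutive intrinsic calls such as:
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)  ; 1 MFMA
//   call void @llvm.amdgcn.sched.group.barrier(i32 32, i32 2, i32 0) ; 2 VMEM reads
// Each call becomes one SchedGroup in SyncedSchedGroups[0].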
1682
1683void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1684 IGLPStrategyID StrategyID =
1685 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1686 auto S = createIGLPStrategy(StrategyID, DAG, TII);
1687 if (S->shouldApplyStrategy(DAG)) {
1688 IsBottomUp = S->IsBottomUp;
1689 S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1690 }
1691}
1692
1693} // namespace
1694
1695namespace llvm {
1696
1697std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1698 return std::make_unique<IGroupLPDAGMutation>();
1699}
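// Usage sketch (assuming the standard ScheduleDAGMI mutation hook): a target
// scheduler attaches this mutation like any other DAG mutation, e.g.
//   DAG->addMutation(createIGroupLPDAGMutation());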
1700
1701} // end namespace llvm