AMDGPUIGroupLP.cpp
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations, such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
20#include "SIInstrInfo.h"
23#include "llvm/ADT/DenseMap.h"
26
27#include <type_traits>
28
29using namespace llvm;
30
31#define DEBUG_TYPE "igrouplp"
32
33namespace {
34
35static cl::opt<bool> EnableExactSolver(
36 "amdgpu-igrouplp-exact-solver", cl::Hidden,
37 cl::desc("Whether to use the exponential time solver to fit "
38 "the instructions to the pipeline as closely as "
39 "possible."),
40 cl::init(false));
41
42static cl::opt<unsigned> CutoffForExact(
43 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
44 cl::desc("The maximum number of scheduling group conflicts "
45 "which we attempt to solve with the exponential time "
46 "exact solver. Problem sizes greater than this will"
47 "be solved by the less accurate greedy algorithm. Selecting "
48 "solver by size is superseded by manually selecting "
49 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
50
51static cl::opt<uint64_t> MaxBranchesExplored(
52 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
53 cl::desc("The amount of branches that we are willing to explore with"
54 "the exact algorithm before giving up."));
55
56static cl::opt<bool> UseCostHeur(
57 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
58 cl::desc("Whether to use the cost heuristic to make choices as we "
59 "traverse the search space using the exact solver. Defaulted "
60 "to on, and if turned off, we will use the node order -- "
61 "attempting to put the later nodes in the later sched groups. "
62 "Experimentally, results are mixed, so this should be set on a "
63 "case-by-case basis."));
64
65// Components of the mask that determines which instruction types may be
66// classified into a SchedGroup.
67enum class SchedGroupMask {
68 NONE = 0u,
69 ALU = 1u << 0,
70 VALU = 1u << 1,
71 SALU = 1u << 2,
72 MFMA = 1u << 3,
73 VMEM = 1u << 4,
74 VMEM_READ = 1u << 5,
75 VMEM_WRITE = 1u << 6,
76 DS = 1u << 7,
77 DS_READ = 1u << 8,
78 DS_WRITE = 1u << 9,
79 TRANS = 1u << 10,
80 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
81 DS_READ | DS_WRITE | TRANS,
82 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
83};
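// With the bitmask operators enabled above, masks compose in the usual way.
// For example (a sketch of in-file usage, not code from the mutation itself):
//   SchedGroupMask M = SchedGroupMask::DS_READ | SchedGroupMask::VMEM_READ;
//   bool TakesDSReads = (M & SchedGroupMask::DS_READ) != SchedGroupMask::NONE;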
84
85class SchedGroup;
86
87// InstructionRule class is used to enact a filter which determines whether or
88// not an SU maps to a given SchedGroup. It contains complementary data
89// structures (e.g. Cache) to help those filters.
90class InstructionRule {
91protected:
92 const SIInstrInfo *TII;
93 unsigned SGID;
94 // A cache made available to the Filter to store SUnits for subsequent
95 // invocations of the Filter
96 std::optional<SmallVector<SUnit *, 4>> Cache;
97
98public:
99 virtual bool
100 apply(const SUnit *, const ArrayRef<SUnit *>,
101 SmallVectorImpl<SchedGroup> &) {
102 return true;
103 };
104
105 InstructionRule(const SIInstrInfo *TII, unsigned SGID,
106 bool NeedsCache = false)
107 : TII(TII), SGID(SGID) {
108 if (NeedsCache) {
109 Cache = SmallVector<SUnit *, 4>();
110 }
111 }
112
113 virtual ~InstructionRule() = default;
114};
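// A minimal sketch of the shape a concrete rule takes (the name IsDSReadSketch
// is illustrative and is not used by any strategy in this file): admit an SU
// only when its instruction is a DS access that loads.
class IsDSReadSketch final : public InstructionRule {
public:
  bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
             SmallVectorImpl<SchedGroup> &SyncPipe) override {
    // Reject anything that is not a DS instruction or does not read memory.
    const MachineInstr *MI = SU->getInstr();
    return TII->isDS(MI->getOpcode()) && MI->mayLoad();
  }
  IsDSReadSketch(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
      : InstructionRule(TII, SGID, NeedsCache) {}
};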
115
116using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
117
118// Classify instructions into groups to enable fine tuned control over the
119// scheduler. These groups may be more specific than current SchedModel
120// instruction classes.
121class SchedGroup {
122private:
123 // Mask that defines which instruction types can be classified into this
124 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
125 // and SCHED_GROUP_BARRIER.
126 SchedGroupMask SGMask;
127
128 // Maximum number of SUnits that can be added to this group.
129 std::optional<unsigned> MaxSize;
130
131 // SchedGroups will only synchronize with other SchedGroups that have the same
132 // SyncID.
133 int SyncID = 0;
134
135 // SGID is used to map instructions to candidate SchedGroups
136 unsigned SGID;
137
138 // The different rules each instruction in this SchedGroup must conform to
139 SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
140
141 // Count of the number of created SchedGroups, used to initialize SGID.
142 static unsigned NumSchedGroups;
143
144 // Try to add an edge from SU A to SU B.
145 bool tryAddEdge(SUnit *A, SUnit *B);
146
147 // Use SGMask to determine whether we can classify MI as a member of this
148 // SchedGroup object.
149 bool canAddMI(const MachineInstr &MI) const;
150
151public:
152 // Collection of SUnits that are classified as members of this group.
153 SmallVector<SUnit *, 32> Collection;
154
155 ScheduleDAGInstrs *DAG;
156 const SIInstrInfo *TII;
157
158 // Returns true if SU can be added to this SchedGroup.
159 bool canAddSU(SUnit &SU) const;
160
161 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
162 // MakePred is true, SU will be a predecessor of the SUnits in this
163 // SchedGroup, otherwise SU will be a successor.
164 void link(SUnit &SU, bool MakePred = false);
165
166 // Add DAG dependencies and track which edges are added, and the count of
167 // missed edges
168 int link(SUnit &SU, bool MakePred,
169 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
170
171 // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
172 // Use the predicate to determine whether SU should be a predecessor (P =
173 // true) or a successor (P = false) of this SchedGroup.
174 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
175
176 // Add DAG dependencies such that SUnits in this group shall be ordered
177 // before SUnits in OtherGroup.
178 void link(SchedGroup &OtherGroup);
179
180 // Returns true if no more instructions may be added to this group.
181 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
182
183 // Append a constraint that SUs must meet in order to fit into this
184 // SchedGroup. Since many rules involve the relationship between a SchedGroup
185 // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
186 // time (rather than SchedGroup init time.)
187 void addRule(std::shared_ptr<InstructionRule> NewRule) {
188 Rules.push_back(NewRule);
189 }
190
191 // Returns true if the SU matches all rules
192 bool allowedByRules(const SUnit *SU,
193 SmallVectorImpl<SchedGroup> &SyncPipe) const {
194 for (auto &Rule : Rules) {
195 if (!Rule->apply(SU, Collection, SyncPipe))
196 return false;
197 }
198 return true;
199 }
200
201 // Add SU to the SchedGroup.
202 void add(SUnit &SU) {
203 LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
204 << format_hex((int)SGMask, 10, true) << " adding "
205 << *SU.getInstr());
206 Collection.push_back(&SU);
207 }
208
209 // Remove last element in the SchedGroup
210 void pop() { Collection.pop_back(); }
211
212 // Identify and add all relevant SUs from the DAG to this SchedGroup.
213 void initSchedGroup();
214
215 // Add instructions to the SchedGroup bottom up starting from RIter.
216 // PipelineInstrs is a set of instructions that should not be added to the
217 // SchedGroup even when the other conditions for adding it are satisfied.
218 // RIter will be added to the SchedGroup as well, and dependencies will be
219 // added so that RIter will always be scheduled at the end of the group.
220 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
221 SUnitsToCandidateSGsMap &SyncedInstrs);
222
223 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
224
225 int getSyncID() { return SyncID; }
226
227 int getSGID() { return SGID; }
228
229 SchedGroupMask getMask() { return SGMask; }
230
231 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
232 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
233 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
234 SGID = NumSchedGroups++;
235 }
236
237 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
238 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
239 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
240 SGID = NumSchedGroups++;
241 }
242};
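// For reference, the (Mask, Size, SyncID) triple held by a SchedGroup mirrors
// the operands of the SCHED_GROUP_BARRIER pseudo that the barrier-driven
// mutations parse. From kernel source such a pipeline is typically requested
// roughly as follows (a sketch; the builtin spelling is frontend-dependent):
//   // alternate one DS read and one MFMA, all in sync group 0
//   __builtin_amdgcn_sched_group_barrier(/*Mask=*/0x100, /*Size=*/1, /*SyncID=*/0);
//   __builtin_amdgcn_sched_group_barrier(/*Mask=*/0x008, /*Size=*/1, /*SyncID=*/0);
// where 0x100 and 0x008 are the DS_READ and MFMA bits of SchedGroupMask above.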
243
244using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
245using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
246
247// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
248// in non-trivial cases. For example, if the requested pipeline is
249// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
250// in the DAG, then we will have an instruction that cannot be trivially
251// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
252// to find a good solution to the pipeline -- a greedy algorithm and an exact
253// algorithm. The exact algorithm has an exponential time complexity and should
254// only be used for small problems or medium-sized problems where an exact
255// solution is highly desired.
256class PipelineSolver {
257 [[maybe_unused]] ScheduleDAGMI *DAG;
258
259 // Instructions that can be assigned to multiple SchedGroups
260 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
261 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
262 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
263 // The current working pipeline
264 SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
265 // The pipeline that has the best solution found so far
266 SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
267
268 // Whether or not we actually have any SyncedInstrs to try to solve.
269 bool NeedsSolver = false;
270
271 // Compute an estimate of the size of the search tree -- the true size is
272 // the product of each conflictedInst.Matches.size() across all SyncPipelines
273 unsigned computeProblemSize();
274
275 // The cost penalty of not assigning a SU to a SchedGroup
276 int MissPenalty = 0;
277
278 // Costs in terms of the number of edges we are unable to add
279 int BestCost = -1;
280 int CurrCost = 0;
281
282 // Index pointing to the conflicting instruction that is currently being
283 // fitted
284 int CurrConflInstNo = 0;
285 // Index to the pipeline that is currently being fitted
286 int CurrSyncGroupIdx = 0;
288 // The first non-trivial pipeline
288 int BeginSyncGroupIdx = 0;
289
290 // How many branches we have explored
291 uint64_t BranchesExplored = 0;
292
293 // The direction in which we process the candidate SchedGroups per SU
294 bool IsBottomUp = true;
295
296 // Update indices to fit next conflicting instruction
297 void advancePosition();
298 // Recede indices to attempt to find better fit for previous conflicting
299 // instruction
300 void retreatPosition();
301
302 // The exponential time algorithm which finds the provably best fit
303 bool solveExact();
304 // The polynomial time algorithm which attempts to find a good fit
305 bool solveGreedy();
306 // Find the best SchedGroup for the current SU using the heuristic given all
307 // current information. One step in the greedy algorithm. Templated against
308 // the SchedGroup iterator (either reverse or forward).
309 template <typename T>
310 void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
311 T E);
312 // Whether or not the current solution is optimal
313 bool checkOptimal();
314 // Populate the ready list, prioritizing fewest missed edges first
315 // Templated against the SchedGroup iterator (either reverse or forward).
316 template <typename T>
317 void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
318 T E);
319 // Add edges corresponding to the SchedGroups as assigned by solver
320 void makePipeline();
321 // Link the SchedGroups in the best found pipeline.
322 // Templated against the SchedGroup iterator (either reverse or forward).
323 template <typename T> void linkSchedGroups(T I, T E);
324 // Add the edges from the SU to the other SchedGroups in pipeline, and
325 // return the number of edges missed.
326 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
327 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
328 /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
329 /// returns the cost (in terms of missed pipeline edges), and tracks the edges
330 /// added in \p AddedEdges
331 template <typename T>
332 int linkSUnit(SUnit *SU, int SGID,
333 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
334 /// Remove the edges passed via \p AddedEdges
335 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
336 // Convert the passed in maps to arrays for bidirectional iterators
337 void convertSyncMapsToArrays();
338
339 void reset();
340
341public:
342 // Invoke the solver to map instructions to instruction groups. Heuristics and
343 // command-line options determine whether to use the exact or greedy algorithm.
344 void solve();
345
346 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
347 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
348 ScheduleDAGMI *DAG, bool IsBottomUp = true)
349 : DAG(DAG), SyncedInstrs(SyncedInstrs),
350 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
351
352 for (auto &PipelineInstrs : SyncedInstrs) {
353 if (PipelineInstrs.second.size() > 0) {
354 NeedsSolver = true;
355 break;
356 }
357 }
358
359 if (!NeedsSolver)
360 return;
361
362 convertSyncMapsToArrays();
363
364 CurrPipeline = BestPipeline;
365
366 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
367 PipelineInstrs[BeginSyncGroupIdx].size() == 0)
368 ++BeginSyncGroupIdx;
369
370 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
371 return;
372 }
373};
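// A small worked example of the problem the solver is given: for a requested
// pipeline {VMEM_READ, VALU, MFMA, VMEM_READ} in one sync group, every
// VMEM_READ SU arrives with two candidate SGIDs (the first and last group) and
// lands in PipelineInstrs as a conflict. solveGreedy() walks those SUs in node
// order and keeps, per SU, the assignment with the fewest missed edges;
// solveExact() tries every candidate per SU (pruning on BestCost), which is
// why its cost is exponential in the number of conflicted SUs.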
374
375void PipelineSolver::reset() {
376
377 for (auto &SyncPipeline : CurrPipeline) {
378 for (auto &SG : SyncPipeline) {
379 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
380 SG.Collection.clear();
381 auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
382 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
383 });
384 if (SchedBarr != TempCollection.end())
385 SG.Collection.push_back(*SchedBarr);
386 }
387 }
388
389 CurrSyncGroupIdx = BeginSyncGroupIdx;
390 CurrConflInstNo = 0;
391 CurrCost = 0;
392}
393
394void PipelineSolver::convertSyncMapsToArrays() {
395 for (auto &SyncPipe : SyncedSchedGroups) {
396 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
397 }
398
399 int PipelineIDx = SyncedInstrs.size() - 1;
400 PipelineInstrs.resize(SyncedInstrs.size());
401 for (auto &SyncInstrMap : SyncedInstrs) {
402 for (auto &SUsToCandSGs : SyncInstrMap.second) {
403 if (PipelineInstrs[PipelineIDx].size() == 0) {
404 PipelineInstrs[PipelineIDx].push_back(
405 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
406 continue;
407 }
408 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
409 // Insert them in sorted order -- this allows for good parsing order in
410 // the greedy algorithm
411 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
412 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
413 ++SortPosition;
414 PipelineInstrs[PipelineIDx].insert(
415 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
416 }
417 --PipelineIDx;
418 }
419}
420
421template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
422 for (; I != E; ++I) {
423 auto &GroupA = *I;
424 for (auto J = std::next(I); J != E; ++J) {
425 auto &GroupB = *J;
426 GroupA.link(GroupB);
427 }
428 }
429}
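// For example, given SchedGroups [SG0, SG1, SG2] this adds the ordering links
// SG0->SG1, SG0->SG2 and SG1->SG2, i.e. one link per pair of groups.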
430
431void PipelineSolver::makePipeline() {
432 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
433 for (auto &SyncPipeline : BestPipeline) {
434 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
435 for (auto &SG : SyncPipeline) {
436 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
437 << " has: \n");
438 SUnit *SGBarr = nullptr;
439 for (auto &SU : SG.Collection) {
440 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
441 SGBarr = SU;
442 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
443 }
444 // Command line requested IGroupLP doesn't have SGBarr
445 if (!SGBarr)
446 continue;
447 SG.link(*SGBarr, false);
448 }
449 }
450
451 for (auto &SyncPipeline : BestPipeline) {
452 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
453 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
454 }
455}
456
457template <typename T>
458int PipelineSolver::linkSUnit(
459 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
460 T I, T E) {
461 bool MakePred = false;
462 int AddedCost = 0;
463 for (; I < E; ++I) {
464 if (I->getSGID() == SGID) {
465 MakePred = true;
466 continue;
467 }
468 auto Group = *I;
469 AddedCost += Group.link(*SU, MakePred, AddedEdges);
470 assert(AddedCost >= 0);
471 }
472 return AddedCost;
473}
474
475int PipelineSolver::addEdges(
476 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
477 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
478
479 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
480 // instructions that are the ultimate successors in the resultant mutation.
481 // Therefore, in such a configuration, the SchedGroups occurring before the
482 // candidate SGID are successors of the candidate SchedGroup, thus the current
483 // SU should be linked as a predecessor to SUs in those SchedGroups. The
484 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
485 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
486 // IsBottomUp (in reverse).
487 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
488 SyncPipeline.rend())
489 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
490 SyncPipeline.end());
491}
492
493void PipelineSolver::removeEdges(
494 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
495 // Only remove the edges that we have added when testing
496 // the fit.
497 for (auto &PredSuccPair : EdgesToRemove) {
498 SUnit *Pred = PredSuccPair.first;
499 SUnit *Succ = PredSuccPair.second;
500
501 auto *Match = llvm::find_if(
502 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
503 if (Match != Succ->Preds.end()) {
504 assert(Match->isArtificial());
505 Succ->removePred(*Match);
506 }
507 }
508}
509
510void PipelineSolver::advancePosition() {
511 ++CurrConflInstNo;
512
513 if (static_cast<size_t>(CurrConflInstNo) >=
514 PipelineInstrs[CurrSyncGroupIdx].size()) {
515 CurrConflInstNo = 0;
516 ++CurrSyncGroupIdx;
517 // Advance to next non-trivial pipeline
518 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
519 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
520 ++CurrSyncGroupIdx;
521 }
522}
523
524void PipelineSolver::retreatPosition() {
525 assert(CurrConflInstNo >= 0);
526 assert(CurrSyncGroupIdx >= 0);
527
528 if (CurrConflInstNo > 0) {
529 --CurrConflInstNo;
530 return;
531 }
532
533 if (CurrConflInstNo == 0) {
534 // If we return to the starting position, we have explored
535 // the entire tree
536 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
537 return;
538
539 --CurrSyncGroupIdx;
540 // Go to previous non-trivial pipeline
541 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
542 --CurrSyncGroupIdx;
543
544 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
545 }
546}
547
548bool PipelineSolver::checkOptimal() {
549 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
550 if (BestCost == -1 || CurrCost < BestCost) {
551 BestPipeline = CurrPipeline;
552 BestCost = CurrCost;
553 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
554 }
555 assert(BestCost >= 0);
556 }
557
558 bool DoneExploring = false;
559 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
560 DoneExploring = true;
561
562 return (DoneExploring || BestCost == 0);
563}
564
565template <typename T>
566void PipelineSolver::populateReadyList(
567 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
568 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
569 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
570 assert(CurrSU.second.size() >= 1);
571
572 for (; I != E; ++I) {
573 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
574 int CandSGID = *I;
575 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
576 return SG.getSGID() == CandSGID;
577 });
578 assert(Match);
579
580 if (UseCostHeur) {
581 if (Match->isFull()) {
582 ReadyList.push_back(std::pair(*I, MissPenalty));
583 continue;
584 }
585
586 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
587 ReadyList.push_back(std::pair(*I, TempCost));
588 removeEdges(AddedEdges);
589 } else
590 ReadyList.push_back(std::pair(*I, -1));
591 }
592
593 if (UseCostHeur)
594 std::sort(ReadyList.begin(), ReadyList.end(), llvm::less_second());
595
596 assert(ReadyList.size() == CurrSU.second.size());
597}
598
599bool PipelineSolver::solveExact() {
600 if (checkOptimal())
601 return true;
602
603 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
604 return false;
605
606 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
607 assert(static_cast<size_t>(CurrConflInstNo) <
608 PipelineInstrs[CurrSyncGroupIdx].size());
609 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
610 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
611 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
612
613 // SchedGroup -> Cost pairs
614 SmallVector<std::pair<int, int>, 4> ReadyList;
615 // Prioritize the candidate sched groups in terms of lowest cost first
616 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
617 CurrSU.second.rend())
618 : populateReadyList(ReadyList, CurrSU.second.begin(),
619 CurrSU.second.end());
620
621 auto *I = ReadyList.begin();
622 auto *E = ReadyList.end();
623 for (; I != E; ++I) {
624 // If we are trying SGs in least cost order, and the current SG is cost
625 // infeasible, then all subsequent SGs will also be cost infeasible, so we
626 // can prune.
627 if (BestCost != -1 && (CurrCost + I->second > BestCost))
628 return false;
629
630 int CandSGID = I->first;
631 int AddedCost = 0;
632 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
633 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
634 SchedGroup *Match;
635 for (auto &SG : SyncPipeline) {
636 if (SG.getSGID() == CandSGID)
637 Match = &SG;
638 }
639
640 if (Match->isFull())
641 continue;
642
643 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
644 continue;
645
646 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
647 << (int)Match->getMask() << "and ID " << CandSGID
648 << "\n");
649 Match->add(*CurrSU.first);
650 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
651 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
652 CurrCost += AddedCost;
653 advancePosition();
654 ++BranchesExplored;
655 bool FinishedExploring = false;
656 // If the Cost after adding edges is greater than a known solution,
657 // backtrack
658 if (CurrCost < BestCost || BestCost == -1) {
659 if (solveExact()) {
660 FinishedExploring = BestCost != 0;
661 if (!FinishedExploring)
662 return true;
663 }
664 }
665
666 retreatPosition();
667 CurrCost -= AddedCost;
668 removeEdges(AddedEdges);
669 Match->pop();
670 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
671 if (FinishedExploring)
672 return true;
673 }
674
675 // Try the pipeline where the current instruction is omitted
676 // Potentially if we omit a problematic instruction from the pipeline,
677 // all the other instructions can nicely fit.
678 CurrCost += MissPenalty;
679 advancePosition();
680
681 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
682
683 bool FinishedExploring = false;
684 if (CurrCost < BestCost || BestCost == -1) {
685 if (solveExact()) {
686 bool FinishedExploring = BestCost != 0;
687 if (!FinishedExploring)
688 return true;
689 }
690 }
691
692 retreatPosition();
693 CurrCost -= MissPenalty;
694 return FinishedExploring;
695}
696
697template <typename T>
698void PipelineSolver::greedyFind(
699 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
700 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
701 int BestNodeCost = -1;
702 int TempCost;
703 SchedGroup *BestGroup = nullptr;
704 int BestGroupID = -1;
705 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
706 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
707 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
708
709 // Since we have added the potential SchedGroups from bottom up, but
710 // traversed the DAG from top down, parse over the groups from last to
711 // first. If we fail to do this for the greedy algorithm, the solution will
712 // likely not be good in more complex cases.
713 for (; I != E; ++I) {
714 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
715 int CandSGID = *I;
716 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
717 return SG.getSGID() == CandSGID;
718 });
719 assert(Match);
720
721 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
722 << (int)Match->getMask() << "\n");
723
724 if (Match->isFull()) {
725 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
726 continue;
727 }
728 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
729 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
730 continue;
731 }
732 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
733 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
734 if (TempCost < BestNodeCost || BestNodeCost == -1) {
735 BestGroup = Match;
736 BestNodeCost = TempCost;
737 BestGroupID = CandSGID;
738 }
739 removeEdges(AddedEdges);
740 if (BestNodeCost == 0)
741 break;
742 }
743
744 if (BestGroupID != -1) {
745 BestGroup->add(*CurrSU.first);
746 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
747 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
748 << (int)BestGroup->getMask() << "\n");
749 BestCost += TempCost;
750 } else
751 BestCost += MissPenalty;
752
753 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
754}
755
756bool PipelineSolver::solveGreedy() {
757 BestCost = 0;
758 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
759
760 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
761 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
762 IsBottomUp
763 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
764 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
765 advancePosition();
766 }
767 BestPipeline = CurrPipeline;
768 removeEdges(AddedEdges);
769 return false;
770}
771
772unsigned PipelineSolver::computeProblemSize() {
773 unsigned ProblemSize = 0;
774 for (auto &PipeConflicts : PipelineInstrs) {
775 ProblemSize += PipeConflicts.size();
776 }
777
778 return ProblemSize;
779}
780
781void PipelineSolver::solve() {
782 if (!NeedsSolver)
783 return;
784
785 unsigned ProblemSize = computeProblemSize();
786 assert(ProblemSize > 0);
787
788 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
789 MissPenalty = (ProblemSize / 2) + 1;
790
791 LLVM_DEBUG(DAG->dump());
792 if (EnableExactSolver || BelowCutoff) {
793 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
794 solveGreedy();
795 reset();
796 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
797 if (BestCost > 0) {
798 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
799 solveExact();
800 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
801 }
802 } else { // Use the Greedy Algorithm by default
803 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
804 solveGreedy();
805 }
806
807 makePipeline();
808 LLVM_DEBUG(dbgs() << "After applying mutation\n");
809 LLVM_DEBUG(DAG->dump());
810}
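// Worked example of the knobs above: with 7 conflicted instructions in total,
// ProblemSize = 7 and MissPenalty = 7 / 2 + 1 = 4, so leaving one SU out of
// the pipeline costs as much as missing four pipeline edges. If the cutoff
// flag were set to, say, 10 (a hypothetical value), BelowCutoff would be true
// and the exact solver would run whenever the greedy pass leaves a nonzero
// cost.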
811
812enum IGLPStrategyID : int {
813 MFMASmallGemmOptID = 0,
814 MFMASmallGemmSingleWaveOptID = 1,
815 MFMAExpInterleaveID = 2,
816 MFMAExpSimpleInterleaveID = 3
817};
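// These IDs are the immediate operand of the IGLP_OPT pseudo instruction; from
// kernel source they are typically requested with something like the following
// (a sketch; the builtin spelling is frontend-dependent):
//   __builtin_amdgcn_iglp_opt(/*StrategyID=*/2); // request MFMAExpInterleave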
818
819// Implement an IGLP scheduling strategy.
820class IGLPStrategy {
821protected:
822 ScheduleDAGInstrs *DAG;
823
824 const SIInstrInfo *TII;
825
826public:
827 /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
828 virtual bool applyIGLPStrategy(
829 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
830 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
831 AMDGPU::SchedulingPhase Phase) = 0;
832
833 // Returns true if this strategy should be applied to a ScheduleDAG.
834 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
835 AMDGPU::SchedulingPhase Phase) = 0;
836
837 bool IsBottomUp = true;
838
839 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
840 : DAG(DAG), TII(TII) {}
841
842 virtual ~IGLPStrategy() = default;
843};
844
845class MFMASmallGemmOpt final : public IGLPStrategy {
846private:
847public:
848 bool applyIGLPStrategy(
849 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
850 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
851 AMDGPU::SchedulingPhase Phase) override;
852
853 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
854 AMDGPU::SchedulingPhase Phase) override {
855 return true;
856 }
857
858 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
859 : IGLPStrategy(DAG, TII) {
860 IsBottomUp = true;
861 }
862};
863
864bool MFMASmallGemmOpt::applyIGLPStrategy(
865 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
866 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
867 AMDGPU::SchedulingPhase Phase) {
868 // Count the number of MFMA instructions.
869 unsigned MFMACount = 0;
870 for (const MachineInstr &I : *DAG)
871 if (TII->isMFMAorWMMA(I))
872 ++MFMACount;
873
874 const unsigned PipelineSyncID = 0;
875 SchedGroup *SG = nullptr;
876 for (unsigned I = 0; I < MFMACount * 3; ++I) {
877 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
878 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
879 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
880
881 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
882 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
883 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
884 }
885
886 return true;
887}
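// The net effect of the strategy above is a repeating { DS x2, MFMA x1 }
// pattern in sync group 0, emitted 3x per MFMA in the region. Expressed as the
// equivalent source-level barriers it would look roughly like this sketch
// (NumMFMA and the builtin spelling are illustrative):
//   for (int I = 0; I < 3 * NumMFMA; ++I) {
//     __builtin_amdgcn_sched_group_barrier(0x080, 2, 0); // DS,   Size 2
//     __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA, Size 1
//   }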
888
889class MFMAExpInterleaveOpt final : public IGLPStrategy {
890private:
891 // The count of TRANS SUs involved in the interleaved pipeline
892 static unsigned TransPipeCount;
893 // The count of MFMA SUs involved in the interleaved pipeline
894 static unsigned MFMAPipeCount;
895 // The count of Add SUs involved in the interleaved pipeline
896 static unsigned AddPipeCount;
897 // The number of transitive MFMA successors for each TRANS SU
898 static unsigned MFMAEnablement;
899 // The number of transitive TRANS predecessors for each MFMA SU
900 static unsigned ExpRequirement;
901 // The count of independent "chains" of MFMA instructions in the pipeline
902 static unsigned MFMAChains;
903 // The length of each independent "chain" of MFMA instructions
904 static unsigned MFMAChainLength;
905 // Whether or not the pipeline has V_CVT instructions
906 static bool HasCvt;
907 // Whether or not there are instructions between the TRANS instruction and
908 // V_CVT
909 static bool HasChainBetweenCvt;
910 // The first occurring DS_READ which feeds an MFMA chain
911 static std::optional<unsigned> FirstPipeDSR;
912 // The MFMAPipe SUs with no MFMA predecessors
913 SmallVector<SUnit *, 4> MFMAChainSeeds;
914 // Compute the heuristics for the pipeline, returning whether or not the DAG
915 // is well formatted for the mutation
916 bool analyzeDAG(const SIInstrInfo *TII);
917
918 /// Whether or not the instruction is a transitive predecessor of an MFMA
919 /// instruction
920 class IsPipeExp final : public InstructionRule {
921 public:
922 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
923 SmallVectorImpl<SchedGroup> &SyncPipe) override {
924
925 auto *DAG = SyncPipe[0].DAG;
926
927 if (Cache->empty()) {
928 auto I = DAG->SUnits.rbegin();
929 auto E = DAG->SUnits.rend();
930 for (; I != E; I++) {
931 if (TII->isMFMAorWMMA(*I->getInstr()))
932 Cache->push_back(&*I);
933 }
934 if (Cache->empty())
935 return false;
936 }
937
938 auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
939 return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
940 });
941
942 return Reaches;
943 }
944 IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
945 : InstructionRule(TII, SGID, NeedsCache) {}
946 };
947
948 /// Whether or not the instruction is a transitive predecessor of the
949 /// \p Number th MFMA of the MFMAs occurring after a TRANS instruction
950 class EnablesNthMFMA final : public InstructionRule {
951 private:
952 unsigned Number = 1;
953
954 public:
955 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
956 SmallVectorImpl<SchedGroup> &SyncPipe) override {
957 bool FoundTrans = false;
958 unsigned Counter = 1;
959 auto *DAG = SyncPipe[0].DAG;
960
961 if (Cache->empty()) {
962 auto I = DAG->SUnits.begin();
963 auto E = DAG->SUnits.end();
964 for (; I != E; I++) {
965 if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
966 if (Counter == Number) {
967 Cache->push_back(&*I);
968 break;
969 }
970 ++Counter;
971 }
972 if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
973 FoundTrans = true;
974 }
975 if (Cache->empty())
976 return false;
977 }
978
979 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
980 }
981
982 EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
983 bool NeedsCache = false)
984 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
985 };
986
987 /// Whether or not the instruction enables the exact MFMA that is the \p
988 /// Number th MFMA in the chain starting with \p ChainSeed
989 class EnablesNthMFMAInChain final : public InstructionRule {
990 private:
991 unsigned Number = 1;
992 SUnit *ChainSeed;
993
994 public:
995 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
996 SmallVectorImpl<SchedGroup> &SyncPipe) override {
997 auto *DAG = SyncPipe[0].DAG;
998
999 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1000 return false;
1001
1002 if (Cache->empty()) {
1003 auto *TempSU = ChainSeed;
1004 auto Depth = Number;
1005 while (Depth > 0) {
1006 --Depth;
1007 bool Found = false;
1008 for (auto &Succ : TempSU->Succs) {
1009 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1010 TempSU = Succ.getSUnit();
1011 Found = true;
1012 break;
1013 }
1014 }
1015 if (!Found)
1016 return false;
1017 }
1018
1019 Cache->push_back(TempSU);
1020 }
1021 // If we failed to find the instruction to be placed into the cache, we
1022 // would have already exited.
1023 assert(!Cache->empty());
1024
1025 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1026 }
1027
1028 EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
1029 const SIInstrInfo *TII, unsigned SGID,
1030 bool NeedsCache = false)
1031 : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1032 ChainSeed(ChainSeed) {}
1033 };
1034
1035 /// Whether or not the instruction has less than \p Size immediate successors.
1036 /// If \p HasIntermediary is true, this tests also whether all successors of
1037 /// the SUnit have less than \p Size successors.
1038 class LessThanNSuccs final : public InstructionRule {
1039 private:
1040 unsigned Size = 1;
1041 bool HasIntermediary = false;
1042
1043 public:
1044 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1045 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1046 if (!SyncPipe.size())
1047 return false;
1048
1049 unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
1050 return Succ.getKind() == SDep::Data;
1051 });
1052 if (SuccSize >= Size)
1053 return false;
1054
1055 if (HasIntermediary) {
1056 for (auto Succ : SU->Succs) {
1057 unsigned SuccSize =
1058 llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
1059 return SuccSucc.getKind() == SDep::Data;
1060 });
1061 if (SuccSize >= Size)
1062 return false;
1063 }
1064 }
1065
1066 return true;
1067 }
1068 LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
1069 bool HasIntermediary = false, bool NeedsCache = false)
1070 : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1071 HasIntermediary(HasIntermediary) {}
1072 };
1073
1074 /// Whether or not the instruction has greater than or equal to \p Size
1075 /// immediate successors. If \p HasIntermediary is true, this also holds if
1076 /// any successor of the SUnit has greater than or equal to \p Size
1077 /// successors.
1078 class GreaterThanOrEqualToNSuccs final : public InstructionRule {
1079 private:
1080 unsigned Size = 1;
1081 bool HasIntermediary = false;
1082
1083 public:
1084 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1085 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1086 if (!SyncPipe.size())
1087 return false;
1088
1089 unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
1090 return Succ.getKind() == SDep::Data;
1091 });
1092 if (SuccSize >= Size)
1093 return true;
1094
1095 if (HasIntermediary) {
1096 for (auto Succ : SU->Succs) {
1097 unsigned SuccSize =
1098 llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
1099 return SuccSucc.getKind() == SDep::Data;
1100 });
1101 if (SuccSize >= Size)
1102 return true;
1103 }
1104 }
1105
1106 return false;
1107 }
1108 GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
1109 unsigned SGID, bool HasIntermediary = false,
1110 bool NeedsCache = false)
1111 : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1112 HasIntermediary(HasIntermediary) {}
1113 };
1114
1115 // Whether or not the instruction is a relevant V_CVT instruction.
1116 class IsCvt final : public InstructionRule {
1117 public:
1118 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1119 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1120 auto Opc = SU->getInstr()->getOpcode();
1121 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1122 Opc == AMDGPU::V_CVT_I32_F32_e32;
1123 }
1124 IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1125 : InstructionRule(TII, SGID, NeedsCache) {}
1126 };
1127
1128 // Whether or not the instruction is FMA_F32.
1129 class IsFMA final : public InstructionRule {
1130 public:
1131 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1132 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1133 return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
1134 SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
1135 }
1136 IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1137 : InstructionRule(TII, SGID, NeedsCache) {}
1138 };
1139
1140 // Whether or not the instruction is a V_ADD_F32 instruction.
1141 class IsPipeAdd final : public InstructionRule {
1142 public:
1143 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1144 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1145 return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
1146 }
1147 IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1148 : InstructionRule(TII, SGID, NeedsCache) {}
1149 };
1150
1151 /// Whether or not the instruction is an immediate RAW successor
1152 /// of the SchedGroup \p Distance steps before.
1153 class IsSuccOfPrevNthGroup final : public InstructionRule {
1154 private:
1155 unsigned Distance = 1;
1156
1157 public:
1158 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1159 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1160 SchedGroup *OtherGroup = nullptr;
1161 if (!SyncPipe.size())
1162 return false;
1163
1164 for (auto &PipeSG : SyncPipe) {
1165 if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1166 OtherGroup = &PipeSG;
1167 }
1168
1169 if (!OtherGroup)
1170 return false;
1171 if (!OtherGroup->Collection.size())
1172 return true;
1173
1174 for (auto &OtherEle : OtherGroup->Collection) {
1175 for (auto &Succ : OtherEle->Succs) {
1176 if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
1177 return true;
1178 }
1179 }
1180
1181 return false;
1182 }
1183 IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1184 unsigned SGID, bool NeedsCache = false)
1185 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1186 };
1187
1188 /// Whether or not the instruction is a transitive successor of any
1189 /// instruction in the SchedGroup \p Distance steps before.
1190 class IsReachableFromPrevNthGroup final : public InstructionRule {
1191 private:
1192 unsigned Distance = 1;
1193
1194 public:
1195 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1196 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1197 SchedGroup *OtherGroup = nullptr;
1198 if (!SyncPipe.size())
1199 return false;
1200
1201 for (auto &PipeSG : SyncPipe) {
1202 if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1203 OtherGroup = &PipeSG;
1204 }
1205
1206 if (!OtherGroup)
1207 return false;
1208 if (!OtherGroup->Collection.size())
1209 return true;
1210
1211 auto *DAG = SyncPipe[0].DAG;
1212
1213 for (auto &OtherEle : OtherGroup->Collection)
1214 if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
1215 return true;
1216
1217 return false;
1218 }
1219 IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1220 unsigned SGID, bool NeedsCache = false)
1221 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1222 };
1223
1224 /// Whether or not the instruction occurs at or after the SU with NodeNum \p Number
1225 class OccursAtOrAfterNode final : public InstructionRule {
1226 private:
1227 unsigned Number = 1;
1228
1229 public:
1230 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1231 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1232
1233 return SU->NodeNum >= Number;
1234 }
1235 OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1236 bool NeedsCache = false)
1237 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1238 };
1239
1240 /// Whether or not the SU is exactly the \p Number th MFMA in the chain
1241 /// starting with \p ChainSeed
1242 class IsExactMFMA final : public InstructionRule {
1243 private:
1244 unsigned Number = 1;
1245 SUnit *ChainSeed;
1246
1247 public:
1248 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1249 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1250 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1251 return false;
1252
1253 if (Cache->empty()) {
1254 auto *TempSU = ChainSeed;
1255 auto Depth = Number;
1256 while (Depth > 0) {
1257 --Depth;
1258 bool Found = false;
1259 for (auto &Succ : TempSU->Succs) {
1260 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1261 TempSU = Succ.getSUnit();
1262 Found = true;
1263 break;
1264 }
1265 }
1266 if (!Found) {
1267 return false;
1268 }
1269 }
1270 Cache->push_back(TempSU);
1271 }
1272 // If we failed to find the instruction to be placed into the cache, we
1273 // would have already exited.
1274 assert(!Cache->empty());
1275
1276 return (*Cache)[0] == SU;
1277 }
1278
1279 IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
1280 unsigned SGID, bool NeedsCache = false)
1281 : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1282 ChainSeed(ChainSeed) {}
1283 };
1284
1285 // Whether the instruction occurs after the first TRANS instruction. This
1286 // implies the instruction cannot be a predecessor of the first TRANS
1287 // instruction
1288 class OccursAfterExp final : public InstructionRule {
1289 public:
1290 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1291 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1292
1293 auto *DAG = SyncPipe[0].DAG;
1294 if (Cache->empty()) {
1295 for (auto &SU : DAG->SUnits)
1296 if (TII->isTRANS(SU.getInstr()->getOpcode())) {
1297 Cache->push_back(&SU);
1298 break;
1299 }
1300 if (Cache->empty())
1301 return false;
1302 }
1303
1304 return SU->NodeNum > (*Cache)[0]->NodeNum;
1305 }
1306
1307 OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
1308 bool NeedsCache = false)
1309 : InstructionRule(TII, SGID, NeedsCache) {}
1310 };
1311
1312public:
1313 bool applyIGLPStrategy(
1314 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1315 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1316 AMDGPU::SchedulingPhase Phase) override;
1317
1318 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1319 AMDGPU::SchedulingPhase Phase) override;
1320
1321 MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1322 : IGLPStrategy(DAG, TII) {
1323 IsBottomUp = false;
1324 }
1325};
1326
1327unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
1328unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
1329unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
1330unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
1331unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
1332unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
1333unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
1334bool MFMAExpInterleaveOpt::HasCvt = false;
1335bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
1336std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1337
1338bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
1339 SmallVector<SUnit *, 10> ExpPipeCands;
1340 SmallVector<SUnit *, 10> MFMAPipeCands;
1341 SmallVector<SUnit *, 10> MFMAPipeSUs;
1342 SmallVector<SUnit *, 10> PackSUs;
1343 SmallVector<SUnit *, 10> CvtSUs;
1344
1345 auto isBitPack = [](unsigned Opc) {
1346 return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
1347 };
1348
1349 auto isCvt = [](unsigned Opc) {
1350 return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
1351 };
1352
1353 auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
1354
1355 AddPipeCount = 0;
1356 for (SUnit &SU : DAG->SUnits) {
1357 auto Opc = SU.getInstr()->getOpcode();
1358 if (TII->isTRANS(Opc)) {
1359 // Avoid counting a potential bonus V_EXP which all the MFMA depend on
1360 if (SU.Succs.size() >= 7)
1361 continue;
1362 for (auto &Succ : SU.Succs) {
1363 if (Succ.getSUnit()->Succs.size() >= 7)
1364 continue;
1365 }
1366 ExpPipeCands.push_back(&SU);
1367 }
1368
1369 if (TII->isMFMAorWMMA(*SU.getInstr()))
1370 MFMAPipeCands.push_back(&SU);
1371
1372 if (isBitPack(Opc))
1373 PackSUs.push_back(&SU);
1374
1375 if (isCvt(Opc))
1376 CvtSUs.push_back(&SU);
1377
1378 if (isAdd(Opc))
1379 ++AddPipeCount;
1380 }
1381
1382 if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
1383 return false;
1384
1385 TransPipeCount = 0;
1386
1387 std::optional<SUnit *> TempMFMA;
1388 std::optional<SUnit *> TempExp;
1389 // Count the number of EXPs that reach an MFMA
1390 for (auto &PredSU : ExpPipeCands) {
1391 for (auto &SuccSU : MFMAPipeCands) {
1392 if (DAG->IsReachable(SuccSU, PredSU)) {
1393 if (!TempExp) {
1394 TempExp = PredSU;
1395 TempMFMA = SuccSU;
1396 }
1397 MFMAPipeSUs.push_back(SuccSU);
1398 ++TransPipeCount;
1399 break;
1400 }
1401 }
1402 }
1403
1404 if (!(TempExp && TempMFMA))
1405 return false;
1406
1407 HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
1408 return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1409 });
1410
1411 // Count the number of MFMAs that are reached by an EXP
1412 for (auto &SuccSU : MFMAPipeCands) {
1413 if (MFMAPipeSUs.size() &&
1414 any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
1415 return PotentialMatch->NodeNum == SuccSU->NodeNum;
1416 }))
1417 continue;
1418
1419 for (auto &PredSU : ExpPipeCands) {
1420 if (DAG->IsReachable(SuccSU, PredSU)) {
1421 MFMAPipeSUs.push_back(SuccSU);
1422 break;
1423 }
1424 }
1425 }
1426
1427 MFMAPipeCount = MFMAPipeSUs.size();
1428
1429 assert(TempExp && TempMFMA);
1430 assert(MFMAPipeCount > 0);
1431
1432 std::optional<SUnit *> TempCvt;
1433 for (auto &SuccSU : CvtSUs) {
1434 if (DAG->IsReachable(SuccSU, *TempExp)) {
1435 TempCvt = SuccSU;
1436 break;
1437 }
1438 }
1439
1440 HasCvt = false;
1441 if (TempCvt.has_value()) {
1442 for (auto &SuccSU : MFMAPipeSUs) {
1443 if (DAG->IsReachable(SuccSU, *TempCvt)) {
1444 HasCvt = true;
1445 break;
1446 }
1447 }
1448 }
1449
1450 MFMAChains = 0;
1451 for (auto &MFMAPipeSU : MFMAPipeSUs) {
1452 if (is_contained(MFMAChainSeeds, MFMAPipeSU))
1453 continue;
1454 if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
1455 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1456 })) {
1457 MFMAChainSeeds.push_back(MFMAPipeSU);
1458 ++MFMAChains;
1459 }
1460 }
1461
1462 if (!MFMAChains)
1463 return false;
1464
1465 for (auto Pred : MFMAChainSeeds[0]->Preds) {
1466 if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1467 Pred.getSUnit()->getInstr()->mayLoad())
1468 FirstPipeDSR = Pred.getSUnit()->NodeNum;
1469 }
1470
1471 MFMAChainLength = MFMAPipeCount / MFMAChains;
1472
1473 // The number of bit pack operations that depend on a single V_EXP
1474 unsigned PackSuccCount =
1475 llvm::count_if(PackSUs, [this, &TempExp](SUnit *VPack) {
1476 return DAG->IsReachable(VPack, *TempExp);
1477 });
1478
1479 // The number of bit pack operations an MFMA depends on
1480 unsigned PackPredCount =
1481 llvm::count_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
1482 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1483 return isBitPack(Opc);
1484 });
1485
1486 auto *PackPred = llvm::find_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
1487 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1488 return isBitPack(Opc);
1489 });
1490
1491 if (PackPred == (*TempMFMA)->Preds.end())
1492 return false;
1493
1494 MFMAEnablement = 0;
1495 ExpRequirement = 0;
1496 // How many MFMAs depend on a single bit pack operation
1497 MFMAEnablement =
1498 llvm::count_if(PackPred->getSUnit()->Succs, [&TII](SDep &Succ) {
1499 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1500 });
1501
1502 // The number of MFMAs that depend on a single V_EXP
1503 MFMAEnablement *= PackSuccCount;
1504
1505 // The number of V_EXPs required to resolve all dependencies for an MFMA
1506 ExpRequirement =
1507 llvm::count_if(ExpPipeCands, [this, &PackPred](SUnit *ExpBase) {
1508 return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1509 });
1510
1511 ExpRequirement *= PackPredCount;
1512 return true;
1513}
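// Worked example of the two derived quantities: if each bit-pack feeds two
// MFMAs and each V_EXP feeds one bit-pack, MFMAEnablement = 2 * 1 = 2; if each
// MFMA consumes two bit-packs and each of those packs needs two V_EXPs,
// ExpRequirement = 2 * 2 = 4. Together with TransPipeCount == 32 these are
// exactly the values the "small kernel type" check in applyIGLPStrategy below
// looks for.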
1514
1515bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1516 AMDGPU::SchedulingPhase Phase) {
1517 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1518 const SIInstrInfo *TII = ST.getInstrInfo();
1519
1521 MFMAChainSeeds.clear();
1522 if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
1523 return false;
1524
1525 return true;
1526}
1527
1528bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1529 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1530 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1531 AMDGPU::SchedulingPhase Phase) {
1532
1533 bool IsSmallKernelType =
1534 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1535 bool IsLargeKernelType =
1536 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1537
1538 if (!(IsSmallKernelType || IsLargeKernelType))
1539 return false;
1540
1541 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1542 const SIInstrInfo *TII = ST.getInstrInfo();
1543
1544 unsigned PipelineSyncID = 0;
1545 SchedGroup *SG = nullptr;
1546
1547 unsigned MFMAChain = 0;
1548 unsigned PositionInChain = 0;
1549 unsigned CurrMFMAForTransPosition = 0;
1550
1551 auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1552 &CurrMFMAForTransPosition]() {
1553 CurrMFMAForTransPosition += MFMAEnablement;
1554 PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1555 MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1556 };
1557
1558 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1559 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1560 return (TempMFMAForTrans / MFMAChains);
1561 };
1562
1563 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1564 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1565 return TempMFMAForTrans % MFMAChains;
1566 };
1567
1568 unsigned CurrMFMAPosition = 0;
1569 unsigned MFMAChainForMFMA = 0;
1570 unsigned PositionInChainForMFMA = 0;
1571
1572 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1573 &PositionInChainForMFMA]() {
1574 ++CurrMFMAPosition;
1575 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1576 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1577 };
1578
1579 bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
1580 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1581
1582 bool UsesFMA = IsSmallKernelType || !IsPostRA;
1583 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1584 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1585 bool UsesVALU = IsSmallKernelType;
1586
1587 // PHASE 1: "Prefetch"
1588 if (UsesFMA) {
1589 // First Round FMA
1590 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1591 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1592 if (!IsPostRA && MFMAChains) {
1593 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1594 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1595 true));
1596 } else
1597 SG->addRule(
1598 std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1599 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1600 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1601
1602 // Second Round FMA
1603 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1604 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1605 if (!IsPostRA && MFMAChains) {
1606 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1607 getNextTransPositionInChain(),
1608 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1609 } else
1610 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1611 SG->getSGID(), true));
1612 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1613 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1614 }
1615
1616 if (UsesDSRead) {
1617 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1618 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1619 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1620 SG->getSGID()));
1621 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1622 }
1623
1624 // First Round EXP
1625 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1626 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
1627 if (!IsPostRA && MFMAChains)
1628 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1629 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
1630 else
1631 SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1632 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1633 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1634 HasChainBetweenCvt));
1635 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1636
1637 incrementTransPosition();
1638
1639 // First Round CVT, Third Round FMA, Second Round EXP; interleaved
1640 for (unsigned I = 0; I < ExpRequirement; I++) {
1641 // First Round CVT
1642 if (UsesCvt) {
1643 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1644 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1645 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1646 if (HasChainBetweenCvt)
1647 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1648 1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1649 else
1650 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1651 1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1652 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1653 }
1654
1655 // Third Round FMA
1656 if (UsesFMA) {
1657 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1658 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1659 if (!IsPostRA && MFMAChains) {
1660 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1661 getNextTransPositionInChain(),
1662 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1663 } else
1664 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1665 TII, SG->getSGID(), true));
1666 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1667 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1668 }
1669
1670 // Second Round EXP
1671 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1672 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1673 if (!IsPostRA && MFMAChains)
1674 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1675 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1676 true));
1677 else
1678 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1679 SG->getSGID(), true));
1680 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1681 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1682 HasChainBetweenCvt));
1683 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1684 }
1685
1686 // The "extra" EXP which enables all MFMA
1687 // TODO: UsesExtraExp
1688 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1689 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1690 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1691 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1692 8, TII, SG->getSGID(), HasChainBetweenCvt));
1693 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1694
1695 // PHASE 2: Main Interleave Loop
1696
1697 // The number of MFMAs per iteration
1698 unsigned MFMARatio =
1699 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1700 // The number of Exps per iteration
1701 unsigned ExpRatio =
1702 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1703 // The remaining Exps
1704 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1705 ? TransPipeCount - (2 * ExpRequirement)
1706 : 0;
1707 unsigned ExpLoopCount = RemainingExp / ExpRatio;
1708 // In-loop MFMAs
1709 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1710 ? MFMAPipeCount - (MFMAEnablement * 2)
1711 : 0;
1712 unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1713 unsigned VALUOps =
1714 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1715 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
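// Illustrative example (hypothetical values): with MFMAEnablement = 4 and
// ExpRequirement = 2, MFMARatio = 2 and ExpRatio = 1; with TransPipeCount = 10
// and MFMAPipeCount = 20, RemainingExp = 6, MFMAInLoop = 12, and
// LoopSize = min(6 / 1, 12 / 2) = 6 interleaved iterations.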
1716
1717 for (unsigned I = 0; I < LoopSize; I++) {
1718 if (!(I * ExpRatio % ExpRequirement))
1719 incrementTransPosition();
1720
1721 // Round N MFMA
1722 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1723 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
1724 if (!IsPostRA && MFMAChains)
1725 SG->addRule(std::make_shared<IsExactMFMA>(
1726 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
1727 SG->getSGID(), true));
1728 else
1729 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1730 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1731 incrementMFMAPosition();
1732
1733 if (UsesVALU) {
1734 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1735 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
1736 SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
1737 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1738 }
1739
1740 if (UsesDSRead && !(I % 4)) {
1741 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1742 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1743 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1744 SG->getSGID()));
1745 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1746 }
1747
1748 // CVT, EXP, FMA Interleaving
1749 for (unsigned J = 0; J < ExpRatio; J++) {
1750 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
1751 auto MaxMFMAOffset =
1752 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1753
1754 // Round N + 1 CVT
1755 if (UsesCvt) {
1756 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1757 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1758 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1759 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1760 auto DSROffset = I / 4 + 1;
1761 auto MaxDSROffset = MaxMFMAOffset / 4;
1762 // TODO: UsesExtraExp
1763 auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1764 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1765 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1766 ExpOffset;
1767 if (HasChainBetweenCvt)
1768 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1769 CurrentOffset, TII, SG->getSGID()));
1770 else
1771 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
1772 SG->getSGID()));
1773 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1774 }
1775
1776 // Round N + 3 FMA
1777 if (UsesFMA) {
1778 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1779 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1780 if (!IsPostRA && MFMAChains)
1781 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1782 getNextTransPositionInChain(),
1783 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
1784 true));
1785 else
1786 SG->addRule(std::make_shared<EnablesNthMFMA>(
1787 (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1788 TII, SG->getSGID(), true));
1789 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1790 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1791 }
1792
1793 // Round N + 2 Exp
1794 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1795 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1796 if (!IsPostRA && MFMAChains)
1797 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1798 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1799 true));
1800 else
1801 SG->addRule(std::make_shared<EnablesNthMFMA>(
1802 (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1803 TII, SG->getSGID(), true));
1804 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1805 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1806 HasChainBetweenCvt));
1807 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1808 }
1809 }
1810
1811 // PHASE 3: Remaining MFMAs
1812 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1813 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
1814 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1815 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1816 return true;
1817}
1818
1819class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
1820public:
1821 bool applyIGLPStrategy(
1822 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1823 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1824 AMDGPU::SchedulingPhase Phase) override;
1825
1826 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1827 AMDGPU::SchedulingPhase Phase) override {
1828 return true;
1829 }
1830
1831 MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1832 : IGLPStrategy(DAG, TII) {
1833 IsBottomUp = true;
1834 }
1835};
1836
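// The simple interleave strategy below just emits alternating single-slot
// TRANS and MFMA groups, three TRANS/MFMA pairs per MFMA in the region.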
1837bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
1838 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
1839 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1840 AMDGPU::SchedulingPhase Phase) {
1841 // Count the number of MFMA instructions.
1842 unsigned MFMACount = 0;
1843 for (const MachineInstr &I : *DAG)
1844 if (TII->isMFMAorWMMA(I))
1845 ++MFMACount;
1846
1847 const unsigned PipelineSyncID = 0;
1848 for (unsigned I = 0; I < MFMACount * 3; ++I) {
1849 SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1850 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1851 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1852
1853 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1854 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1855 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1856 }
1857
1858 return true;
1859}
1860
1861class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
1862private:
1863 // Whether the DS_READ is a predecessor of the first four MFMAs in the region
1864 class EnablesInitialMFMA final : public InstructionRule {
1865 public:
1866 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1867 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1868 if (!SyncPipe.size())
1869 return false;
1870 int MFMAsFound = 0;
1871 if (!Cache->size()) {
1872 for (auto &Elt : SyncPipe[0].DAG->SUnits) {
1873 if (TII->isMFMAorWMMA(*Elt.getInstr())) {
1874 ++MFMAsFound;
1875 if (MFMAsFound > 4)
1876 break;
1877 Cache->push_back(&Elt);
1878 }
1879 }
1880 }
1881
1882 auto *DAG = SyncPipe[0].DAG;
1883 for (auto &Elt : *Cache) {
1884 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
1885 return true;
1886 }
1887 return false;
1888 }
1889
1890 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
1891 bool NeedsCache = false)
1892 : InstructionRule(TII, SGID, NeedsCache) {}
1893 };
1894
1895 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
1896 class IsPermForDSW final : public InstructionRule {
1897 public:
1898 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1899 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1900 auto *MI = SU->getInstr();
1901 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1902 return false;
1903
1904 bool FitsInGroup = false;
1905 // Does the VALU have a DS_WRITE successor
1906 if (!Collection.size()) {
1907 for (auto &Succ : SU->Succs) {
1908 SUnit *SuccUnit = Succ.getSUnit();
1909 if (TII->isDS(*SuccUnit->getInstr()) &&
1910 SuccUnit->getInstr()->mayStore()) {
1911 Cache->push_back(SuccUnit);
1912 FitsInGroup = true;
1913 }
1914 }
1915 return FitsInGroup;
1916 }
1917
1918 // Does the VALU have a DS_WRITE successor that is the same as another
1919 // VALU already in the group? The V_PERMs will all share one DS_W succ.
1920 return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
1921 return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
1922 return ThisSucc.getSUnit() == Elt;
1923 });
1924 });
1925 }
1926
1927 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1928 : InstructionRule(TII, SGID, NeedsCache) {}
1929 };
1930
1931 // Whether the SU is a successor of any element in the previous SchedGroup
1932 class IsSuccOfPrevGroup final : public InstructionRule {
1933 public:
1934 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1935 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1936 SchedGroup *OtherGroup = nullptr;
1937 for (auto &PipeSG : SyncPipe) {
1938 if ((unsigned)PipeSG.getSGID() == SGID - 1) {
1939 OtherGroup = &PipeSG;
1940 }
1941 }
1942
1943 if (!OtherGroup)
1944 return false;
1945 if (!OtherGroup->Collection.size())
1946 return true;
1947
1948 // Does the previous VALU have this DS_Write as a successor
1949 return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
1950 return any_of(Elt->Succs,
1951 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
1952 });
1953 }
1954 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
1955 bool NeedsCache = false)
1956 : InstructionRule(TII, SGID, NeedsCache) {}
1957 };
1958
1959 // Whether the combined load width of the group stays within 128 bits
1960 class VMEMSize final : public InstructionRule {
1961 public:
1962 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1963 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1964 auto *MI = SU->getInstr();
1965 if (MI->getOpcode() == TargetOpcode::BUNDLE)
1966 return false;
1967 if (!Collection.size())
1968 return true;
1969
1970 int NumBits = 0;
1971
1972 auto TRI = TII->getRegisterInfo();
1973 auto &MRI = MI->getMF()->getRegInfo();
1974 for (auto &Elt : Collection) {
1975 auto Op = Elt->getInstr()->getOperand(0);
1976 auto Size =
1977 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
1978 NumBits += Size;
1979 }
1980
1981 if (NumBits < 128) {
1982 assert(TII->isVMEM(*MI) && MI->mayLoad());
1983 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
1984 MRI, MI->getOperand(0))) <=
1985 128)
1986 return true;
1987 }
1988
1989 return false;
1990 }
1991
1992 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1993 : InstructionRule(TII, SGID, NeedsCache) {}
1994 };
1995
1996 /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
1997 /// that is \p Distance steps away
1998 class SharesPredWithPrevNthGroup final : public InstructionRule {
1999 private:
2000 unsigned Distance = 1;
2001
2002 public:
2003 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
2004 SmallVectorImpl<SchedGroup> &SyncPipe) override {
2005 SchedGroup *OtherGroup = nullptr;
2006 if (!SyncPipe.size())
2007 return false;
2008
2009 if (!Cache->size()) {
2010
2011 for (auto &PipeSG : SyncPipe) {
2012 if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
2013 OtherGroup = &PipeSG;
2014 }
2015 }
2016
2017 if (!OtherGroup)
2018 return false;
2019 if (!OtherGroup->Collection.size())
2020 return true;
2021
2022 for (auto &OtherEle : OtherGroup->Collection) {
2023 for (auto &Pred : OtherEle->Preds) {
2024 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2025 AMDGPU::V_PERM_B32_e64)
2026 Cache->push_back(Pred.getSUnit());
2027 }
2028 }
2029
2030 // If the other group has no PERM preds, then this group won't share any
2031 if (!Cache->size())
2032 return false;
2033 }
2034
2035 auto *DAG = SyncPipe[0].DAG;
2036 // Does the previous DS_WRITE share a V_PERM predecessor with this
2037 // VMEM_READ
2038 return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
2039 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
2040 });
2041 }
2042 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
2043 unsigned SGID, bool NeedsCache = false)
2044 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
2045 };
2046
2047public:
2048 bool applyIGLPStrategy(
2049 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2050 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2051 AMDGPU::SchedulingPhase Phase) override;
2052
2053 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2054 AMDGPU::SchedulingPhase Phase) override {
2055 return true;
2056 }
2057
2058 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
2059 : IGLPStrategy(DAG, TII) {
2060 IsBottomUp = false;
2061 }
2062};
2063
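// These counters are populated during the initial scheduling phase and reused
// when the strategy is re-applied in later phases (see the IsInitial handling
// below).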
2064static unsigned DSWCount = 0;
2065static unsigned DSWWithPermCount = 0;
2066static unsigned DSWWithSharedVMEMCount = 0;
2067
2068bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2069 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2070 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2071 AMDGPU::SchedulingPhase Phase) {
2072 unsigned MFMACount = 0;
2073 unsigned DSRCount = 0;
2074
2075 bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2076
2077 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2078 DSWWithSharedVMEMCount == 0)) &&
2079 "DSWCounters should be zero in pre-RA scheduling!");
2080 SmallVector<SUnit *, 6> DSWithPerms;
2081 for (auto &SU : DAG->SUnits) {
2082 auto *I = SU.getInstr();
2083 if (TII->isMFMAorWMMA(*I))
2084 ++MFMACount;
2085 else if (TII->isDS(*I)) {
2086 if (I->mayLoad())
2087 ++DSRCount;
2088 else if (I->mayStore() && IsInitial) {
2089 ++DSWCount;
2090 for (auto Pred : SU.Preds) {
2091 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2092 AMDGPU::V_PERM_B32_e64) {
2093 DSWithPerms.push_back(&SU);
2094 break;
2095 }
2096 }
2097 }
2098 }
2099 }
2100
2101 if (IsInitial) {
2102 DSWWithPermCount = DSWithPerms.size();
2103 auto *I = DSWithPerms.begin();
2104 auto *E = DSWithPerms.end();
2105
2106 // Get the count of DS_WRITES with V_PERM predecessors which
2107 // have loop carried dependencies (WAR) on the same VMEM_READs.
2108 // We consider partial overlap as a miss -- in other words,
2109 // for a given DS_W, we only consider another DS_W as matching
2110 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
2111 // for every V_PERM pred of this DS_W.
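// Illustrative example: if DS_W0's V_PERM preds have WAR successors into
// VMEM_READs A and B, and a later DS_W1's V_PERM preds reach exactly A and B
// again, the pair (DS_W0, DS_W1) is counted; if DS_W1 also reaches a
// VMEM_READ that DS_W0 did not, that DS_W is treated as a miss.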
2112 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2113 SmallVector<SUnit *, 6> Counted;
2114 for (; I != E; I++) {
2115 SUnit *Cand = nullptr;
2116 bool MissedAny = false;
2117 for (auto &Pred : (*I)->Preds) {
2118 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2119 continue;
2120
2121 if (Cand && llvm::is_contained(Counted, Cand))
2122 break;
2123
2124 for (auto &Succ : Pred.getSUnit()->Succs) {
2125 auto *MI = Succ.getSUnit()->getInstr();
2126 if (!TII->isVMEM(*MI) || !MI->mayLoad())
2127 continue;
2128
2129 if (MissedAny || !VMEMLookup.size()) {
2130 MissedAny = true;
2131 VMEMLookup[MI] = *I;
2132 continue;
2133 }
2134
2135 auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
2136 if (Inserted) {
2137 MissedAny = true;
2138 continue;
2139 }
2140
2141 Cand = It->second;
2142 if (llvm::is_contained(Counted, Cand)) {
2143 MissedAny = true;
2144 break;
2145 }
2146 }
2147 }
2148 if (!MissedAny && Cand) {
2149 DSWWithSharedVMEMCount += 2;
2150 Counted.push_back(Cand);
2151 Counted.push_back(*I);
2152 }
2153 }
2154 }
2155
2156 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2157 SchedGroup *SG;
2158 unsigned PipelineSyncID = 0;
2159 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
2160 if (DSWWithPermCount) {
2161 for (unsigned I = 0; I < MFMACount; I++) {
2162 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2163 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2164 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2165
2166 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2167 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
2168 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2169 }
2170 }
2171
2172 PipelineSyncID = 1;
2173 // Phase 1: Break up DS_READ and MFMA clusters.
2174 // First, DS_READs to make the initial MFMAs ready; then interleave MFMA with
2175 // DS_READ prefetch.
2176
2177 // Make the initial MFMAs ready
2178 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2179 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
2180 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
2181 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2182
2183 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2184 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2185 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2186
2187 // Interleave MFMA with DS_READ prefetch
2188 for (unsigned I = 4; I < DSRCount; ++I) {
2189 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2190 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
2191 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2192
2193 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2194 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2195 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2196 }
2197
2198 // Phase 2a: Loop carried dependency with V_PERM
2199 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2200 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2201 for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
2202 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2203 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2204 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2205 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2206
2207 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2208 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2209 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2210 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2211
2212 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2213 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2214 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2215 1, TII, SG->getSGID(), true));
2216 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2217 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2218
2219 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2220 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2221 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2222
2223 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2224 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2225 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2226 3, TII, SG->getSGID(), true));
2227 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2228 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2229
2230 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2231 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2232 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2233 }
2234
2235 // Phase 2b: Loop carried dependency without V_PERM
2236 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
2237 // Interleave MFMA to keep XDL unit busy throughout.
2238 for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
2239 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2240 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2241 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2242
2243 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2244 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2245 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2246 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2247
2248 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2249 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2250 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2251 }
2252
2253 // Phase 2c: Loop carried dependency with V_PERM, where the VMEM_READs are
2254 // ultimately used by two DS_WRITEs.
2255 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2256 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2257
2258 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
2259 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2260 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2261 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2262 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2263
2264 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2265 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2266 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2267 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2268
2269 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2270 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2271 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2272
2273 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2274 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2275 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2276 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2277
2278 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2279 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2280 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2281 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2282
2283 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2284 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2285 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2286
2287 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2288 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2289 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2290 2, TII, SG->getSGID(), true));
2291 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2292 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2293
2294 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2295 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2296 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2297
2298 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2299 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2300 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2301 4, TII, SG->getSGID(), true));
2302 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2303 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2304
2305 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2306 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2307 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2308 }
2309
2310 return true;
2311}
2312
2313static std::unique_ptr<IGLPStrategy>
2314createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2315 const SIInstrInfo *TII) {
2316 switch (ID) {
2317 case MFMASmallGemmOptID:
2318 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
2319 case MFMASmallGemmSingleWaveOptID:
2320 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2321 case MFMAExpInterleaveID:
2322 return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2323 case MFMAExpSimpleInterleaveID:
2324 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
2325 }
2326
2327 llvm_unreachable("Unknown IGLPStrategyID");
2328}
2329
2330class IGroupLPDAGMutation : public ScheduleDAGMutation {
2331private:
2332 const SIInstrInfo *TII;
2333
2334 ScheduleDAGMI *DAG;
2335
2336 // Organize lists of SchedGroups by their SyncID. SchedGroups /
2337 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2338 // between them.
2339 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2340
2341 // Used to track instructions that can be mapped to multiple sched groups
2342 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2343
2344 // Add DAG edges that enforce SCHED_BARRIER ordering.
2345 void addSchedBarrierEdges(SUnit &SU);
2346
2347 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2348 // not be reordered across the SCHED_BARRIER. This is used for the base
2349 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2350 // SCHED_BARRIER will always block all instructions that can be classified
2351 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2352 // and may only synchronize with some SchedGroups. Returns the inverse of
2353 // Mask. SCHED_BARRIER's mask describes which instruction types should be
2354 // allowed to be scheduled across it. Invert the mask to get the
2355 // SchedGroupMask of instructions that should be barred.
2356 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2357
2358 // Create SchedGroups for a SCHED_GROUP_BARRIER.
2359 void initSchedGroupBarrierPipelineStage(
2360 std::vector<SUnit>::reverse_iterator RIter);
2361
2362 bool initIGLPOpt(SUnit &SU);
2363
2364public:
2365 void apply(ScheduleDAGInstrs *DAGInstrs) override;
2366
2367 // The order in which the PipelineSolver should process the candidate
2368 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
2369 // created SchedGroup first, and will consider that as the ultimate
2370 // predecessor group when linking. TOP_DOWN instead links and processes the
2371 // first created SchedGroup first.
2372 bool IsBottomUp = true;
2373
2374 // The scheduling phase this application of IGLP corresponds with.
2375 AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
2376
2377 IGroupLPDAGMutation() = default;
2378 IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
2379};
2380
2381unsigned SchedGroup::NumSchedGroups = 0;
2382
2383bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
2384 if (A != B && DAG->canAddEdge(B, A)) {
2385 DAG->addEdge(B, SDep(A, SDep::Artificial));
2386 return true;
2387 }
2388 return false;
2389}
2390
2391bool SchedGroup::canAddMI(const MachineInstr &MI) const {
2392 bool Result = false;
2393 if (MI.isMetaInstruction())
2394 Result = false;
2395
2396 else if (MI.isInlineAsm()) {
2397 const SIRegisterInfo &TRI = TII->getRegisterInfo();
2398 auto &MRI = MI.getParent()->getParent()->getRegInfo();
2399 bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
2400 VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),
2401 MayStore = MI.mayStore();
2402 for (const MachineOperand &Operand : MI.operands())
2403 if (Operand.isReg()) {
2404 const TargetRegisterClass &RegClass =
2405 *TRI.getRegClassForOperandReg(MRI, Operand);
2406 if (TRI.hasVGPRs(&RegClass)) {
2407 VGPR_used = true;
2408 if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
2409 VReg32_used = true;
2410 }
2411 // Registers wider than 128 bits are usually only used by MFMA instructions, so
2412 // we're using that as a heuristic to guess the schedule group mask of
2413 // the inline asm.
2414 if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
2415 VMFMA_used = true;
2416 if (TRI.hasSGPRs(&RegClass))
2417 SGPR_used = true;
2418 if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
2419 SGPR_big_def = true;
2420 }
2421
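// Combine the per-operand observations into a best-guess SchedGroupMask for
// this inline asm block.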
2422 typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
2423 SGMask_t InlineAsmMask = 0;
2424 if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2425 InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
2426 if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
2427 InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
2428 if (VMFMA_used)
2429 InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
2430 if (VGPR_used && MayLoad)
2431 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
2432 : SchedGroupMask::VMEM_READ);
2433 if (VGPR_used && MayStore)
2434 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
2435 : SchedGroupMask::VMEM_WRITE);
2436 if (SGPR_big_def)
2437 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
2438 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
2439 InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
2440 InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
2441 if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
2442 InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
2443 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
2444 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
2445 InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
2446 InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
2447
2448 Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
2449 }
2450
2451 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2452 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
2453 TII->isTRANS(MI)))
2454 Result = !MI.mayLoadOrStore();
2455
2456 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2457 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
2458 // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
2459 // For our purposes, these shall not be classified as VALU as this results
2460 // in unexpected behavior.
2461 Result = !MI.mayLoadOrStore();
2462 }
2463
2464 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2465 TII->isSALU(MI))
2466 Result = !MI.mayLoadOrStore();
2467
2468 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2469 TII->isMFMAorWMMA(MI))
2470 Result = true;
2471
2472 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2473 TII->isVMEM(MI))
2474 Result = true;
2475
2476 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2477 MI.mayLoad() && TII->isVMEM(MI))
2478 Result = true;
2479
2480 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2481 MI.mayStore() && TII->isVMEM(MI))
2482 Result = true;
2483
2484 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2485 TII->isDS(MI))
2486 Result = true;
2487
2488 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2489 MI.mayLoad() && TII->isDS(MI))
2490 Result = true;
2491
2492 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2493 MI.mayStore() && TII->isDS(MI))
2494 Result = true;
2495
2496 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2497 TII->isTRANS(MI))
2498 Result = true;
2499
2500 LLVM_DEBUG(
2501 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
2502 << (Result ? " could classify " : " unable to classify ") << MI);
2503
2504 return Result;
2505}
2506
2507int SchedGroup::link(SUnit &SU, bool MakePred,
2508 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2509 int MissedEdges = 0;
2510 for (auto *A : Collection) {
2511 SUnit *B = &SU;
2512 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2513 continue;
2514 if (MakePred)
2515 std::swap(A, B);
2516
2517 if (DAG->IsReachable(B, A))
2518 continue;
2519
2520 // tryAddEdge returns false if there is a dependency that makes adding
2521 // the A->B edge impossible; otherwise it returns true.
2522 bool Added = tryAddEdge(A, B);
2523 if (Added)
2524 AddedEdges.emplace_back(A, B);
2525 else
2526 ++MissedEdges;
2527 }
2528
2529 return MissedEdges;
2530}
2531
2532void SchedGroup::link(SUnit &SU, bool MakePred) {
2533 for (auto *A : Collection) {
2534 SUnit *B = &SU;
2535 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2536 continue;
2537 if (MakePred)
2538 std::swap(A, B);
2539
2540 tryAddEdge(A, B);
2541 }
2542}
2543
2544void SchedGroup::link(SUnit &SU,
2545 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
2546 for (auto *A : Collection) {
2547 SUnit *B = &SU;
2548 if (P(A, B))
2549 std::swap(A, B);
2550
2551 tryAddEdge(A, B);
2552 }
2553}
2554
2555void SchedGroup::link(SchedGroup &OtherGroup) {
2556 for (auto *B : OtherGroup.Collection)
2557 link(*B);
2558}
2559
2560bool SchedGroup::canAddSU(SUnit &SU) const {
2561 MachineInstr &MI = *SU.getInstr();
2562 if (MI.getOpcode() != TargetOpcode::BUNDLE)
2563 return canAddMI(MI);
2564
2565 // Special case for bundled MIs.
2566 const MachineBasicBlock *MBB = MI.getParent();
2567 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
2568 while (E != MBB->end() && E->isBundledWithPred())
2569 ++E;
2570
2571 // Return true if all of the bundled MIs can be added to this group.
2572 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
2573}
2574
2575void SchedGroup::initSchedGroup() {
2576 for (auto &SU : DAG->SUnits) {
2577 if (isFull())
2578 break;
2579
2580 if (canAddSU(SU))
2581 add(SU);
2582 }
2583}
2584
2585void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
2586 SUnitsToCandidateSGsMap &SyncedInstrs) {
2587 SUnit &InitSU = *RIter;
2588 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
2589 auto &SU = *RIter;
2590 if (isFull())
2591 break;
2592
2593 if (canAddSU(SU))
2594 SyncedInstrs[&SU].push_back(SGID);
2595 }
2596
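// The initial SU (the SCHED_GROUP_BARRIER itself) is added to the group, so
// bump MaxSize to make room for it.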
2597 add(InitSU);
2598 assert(MaxSize);
2599 (*MaxSize)++;
2600}
2601
2602void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
2603 auto I = DAG->SUnits.rbegin();
2604 auto E = DAG->SUnits.rend();
2605 for (; I != E; ++I) {
2606 auto &SU = *I;
2607 if (isFull())
2608 break;
2609 if (canAddSU(SU))
2610 SyncedInstrs[&SU].push_back(SGID);
2611 }
2612}
2613
2614void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2615 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
2616 if (!TSchedModel || DAGInstrs->SUnits.empty())
2617 return;
2618
2619 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
2620 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
2621 TII = ST.getInstrInfo();
2622 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
2623 SyncedSchedGroups.clear();
2624 SyncedInstrs.clear();
2625 bool FoundSB = false;
2626 bool FoundIGLP = false;
2627 bool ShouldApplyIGLP = false;
2628 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
2629 unsigned Opc = R->getInstr()->getOpcode();
2630 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
2631 if (Opc == AMDGPU::SCHED_BARRIER) {
2632 addSchedBarrierEdges(*R);
2633 FoundSB = true;
2634 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2635 initSchedGroupBarrierPipelineStage(R);
2636 FoundSB = true;
2637 } else if (Opc == AMDGPU::IGLP_OPT) {
2638 if (!FoundSB && !FoundIGLP) {
2639 FoundIGLP = true;
2640 ShouldApplyIGLP = initIGLPOpt(*R);
2641 }
2642 }
2643 }
2644
2645 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2646 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2647 // PipelineSolver performs the mutation by adding the edges it
2648 // determined to be the best.
2649 PS.solve();
2650 return;
2651 }
2652}
2653
2654void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2655 MachineInstr &MI = *SchedBarrier.getInstr();
2656 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2657 // Remove all existing edges from the SCHED_BARRIER that were added due to the
2658 // instruction having side effects.
2659 LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2660 << MI.getOperand(0).getImm() << "\n");
2661 auto InvertedMask =
2662 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
2663 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2664 SG.initSchedGroup();
2665
2666 // Preserve original instruction ordering relative to the SCHED_BARRIER.
2667 SG.link(
2668 SchedBarrier,
2669 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2670 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
2671}
2672
2673SchedGroupMask
2674IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2675 // Invert mask and erase bits for types of instructions that are implied to be
2676 // allowed past the SCHED_BARRIER.
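// For example, a mask of only ALU (generic ALU ops may cross the barrier)
// first inverts to every other group; the fix-ups below then also clear
// VALU, SALU, MFMA and TRANS, leaving the VMEM and DS groups as the ones
// barred from being scheduled across the barrier.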
2677 SchedGroupMask InvertedMask = ~Mask;
2678
2679 // ALU implies VALU, SALU, MFMA, TRANS.
2680 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2681 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2682 ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2683 // VALU, SALU, MFMA, TRANS implies ALU.
2684 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2685 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2686 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2687 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2688 InvertedMask &= ~SchedGroupMask::ALU;
2689
2690 // VMEM implies VMEM_READ, VMEM_WRITE.
2691 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2692 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2693 // VMEM_READ, VMEM_WRITE implies VMEM.
2694 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2695 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2696 InvertedMask &= ~SchedGroupMask::VMEM;
2697
2698 // DS implies DS_READ, DS_WRITE.
2699 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2700 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2701 // DS_READ, DS_WRITE implies DS.
2702 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2703 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2704 InvertedMask &= ~SchedGroupMask::DS;
2705
2706 LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2707 << "\n");
2708
2709 return InvertedMask;
2710}
2711
2712void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2713 std::vector<SUnit>::reverse_iterator RIter) {
2714 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
2715 // to the instruction having side effects.
2716 MachineInstr &SGB = *RIter->getInstr();
2717 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2718 int32_t SGMask = SGB.getOperand(0).getImm();
2719 int32_t Size = SGB.getOperand(1).getImm();
2720 int32_t SyncID = SGB.getOperand(2).getImm();
2721
2722 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2723 Size, SyncID, DAG, TII);
2724
2725 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
2726}
2727
2728bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2729 IGLPStrategyID StrategyID =
2730 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
2731 auto S = createIGLPStrategy(StrategyID, DAG, TII);
2732 if (!S->shouldApplyStrategy(DAG, Phase))
2733 return false;
2734
2735 IsBottomUp = S->IsBottomUp;
2736 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2737}
2738
2739} // namespace
2740
2741 /// \p Phase specifies whether or not this is a reentry into the
2742/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2743/// same scheduling region (e.g. pre and post-RA scheduling / multiple
2744/// scheduling "phases"), we can reenter this mutation framework more than once
2745/// for a given region.
2746std::unique_ptr<ScheduleDAGMutation>
2747llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
2748 return std::make_unique<IGroupLPDAGMutation>(Phase);
2749}