LLVM 23.0.0git
AMDGPUIGroupLP.cpp
Go to the documentation of this file.
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
20#include "SIInstrInfo.h"
23#include "llvm/ADT/DenseMap.h"
26
27#include <type_traits>
28
29using namespace llvm;
30using namespace llvm::AMDGPU;
31
32#define DEBUG_TYPE "igrouplp"
33
34namespace {
35
36static cl::opt<bool> EnableExactSolver(
37 "amdgpu-igrouplp-exact-solver", cl::Hidden,
38 cl::desc("Whether to use the exponential time solver to fit "
39 "the instructions to the pipeline as closely as "
40 "possible."),
41 cl::init(false));
42
43static cl::opt<unsigned> CutoffForExact(
44 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
45 cl::desc("The maximum number of scheduling group conflicts "
46 "which we attempt to solve with the exponential time "
47 "exact solver. Problem sizes greater than this will"
48 "be solved by the less accurate greedy algorithm. Selecting "
49 "solver by size is superseded by manually selecting "
50 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
51
52static cl::opt<uint64_t> MaxBranchesExplored(
53 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
54 cl::desc("The amount of branches that we are willing to explore with"
55 "the exact algorithm before giving up."));
56
57static cl::opt<bool> UseCostHeur(
58 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
59 cl::desc("Whether to use the cost heuristic to make choices as we "
60 "traverse the search space using the exact solver. Defaulted "
61 "to on, and if turned off, we will use the node order -- "
62 "attempting to put the later nodes in the later sched groups. "
63 "Experimentally, results are mixed, so this should be set on a "
64 "case-by-case basis."));
65
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup. The bits correspond to the mask operand of
// SCHED_BARRIER / SCHED_GROUP_BARRIER (see the SchedGroup class comment).
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  LDSDMA = 1u << 11,
  // Union of every instruction-type bit above.
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS | LDSDMA,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
86
87class SchedGroup;
88
// InstructionRule class is used to enact a filter which determines whether or
// not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g Cache) to help those filters.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  // ID of the SchedGroup this rule is attached to.
  unsigned SGID;
  // A cache made available to the Filter to store SUnits for subsequent
  // invocations of the Filter
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  // Return true if the candidate SU is permitted in the SchedGroup. The base
  // implementation accepts every SU; subclasses override it.
  // NOTE(review): the final parameter line of this signature (presumably
  // SmallVectorImpl<SchedGroup> &) was lost in extraction -- confirm against
  // upstream before editing.
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
    return true;
  };

  // NeedsCache requests allocation of the Cache vector used by subclasses
  // that memoize DAG scans across invocations.
  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache) {
      Cache = SmallVector<SUnit *, 4>();
    }
  }

  virtual ~InstructionRule() = default;
};
117
118using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
119
// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
  // and SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the same
  // SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to
  // NOTE(review): the declaration of the rule container (referenced below as
  // `Rules` by addRule/allowedByRules) was lost in extraction -- confirm
  // against upstream.

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  // NOTE(review): the DAG member declaration (initialized by the constructors
  // below) was lost in extraction -- confirm against upstream.
  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges
  int link(SUnit &SU, bool MakePred,
           std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append a constraint that SUs must meet in order to fit into this
  // SchedGroup. Since many rules involve the relationship between a SchedGroup
  // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
  // time (rather than SchedGroup init time.)
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    for (auto &Rule : Rules) {
      if (!Rule->apply(SU, Collection, SyncPipe))
        return false;
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove last element in the SchedGroup
  void pop() { Collection.pop_back(); }

  // Scan the SUnits in [Begin, End) and record this group's SGID as a
  // candidate for every SU that could belong to it.
  template <class T>
  void findCandidateSUnits(T Begin, T End,
                           SUnitsToCandidateSGsMap &SyncedInstrs);

  /// Find each SUnit in the DAG that could potentially be added to
  /// this SchedGroup and add the SGID to the candidate SchedGroups
  /// for SU in \p SyncedInstrs.
  void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};
241
242using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
243using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
244
// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that can not be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small sized problems or medium sized problems where an exact
// solution is highly desired.
class PipelineSolver {
  [[maybe_unused]] ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups
  // NOTE(review): the declarations of the SyncedInstrs and SyncedSchedGroups
  // reference members (initialized in the constructor below) were lost in
  // extraction -- confirm against upstream.
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  // The current working pipeline
  // NOTE(review): the CurrPipeline declaration was lost in extraction.
  // The pipeline that has the best solution found so far
  // NOTE(review): the BestPipeline declaration was lost in extraction.

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add
  int BestCost = -1;
  int CurrCost = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted
  int CurrConflInstNo = 0;
  // Index to the pipeline that is currently being fitted
  int CurrSyncGroupIdx = 0;
  // The first non trivial pipeline
  int BeginSyncGroupIdx = 0;

  // How many branches we have explored
  uint64_t BranchesExplored = 0;

  // The direction in which we process the candidate SchedGroups per SU
  bool IsBottomUp = true;

  // Update indices to fit next conflicting instruction
  void advancePosition();
  // Recede indices to attempt to find better fit for previous conflicting
  // instruction
  void retreatPosition();

  // The exponential time algorithm which finds the provably best fit
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the heuristic given all
  // current information. One step in the greedy algorithm. Templated against
  // the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void greedyFind(std::list<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  // Whether or not the current solution is optimal
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by solver
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  // Templated against the SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);
  // Add the edges from the SU to the other SchedGroups in pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
  /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
  /// returns the cost (in terms of missed pipeline edges), and tracks the edges
  /// added in \p AddedEdges
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::list<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  /// Remove the edges passed via \p AddedEdges
  void removeEdges(const std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed in maps to arrays for bidirectional iterators
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. Heuristic &&
  // command-line-option determines to use exact or greedy algorithm.
  void solve();

  // NOTE(review): the SyncedInstrs reference parameter line of this
  // constructor was lost in extraction -- confirm against upstream.
  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    // Only engage the solver if at least one sync group has conflicted
    // instructions.
    for (auto &PipelineInstrs : SyncedInstrs) {
      if (!PipelineInstrs.second.empty()) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    // Position the cursor at the first pipeline that actually has conflicts.
    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].empty())
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};
371
372void PipelineSolver::reset() {
373
374 for (auto &SyncPipeline : CurrPipeline) {
375 for (auto &SG : SyncPipeline) {
376 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
377 SG.Collection.clear();
378 auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
379 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
380 });
381 if (SchedBarr != TempCollection.end())
382 SG.Collection.push_back(*SchedBarr);
383 }
384 }
385
386 CurrSyncGroupIdx = BeginSyncGroupIdx;
387 CurrConflInstNo = 0;
388 CurrCost = 0;
389}
390
391void PipelineSolver::convertSyncMapsToArrays() {
392 for (auto &SyncPipe : SyncedSchedGroups) {
393 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
394 }
395
396 int PipelineIDx = SyncedInstrs.size() - 1;
397 PipelineInstrs.resize(SyncedInstrs.size());
398 for (auto &SyncInstrMap : SyncedInstrs) {
399 for (auto &SUsToCandSGs : SyncInstrMap.second) {
400 if (PipelineInstrs[PipelineIDx].empty()) {
401 PipelineInstrs[PipelineIDx].push_back(
402 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
403 continue;
404 }
405 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
406 // Insert them in sorted order -- this allows for good parsing order in
407 // the greedy algorithm
408 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
409 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
410 ++SortPosition;
411 PipelineInstrs[PipelineIDx].insert(
412 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
413 }
414 --PipelineIDx;
415 }
416}
417
418template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
419 for (; I != E; ++I) {
420 auto &GroupA = *I;
421 for (auto J = std::next(I); J != E; ++J) {
422 auto &GroupB = *J;
423 GroupA.link(GroupB);
424 }
425 }
426}
427
428void PipelineSolver::makePipeline() {
429 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
430 for (auto &SyncPipeline : BestPipeline) {
431 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
432 for (auto &SG : SyncPipeline) {
433 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
434 << " has: \n");
435 SUnit *SGBarr = nullptr;
436 for (auto &SU : SG.Collection) {
437 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
438 SGBarr = SU;
439 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
440 }
441 // Command line requested IGroupLP doesn't have SGBarr
442 if (!SGBarr)
443 continue;
444 SG.link(*SGBarr, false);
445 }
446 }
447
448 for (auto &SyncPipeline : BestPipeline) {
449 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
450 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
451 }
452}
453
454template <typename T>
455int PipelineSolver::linkSUnit(
456 SUnit *SU, int SGID, std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
457 T I, T E) {
458 bool MakePred = false;
459 int AddedCost = 0;
460 for (; I < E; ++I) {
461 if (I->getSGID() == SGID) {
462 MakePred = true;
463 continue;
464 }
465 auto Group = *I;
466 AddedCost += Group.link(*SU, MakePred, AddedEdges);
467 assert(AddedCost >= 0);
468 }
469 return AddedCost;
470}
471
472int PipelineSolver::addEdges(
473 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
474 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
475
476 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
477 // instructions that are the ultimate successors in the resultant mutation.
478 // Therefore, in such a configuration, the SchedGroups occurring before the
479 // candidate SGID are successors of the candidate SchedGroup, thus the current
480 // SU should be linked as a predecessor to SUs in those SchedGroups. The
481 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
482 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
483 // IsBottomUp (in reverse).
484 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
485 SyncPipeline.rend())
486 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
487 SyncPipeline.end());
488}
489
490void PipelineSolver::removeEdges(
491 const std::list<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
492 // Only remove the edges that we have added when testing
493 // the fit.
494 for (auto &PredSuccPair : EdgesToRemove) {
495 SUnit *Pred = PredSuccPair.first;
496 SUnit *Succ = PredSuccPair.second;
497
498 auto *Match = llvm::find_if(
499 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
500 if (Match != Succ->Preds.end()) {
501 assert(Match->isArtificial());
502 Succ->removePred(*Match);
503 }
504 }
505}
506
507void PipelineSolver::advancePosition() {
508 ++CurrConflInstNo;
509
510 if (static_cast<size_t>(CurrConflInstNo) >=
511 PipelineInstrs[CurrSyncGroupIdx].size()) {
512 CurrConflInstNo = 0;
513 ++CurrSyncGroupIdx;
514 // Advance to next non-trivial pipeline
515 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
516 PipelineInstrs[CurrSyncGroupIdx].empty())
517 ++CurrSyncGroupIdx;
518 }
519}
520
521void PipelineSolver::retreatPosition() {
522 assert(CurrConflInstNo >= 0);
523 assert(CurrSyncGroupIdx >= 0);
524
525 if (CurrConflInstNo > 0) {
526 --CurrConflInstNo;
527 return;
528 }
529
530 if (CurrConflInstNo == 0) {
531 // If we return to the starting position, we have explored
532 // the entire tree
533 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
534 return;
535
536 --CurrSyncGroupIdx;
537 // Go to previous non-trivial pipeline
538 while (PipelineInstrs[CurrSyncGroupIdx].empty())
539 --CurrSyncGroupIdx;
540
541 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
542 }
543}
544
545bool PipelineSolver::checkOptimal() {
546 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
547 if (BestCost == -1 || CurrCost < BestCost) {
548 BestPipeline = CurrPipeline;
549 BestCost = CurrCost;
550 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
551 }
552 assert(BestCost >= 0);
553 }
554
555 bool DoneExploring = false;
556 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
557 DoneExploring = true;
558
559 return (DoneExploring || BestCost == 0);
560}
561
562template <typename T>
563void PipelineSolver::populateReadyList(
564 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
565 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
566 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
567 assert(CurrSU.second.size() >= 1);
568
569 for (; I != E; ++I) {
570 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
571 int CandSGID = *I;
572 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
573 return SG.getSGID() == CandSGID;
574 });
575 assert(Match);
576
577 if (UseCostHeur) {
578 if (Match->isFull()) {
579 ReadyList.push_back(std::pair(*I, MissPenalty));
580 continue;
581 }
582
583 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
584 ReadyList.push_back(std::pair(*I, TempCost));
585 removeEdges(AddedEdges);
586 } else
587 ReadyList.push_back(std::pair(*I, -1));
588 }
589
590 if (UseCostHeur)
591 std::sort(ReadyList.begin(), ReadyList.end(), llvm::less_second());
592
593 assert(ReadyList.size() == CurrSU.second.size());
594}
595
596bool PipelineSolver::solveExact() {
597 if (checkOptimal())
598 return true;
599
600 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
601 return false;
602
603 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
604 assert(static_cast<size_t>(CurrConflInstNo) <
605 PipelineInstrs[CurrSyncGroupIdx].size());
606 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
607 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
608 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
609
610 // SchedGroup -> Cost pairs
612 // Prioritize the candidate sched groups in terms of lowest cost first
613 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
614 CurrSU.second.rend())
615 : populateReadyList(ReadyList, CurrSU.second.begin(),
616 CurrSU.second.end());
617
618 auto *I = ReadyList.begin();
619 auto *E = ReadyList.end();
620 for (; I != E; ++I) {
621 // If we are trying SGs in least cost order, and the current SG is cost
622 // infeasible, then all subsequent SGs will also be cost infeasible, so we
623 // can prune.
624 if (BestCost != -1 && (CurrCost + I->second > BestCost))
625 return false;
626
627 int CandSGID = I->first;
628 int AddedCost = 0;
629 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
630 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
631 SchedGroup *Match;
632 for (auto &SG : SyncPipeline) {
633 if (SG.getSGID() == CandSGID)
634 Match = &SG;
635 }
636
637 if (Match->isFull())
638 continue;
639
640 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
641 continue;
642
643 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
644 << (int)Match->getMask() << "and ID " << CandSGID
645 << "\n");
646 Match->add(*CurrSU.first);
647 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
648 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
649 CurrCost += AddedCost;
650 advancePosition();
651 ++BranchesExplored;
652 bool FinishedExploring = false;
653 // If the Cost after adding edges is greater than a known solution,
654 // backtrack
655 if (CurrCost < BestCost || BestCost == -1) {
656 if (solveExact()) {
657 FinishedExploring = BestCost != 0;
658 if (!FinishedExploring)
659 return true;
660 }
661 }
662
663 retreatPosition();
664 CurrCost -= AddedCost;
665 removeEdges(AddedEdges);
666 Match->pop();
667 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
668 if (FinishedExploring)
669 return true;
670 }
671
672 // Try the pipeline where the current instruction is omitted
673 // Potentially if we omit a problematic instruction from the pipeline,
674 // all the other instructions can nicely fit.
675 CurrCost += MissPenalty;
676 advancePosition();
677
678 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
679
680 bool FinishedExploring = false;
681 if (CurrCost < BestCost || BestCost == -1) {
682 if (solveExact()) {
683 bool FinishedExploring = BestCost != 0;
684 if (!FinishedExploring)
685 return true;
686 }
687 }
688
689 retreatPosition();
690 CurrCost -= MissPenalty;
691 return FinishedExploring;
692}
693
// One greedy step: pick, among the candidate SchedGroups in [I, E), the one
// whose trial edge insertion costs the fewest missed edges, commit the SU to
// it, and accumulate the cost into BestCost. If no candidate is usable, only
// the miss penalty is charged.
template <typename T>
void PipelineSolver::greedyFind(
    std::list<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];

  // The best candidate seen so far: its group, the edges its trial added,
  // and the resulting cost.
  struct GroupInfo {
    SchedGroup *SG;
    std::list<std::pair<SUnit *, SUnit *>> Edges;
    int Cost = 0;
  };
  std::optional<GroupInfo> Best;

  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // Since we have added the potential SchedGroups from bottom up, but
  // traversed the DAG from top down, parse over the groups from last to
  // first. If we fail to do this for the greedy algorithm, the solution will
  // likely not be good in more complex cases.
  for (; I != E; ++I) {
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
      continue;
    }
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
      continue;
    }

    std::list<std::pair<SUnit *, SUnit *>> TempEdges;
    int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, TempEdges);
    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");

    if (!Best || TempCost < Best->Cost) {
      Best = {Match, TempEdges, TempCost};
      // A zero-cost fit cannot be beaten; keep its edges in the DAG and stop
      // scanning (the removeEdges below is intentionally skipped).
      if (Best->Cost == 0)
        break;
    }

    // Trial edges are always removed here; the winner's edges are re-added
    // from the saved Best->Edges after the scan.
    removeEdges(TempEdges);
  }

  if (Best) {
    SchedGroup *SG = Best->SG;
    std::list<std::pair<SUnit *, SUnit *>> &Edges = Best->Edges;

    SG->add(*CurrSU.first);
    if (AddedEdges.empty())
      AddedEdges = Edges;
    else
      // NOTE(review): splices before the last element rather than at end();
      // the order of AddedEdges does not matter for removeEdges, but confirm
      // the insertion point is intentional.
      AddedEdges.splice(std::prev(AddedEdges.cend()), Edges);

    for (const std::pair<SUnit *, SUnit *> &E : Edges) {
      if (!SG->tryAddEdge(E.first, E.second))
        llvm_unreachable("Edges known to be insertable.");
    }

    LLVM_DEBUG(dbgs() << "Best Group has ID: " << SG->getSGID() << " and Mask"
                      << (int)SG->getMask() << "\n");
    BestCost += Best->Cost;
  } else
    BestCost += MissPenalty;
}
767
768bool PipelineSolver::solveGreedy() {
769 BestCost = 0;
770 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
771
772 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
773 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
774 IsBottomUp
775 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
776 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
777 advancePosition();
778 }
779 BestPipeline = CurrPipeline;
780 removeEdges(AddedEdges);
781 return false;
782}
783
784unsigned PipelineSolver::computeProblemSize() {
785 unsigned ProblemSize = 0;
786 for (auto &PipeConflicts : PipelineInstrs) {
787 ProblemSize += PipeConflicts.size();
788 }
789
790 return ProblemSize;
791}
792
793void PipelineSolver::solve() {
794 if (!NeedsSolver)
795 return;
796
797 unsigned ProblemSize = computeProblemSize();
798 assert(ProblemSize > 0);
799
800 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
801 MissPenalty = (ProblemSize / 2) + 1;
802
803 LLVM_DEBUG(DAG->dump());
804 if (EnableExactSolver || BelowCutoff) {
805 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
806 solveGreedy();
807 reset();
808 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
809 if (BestCost > 0) {
810 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
811 solveExact();
812 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
813 }
814 } else { // Use the Greedy Algorithm by default
815 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
816 solveGreedy();
817 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
818 }
819
820 makePipeline();
821 LLVM_DEBUG(dbgs() << "After applying mutation\n");
822 LLVM_DEBUG(DAG->dump());
823}
824
// Implement a IGLP scheduling strategy.
class IGLPStrategy {
protected:
  // NOTE(review): the DAG member declaration (initialized by the constructor
  // below) was lost in extraction -- confirm against upstream.

  const SIInstrInfo *TII;

public:
  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
  // NOTE(review): additional parameter lines of this pure-virtual signature
  // were lost in extraction -- confirm against upstream.
  virtual bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  // Returns true if this strategy should be applied to a ScheduleDAG.
  // NOTE(review): the remainder of this signature was lost in extraction.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,

  // Direction hint consumed by the PipelineSolver; subclasses may override
  // the default in their constructors.
  bool IsBottomUp = true;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};
850
// Strategy interleaving DS and MFMA SchedGroups (see applyIGLPStrategy
// definition below); applies unconditionally.
class MFMASmallGemmOpt final : public IGLPStrategy {
private:
public:
  // NOTE(review): additional parameter lines of this declaration were lost in
  // extraction -- confirm against upstream.
  bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  // NOTE(review): the remainder of this signature was lost in extraction.
  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
    return true;
  }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    // This strategy's pipeline is built bottom-up.
    IsBottomUp = true;
  }
};
869
// Build the MFMA small-GEMM pipeline: for three times the MFMA count, request
// alternating groups of two DS instructions followed by one MFMA, all in a
// single sync pipeline (SyncID 0).
// NOTE(review): parameter lines of this definition (the SyncedInstrs map used
// below, and the trailing parameters/closing of the signature) were lost in
// extraction -- confirm against upstream.
bool MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    // Two DS instructions...
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

    // ...followed by one MFMA.
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
894
/// Strategy that interleaves TRANS (e.g. V_EXP), V_CVT, FMA, and MFMA
/// instructions. The pipeline shape is driven by per-region heuristics
/// computed in analyzeDAG(); the nested InstructionRule classes below
/// classify SUnits for the SchedGroups built in applyIGLPStrategy().
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
  // The count of TRANS SUs involved in the interleaved pipeline
  static unsigned TransPipeCount;
  // The count of MFMA SUs involved in the interleaved pipeline
  static unsigned MFMAPipeCount;
  // The count of Add SUs involved in the interleaved pipeline
  static unsigned AddPipeCount;
  // The number of transitive MFMA successors for each TRANS SU
  static unsigned MFMAEnablement;
  // The number of transitive TRANS predecessors for each MFMA SU
  static unsigned ExpRequirement;
  // The count of independent "chains" of MFMA instructions in the pipeline
  static unsigned MFMAChains;
  // Whether or not the pipeline has V_CVT instructions
  static bool HasCvt;
  // Whether or not there are instructions between the TRANS instruction and
  // V_CVT
  static bool HasChainBetweenCvt;
  // The first occurring DS_READ which feeds an MFMA chain
  static std::optional<unsigned> FirstPipeDSR;
  // The MFMAPipe SUs with no MFMA predecessors
  SmallVector<SUnit *, 4> MFMAChainSeeds;
  // Compute the heuristics for the pipeline, returning whether or not the DAG
  // is well formatted for the mutation
  bool analyzeDAG(const SIInstrInfo *TII);

  /// Whether or not the instruction is a transitive predecessor of an MFMA
  /// instruction
  class IsPipeExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto *DAG = SyncPipe[0].DAG;

      // Lazily collect every MFMA/WMMA SU in the region (scanned in
      // reverse order).
      if (Cache->empty()) {
        auto I = DAG->SUnits.rbegin();
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);
        }
        if (Cache->empty())
          return false;
      }

      // The rule holds if any cached MFMA is reachable from \p SU.
      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
        return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
      });

      return Reaches;
    }
    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  /// Whether or not the instruction is a transitive predecessor of the
  /// \p Number th MFMA of the MFMAs occurring after a TRANS instruction
  class EnablesNthMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      bool FoundTrans = false;
      unsigned Counter = 1;
      auto *DAG = SyncPipe[0].DAG;

      // Cache the \p Number th MFMA/WMMA that occurs after the first TRANS
      // SU in the region (in SUnit order).
      if (Cache->empty()) {
        auto I = DAG->SUnits.begin();
        auto E = DAG->SUnits.end();
        for (; I != E; I++) {
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
            if (Counter == Number) {
              Cache->push_back(&*I);
              break;
            }
            ++Counter;
          }
          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
            FoundTrans = true;
        }
        if (Cache->empty())
          return false;
      }

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
    }

    EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };

  /// Whether or not the instruction enables the exact MFMA that is the \p
  /// Number th MFMA in the chain starting with \p ChainSeed
  class EnablesNthMFMAInChain final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto *DAG = SyncPipe[0].DAG;

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      // Walk \p Number MFMA->MFMA successor links down the chain starting
      // at \p ChainSeed, caching the MFMA found there.
      if (Cache->empty()) {
        auto *TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found)
            return false;
        }

        Cache->push_back(TempSU);
      }
      // If we failed to find the instruction to be placed into the cache, we
      // would have already exited.
      assert(!Cache->empty());

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
    }

    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
                          const SIInstrInfo *TII, unsigned SGID,
                          bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };

  /// Whether or not the instruction has less than \p Size immediate successors.
  /// If \p HasIntermediary is true, this tests also whether all successors of
  /// the SUnit have less than \p Size successors.
  class LessThanNSuccs final : public InstructionRule {
  private:
    unsigned Size = 1;
    bool HasIntermediary = false;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;

      // Only data-dependence successors are counted.
      unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
        return Succ.getKind() == SDep::Data;
      });
      if (SuccSize >= Size)
        return false;

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          unsigned SuccSize =
              llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
              });
          if (SuccSize >= Size)
            return false;
        }
      }

      return true;
    }
    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
                   bool HasIntermediary = false, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  };

  /// Whether or not the instruction has greater than or equal to \p Size
  /// immediate successors. If \p HasIntermediary is true, this tests also
  /// whether all successors of the SUnit have greater than or equal to \p Size
  /// successors.
  class GreaterThanOrEqualToNSuccs final : public InstructionRule {
  private:
    unsigned Size = 1;
    bool HasIntermediary = false;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;

      // Only data-dependence successors are counted.
      unsigned SuccSize = llvm::count_if(SU->Succs, [](const SDep &Succ) {
        return Succ.getKind() == SDep::Data;
      });
      if (SuccSize >= Size)
        return true;

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          unsigned SuccSize =
              llvm::count_if(Succ.getSUnit()->Succs, [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
              });
          if (SuccSize >= Size)
            return true;
        }
      }

      return false;
    }
    GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
                               unsigned SGID, bool HasIntermediary = false,
                               bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  };

  // Whether or not the instruction is a relevant V_CVT instruction.
  class IsCvt final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto Opc = SU->getInstr()->getOpcode();
      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
             Opc == AMDGPU::V_CVT_I32_F32_e32;
    }
    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether or not the instruction is FMA_F32.
  class IsFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
    }
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether or not the instruction is a V_ADD_F32 instruction.
  class IsPipeAdd final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
    }
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  /// Whether or not the instruction is an immediate RAW successor
  /// of the SchedGroup \p Distance steps before.
  class IsSuccOfPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      // Locate the SchedGroup \p Distance groups before this one.
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      // An empty group trivially satisfies the rule.
      if (!OtherGroup->Collection.size())
        return true;

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
            return true;
        }
      }

      return false;
    }
    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

  /// Whether or not the instruction is a transitive successor of any
  /// instruction in the SchedGroup \p Distance steps before.
  class IsReachableFromPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      // Locate the SchedGroup \p Distance groups before this one.
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      // An empty group trivially satisfies the rule.
      if (!OtherGroup->Collection.size())
        return true;

      auto *DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
          return true;

      return false;
    }
    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

  /// Whether or not the instruction occurs after the SU with NodeNum \p Number
  class OccursAtOrAfterNode final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      return SU->NodeNum >= Number;
    }
    OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                        bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };

  /// Whether or not the SU is exactly the \p Number th MFMA in the chain
  /// starting with \p ChainSeed
  class IsExactMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      // Walk \p Number MFMA->MFMA successor links down the chain from
      // \p ChainSeed; \p SU must be that exact MFMA.
      if (Cache->empty()) {
        auto *TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found) {
            return false;
          }
        }
        Cache->push_back(TempSU);
      }
      // If we failed to find the instruction to be placed into the cache, we
      // would have already exited.
      assert(!Cache->empty());

      return (*Cache)[0] == SU;
    }

    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };

  // Whether the instruction occurs after the first TRANS instruction. This
  // implies the instruction can not be a predecessor of the first TRANS
  // instruction
  class OccursAfterExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto *DAG = SyncPipe[0].DAG;
      // Cache the first TRANS SU in the region.
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
            Cache->push_back(&SU);
            break;
          }
        if (Cache->empty())
          return false;
      }

      return SU->NodeNum > (*Cache)[0]->NodeNum;
    }

    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

public:
  // Builds the three-phase interleaved pipeline; defined out-of-line below.
  bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  // Runs analyzeDAG() (except post-RA) to decide applicability.
  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,

  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};
1330
// Out-of-line definitions of the static pipeline heuristics; the real values
// are computed per region by MFMAExpInterleaveOpt::analyzeDAG().
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1340
// Scan the region's DAG and populate the static pipeline heuristics
// (TransPipeCount, MFMAPipeCount, AddPipeCount, MFMAEnablement,
// ExpRequirement, MFMAChains, HasCvt, HasChainBetweenCvt, FirstPipeDSR) and
// MFMAChainSeeds. Returns false if the DAG does not have the shape this
// mutation expects.
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
  SmallVector<SUnit *, 10> ExpPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeSUs;

  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
  };

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
  };

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  // Single pass to bucket the region's SUs by role.
  AddPipeCount = 0;
  for (SUnit &SU : DAG->SUnits) {
    auto Opc = SU.getInstr()->getOpcode();
    if (TII->isTRANS(Opc)) {
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on
      if (SU.Succs.size() >= 7)
        continue;
      // NOTE(review): this `continue` only exits the inner loop, so the loop
      // below has no effect; it looks like it was meant to skip the
      // ExpPipeCands.push_back for such SUs — confirm the intent.
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)
          continue;
      }
      ExpPipeCands.push_back(&SU);
    }

    if (TII->isMFMAorWMMA(*SU.getInstr()))
      MFMAPipeCands.push_back(&SU);

    if (isBitPack(Opc))
      PackSUs.push_back(&SU);

    if (isCvt(Opc))
      CvtSUs.push_back(&SU);

    if (isAdd(Opc))
      ++AddPipeCount;
  }

  // The pipeline needs at least one of each of bit-pack, MFMA, and TRANS.
  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
    return false;

  TransPipeCount = 0;

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  // Count the number of EXPs that reach an MFMA
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        if (!TempExp) {
          // Remember a representative EXP/MFMA pair for later queries.
          TempExp = PredSU;
          TempMFMA = SuccSU;
        }
        MFMAPipeSUs.push_back(SuccSU);
        ++TransPipeCount;
        break;
      }
    }
  }

  if (!(TempExp && TempMFMA))
    return false;

  // True when no immediate successor of the representative EXP is a V_CVT.
  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
  });

  // Count the number of MFMAs that are reached by an EXP
  for (auto &SuccSU : MFMAPipeCands) {
    // Skip MFMAs already recorded above.
    if (MFMAPipeSUs.size() &&
        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
          return PotentialMatch->NodeNum == SuccSU->NodeNum;
        }))
      continue;

    for (auto &PredSU : ExpPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        MFMAPipeSUs.push_back(SuccSU);
        break;
      }
    }
  }

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);

  // Find a V_CVT reachable from the representative EXP...
  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {
    if (DAG->IsReachable(SuccSU, *TempExp)) {
      TempCvt = SuccSU;
      break;
    }
  }

  // ...and record whether it feeds the MFMA pipeline.
  HasCvt = false;
  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {
      if (DAG->IsReachable(SuccSU, *TempCvt)) {
        HasCvt = true;
        break;
      }
    }
  }

  // Chain seeds are pipeline MFMAs with no MFMA predecessor.
  MFMAChains = 0;
  for (auto &MFMAPipeSU : MFMAPipeSUs) {
    if (is_contained(MFMAChainSeeds, MFMAPipeSU))
      continue;
    if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
        })) {
      MFMAChainSeeds.push_back(MFMAPipeSU);
      ++MFMAChains;
    }
  }

  if (!MFMAChains)
    return false;

  // Remember the last DS load feeding the first chain seed (if any).
  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;
  }

  // The number of bit pack operations that depend on a single V_EXP
  unsigned PackSuccCount =
      llvm::count_if(PackSUs, [this, &TempExp](SUnit *VPack) {
        return DAG->IsReachable(VPack, *TempExp);
      });

  // The number of bit pack operations an MFMA depends on
  unsigned PackPredCount =
      llvm::count_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
        auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
        return isBitPack(Opc);
      });

  auto *PackPred = llvm::find_if((*TempMFMA)->Preds, [&isBitPack](SDep &Pred) {
    auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
    return isBitPack(Opc);
  });

  if (PackPred == (*TempMFMA)->Preds.end())
    return false;

  MFMAEnablement = 0;
  ExpRequirement = 0;
  // How many MFMAs depend on a single bit pack operation
  MFMAEnablement =
      llvm::count_if(PackPred->getSUnit()->Succs, [&TII](SDep &Succ) {
        return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
      });

  // The number of MFMAs that depend on a single V_EXP
  MFMAEnablement *= PackSuccCount;

  // The number of V_EXPs required to resolve all dependencies for an MFMA
  ExpRequirement =
      llvm::count_if(ExpPipeCands, [this, &PackPred](SUnit *ExpBase) {
        return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
      });

  ExpRequirement *= PackPredCount;
  return true;
}
1515
// Decide whether the interleave mutation applies to this region: outside the
// post-RA phase, the DAG must pass analyzeDAG(); post-RA it is always applied
// (the heuristics were computed in an earlier phase).
bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Seeds are per-region; recompute from scratch.
  MFMAChainSeeds.clear();
  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
    return false;

  return true;
}
1528
// Build the interleaved pipeline as a sequence of SchedGroups in three
// phases: (1) a "prefetch" preamble of FMA/DS_READ/EXP groups, (2) the main
// loop interleaving CVT, FMA, EXP, and MFMA groups, and (3) the remaining
// MFMAs. Only applies to the two kernel shapes recognized below.
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  // Only two specific pipeline shapes (from analyzeDAG) are supported.
  bool IsSmallKernelType =
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
  bool IsLargeKernelType =
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;

  if (!(IsSmallKernelType || IsLargeKernelType))
    return false;

  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;

  // Cursor over (chain, position-in-chain) for TRANS-related rules.
  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;
  };

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);
  };

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;
  };

  // Separate cursor for the MFMA groups themselves.
  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    ++CurrMFMAPosition;
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
  };

  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);

  // Which group kinds participate, by kernel type and phase.
  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;

  // PHASE 1: "Prefetch"
  if (UsesFMA) {
    // First Round FMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    } else
      SG->addRule(
          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

    // Second Round FMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
    } else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  if (UsesDSRead) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                      SG->getSGID()));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  // First Round EXP
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
  else
    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

  incrementTransPosition();

  // First Round CVT, Third Round FMA, Second Round EXP; interleaved
  for (unsigned I = 0; I < ExpRequirement; I++) {
    // First Round CVT
    if (UsesCvt) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
      // With an intermediary chain the CVT is only transitively reachable;
      // otherwise it is an immediate successor.
      if (HasChainBetweenCvt)
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      else
        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // Third Round FMA
    if (UsesFMA) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains) {
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
      } else
        SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                     TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // Second Round EXP
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  // The "extra" EXP which enables all MFMA
  // TODO: UsesExtraExp
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

  // PHASE 2: Main Interleave Loop

  // The number of MFMAs per iteration
  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
  // The number of Exps per iteration
  unsigned ExpRatio =
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
  // The remaining Exps
  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)
                              : 0;
  unsigned ExpLoopCount = RemainingExp / ExpRatio;
  // In loop MFMAs
  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)
                            : 0;
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
  unsigned VALUOps =
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);

  for (unsigned I = 0; I < LoopSize; I++) {
    // Advance the TRANS cursor once per ExpRequirement EXPs issued.
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    // Round N MFMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));
    else
      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

    if (UsesVALU) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // A DS_READ group every fourth iteration.
    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                        SG->getSGID()));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }

    // CVT, EXP, FMA Interleaving
    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

      // Round N + 1 CVT
      if (UsesCvt) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
        // Offset (in SchedGroups) back to the EXP group this CVT depends on.
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
        auto DSROffset = I / 4 + 1;
        auto MaxDSROffset = MaxMFMAOffset / 4;
        // TODO: UsesExtraExp
        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
                             ExpOffset;
        if (HasChainBetweenCvt)
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
              CurrentOffset, TII, SG->getSGID()));
        else
          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
                                                             SG->getSGID()));
        SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 3 FMA
      if (UsesFMA) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        if (!IsPostRA && MFMAChains)
          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
              getNextTransPositionInChain(),
              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
              true));
        else
          SG->addRule(std::make_shared<EnablesNthMFMA>(
              (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
              TII, SG->getSGID(), true));
        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
        SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 2 Exp
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
            true));
      else
        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
    }
  }

  // PHASE 3: Remaining MFMAs
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  return true;
}
1819
/// Simple strategy that alternates singleton TRANS and MFMA groups (three
/// TRANS/MFMA pairs per MFMA in the region); see the out-of-line
/// applyIGLPStrategy definition below.
class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
public:
  bool applyIGLPStrategy(
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,

  // Always applicable, in every scheduling phase.
  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    // Fill the pipeline bottom-up.
    IsBottomUp = true;
  }
};
1837
// Emit MFMACount * 3 alternating (TRANS, MFMA) singleton SchedGroups, all
// sharing sync ID 0. Always succeeds.
bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
1861
1862class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
1863private:
1864 // Whether the DS_READ is a predecessor of first four MFMA in region
1865 class EnablesInitialMFMA final : public InstructionRule {
1866 public:
1867 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1868 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1869 if (!SyncPipe.size())
1870 return false;
1871 int MFMAsFound = 0;
1872 if (!Cache->size()) {
1873 for (auto &Elt : SyncPipe[0].DAG->SUnits) {
1874 if (TII->isMFMAorWMMA(*Elt.getInstr())) {
1875 ++MFMAsFound;
1876 if (MFMAsFound > 4)
1877 break;
1878 Cache->push_back(&Elt);
1879 }
1880 }
1881 }
1882
1883 auto *DAG = SyncPipe[0].DAG;
1884 for (auto &Elt : *Cache) {
1885 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
1886 return true;
1887 }
1888 return false;
1889 }
1890
1891 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
1892 bool NeedsCache = false)
1893 : InstructionRule(TII, SGID, NeedsCache) {}
1894 };
1895
1896 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
1897 class IsPermForDSW final : public InstructionRule {
1898 public:
1899 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1900 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1901 auto *MI = SU->getInstr();
1902 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1903 return false;
1904
1905 bool FitsInGroup = false;
1906 // Does the VALU have a DS_WRITE successor
1907 if (!Collection.size()) {
1908 for (auto &Succ : SU->Succs) {
1909 SUnit *SuccUnit = Succ.getSUnit();
1910 if (TII->isDS(*SuccUnit->getInstr()) &&
1911 SuccUnit->getInstr()->mayStore()) {
1912 Cache->push_back(SuccUnit);
1913 FitsInGroup = true;
1914 }
1915 }
1916 return FitsInGroup;
1917 }
1918
1919 // Does the VALU have a DS_WRITE successor that is the same as other
1920 // VALU already in the group. The V_PERMs will all share 1 DS_W succ
1921 return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
1922 return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
1923 return ThisSucc.getSUnit() == Elt;
1924 });
1925 });
1926 }
1927
1928 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1929 : InstructionRule(TII, SGID, NeedsCache) {}
1930 };
1931
1932 // Whether the SU is a successor of any element in previous SchedGroup
1933 class IsSuccOfPrevGroup final : public InstructionRule {
1934 public:
1935 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1936 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1937 SchedGroup *OtherGroup = nullptr;
1938 for (auto &PipeSG : SyncPipe) {
1939 if ((unsigned)PipeSG.getSGID() == SGID - 1) {
1940 OtherGroup = &PipeSG;
1941 }
1942 }
1943
1944 if (!OtherGroup)
1945 return false;
1946 if (!OtherGroup->Collection.size())
1947 return true;
1948
1949 // Does the previous VALU have this DS_Write as a successor
1950 return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
1951 return any_of(Elt->Succs,
1952 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
1953 });
1954 }
1955 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
1956 bool NeedsCache = false)
1957 : InstructionRule(TII, SGID, NeedsCache) {}
1958 };
1959
1960 // Whether the combined load width of group is 128 bits
1961 class VMEMSize final : public InstructionRule {
1962 public:
1963 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1964 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1965 auto *MI = SU->getInstr();
1966 if (MI->getOpcode() == TargetOpcode::BUNDLE)
1967 return false;
1968 if (!Collection.size())
1969 return true;
1970
1971 int NumBits = 0;
1972
1973 auto TRI = TII->getRegisterInfo();
1974 auto &MRI = MI->getMF()->getRegInfo();
1975 for (auto &Elt : Collection) {
1976 auto Op = Elt->getInstr()->getOperand(0);
1977 auto Size =
1978 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
1979 NumBits += Size;
1980 }
1981
1982 if (NumBits < 128) {
1983 assert(TII->isVMEM(*MI) && MI->mayLoad());
1984 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
1985 MRI, MI->getOperand(0))) <=
1986 128)
1987 return true;
1988 }
1989
1990 return false;
1991 }
1992
1993 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1994 : InstructionRule(TII, SGID, NeedsCache) {}
1995 };
1996
1997 /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
1998 /// that is \p Distance steps away
1999 class SharesPredWithPrevNthGroup final : public InstructionRule {
2000 private:
2001 unsigned Distance = 1;
2002
2003 public:
2004 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
2005 SmallVectorImpl<SchedGroup> &SyncPipe) override {
2006 SchedGroup *OtherGroup = nullptr;
2007 if (!SyncPipe.size())
2008 return false;
2009
2010 if (!Cache->size()) {
2011
2012 for (auto &PipeSG : SyncPipe) {
2013 if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
2014 OtherGroup = &PipeSG;
2015 }
2016 }
2017
2018 if (!OtherGroup)
2019 return false;
2020 if (!OtherGroup->Collection.size())
2021 return true;
2022
2023 for (auto &OtherEle : OtherGroup->Collection) {
2024 for (auto &Pred : OtherEle->Preds) {
2025 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2026 AMDGPU::V_PERM_B32_e64)
2027 Cache->push_back(Pred.getSUnit());
2028 }
2029 }
2030
2031 // If the other group has no PERM preds, then this group won't share any
2032 if (!Cache->size())
2033 return false;
2034 }
2035
2036 auto *DAG = SyncPipe[0].DAG;
2037 // Does the previous DS_WRITE share a V_PERM predecessor with this
2038 // VMEM_READ
2039 return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
2040 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
2041 });
2042 }
2043 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
2044 unsigned SGID, bool NeedsCache = false)
2045 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
2046 };
2047
2048public:
2049 bool applyIGLPStrategy(
2051 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2053
2054 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2055 AMDGPU::SchedulingPhase Phase) override {
2056 return true;
2057 }
2058
2059 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
2060 : IGLPStrategy(DAG, TII) {
2061 IsBottomUp = false;
2062 }
2063};
2064
// DS_WRITE statistics for MFMASmallGemmSingleWaveOpt. They are computed once
// during the Initial scheduling phase and reused when the mutation re-enters
// in later phases (the applyIGLPStrategy assert requires them to be zero
// before the first pre-RA run).
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
2068
2069bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2070 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2071 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2073 unsigned MFMACount = 0;
2074 unsigned DSRCount = 0;
2075
2076 bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2077
2078 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2079 DSWWithSharedVMEMCount == 0)) &&
2080 "DSWCounters should be zero in pre-RA scheduling!");
2081 SmallVector<SUnit *, 6> DSWithPerms;
2082 for (auto &SU : DAG->SUnits) {
2083 auto *I = SU.getInstr();
2084 if (TII->isMFMAorWMMA(*I))
2085 ++MFMACount;
2086 else if (TII->isDS(*I)) {
2087 if (I->mayLoad())
2088 ++DSRCount;
2089 else if (I->mayStore() && IsInitial) {
2090 ++DSWCount;
2091 for (auto Pred : SU.Preds) {
2092 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2093 AMDGPU::V_PERM_B32_e64) {
2094 DSWithPerms.push_back(&SU);
2095 break;
2096 }
2097 }
2098 }
2099 }
2100 }
2101
2102 if (IsInitial) {
2103 DSWWithPermCount = DSWithPerms.size();
2104 auto *I = DSWithPerms.begin();
2105 auto *E = DSWithPerms.end();
2106
2107 // Get the count of DS_WRITES with V_PERM predecessors which
2108 // have loop carried dependencies (WAR) on the same VMEM_READs.
2109 // We consider partial overlap as a miss -- in other words,
2110 // for a given DS_W, we only consider another DS_W as matching
2111 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
2112 // for every V_PERM pred of this DS_W.
2113 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2115 for (; I != E; I++) {
2116 SUnit *Cand = nullptr;
2117 bool MissedAny = false;
2118 for (auto &Pred : (*I)->Preds) {
2119 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2120 continue;
2121
2122 if (Cand && llvm::is_contained(Counted, Cand))
2123 break;
2124
2125 for (auto &Succ : Pred.getSUnit()->Succs) {
2126 auto *MI = Succ.getSUnit()->getInstr();
2127 if (!TII->isVMEM(*MI) || !MI->mayLoad())
2128 continue;
2129
2130 if (MissedAny || !VMEMLookup.size()) {
2131 MissedAny = true;
2132 VMEMLookup[MI] = *I;
2133 continue;
2134 }
2135
2136 auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
2137 if (Inserted) {
2138 MissedAny = true;
2139 continue;
2140 }
2141
2142 Cand = It->second;
2143 if (llvm::is_contained(Counted, Cand)) {
2144 MissedAny = true;
2145 break;
2146 }
2147 }
2148 }
2149 if (!MissedAny && Cand) {
2150 DSWWithSharedVMEMCount += 2;
2151 Counted.push_back(Cand);
2152 Counted.push_back(*I);
2153 }
2154 }
2155 }
2156
2157 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2158 SchedGroup *SG;
2159 unsigned PipelineSyncID = 0;
2160 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
2161 if (DSWWithPermCount) {
2162 for (unsigned I = 0; I < MFMACount; I++) {
2163 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2164 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2165 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2166
2167 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2168 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
2169 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2170 }
2171 }
2172
2173 PipelineSyncID = 1;
2174 // Phase 1: Break up DS_READ and MFMA clusters.
2175 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
2176 // prefetch
2177
2178 // Make ready initial MFMA
2179 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2180 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
2181 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
2182 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2183
2184 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2185 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2186 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2187
2188 // Interleave MFMA with DS_READ prefetch
2189 for (unsigned I = 4; I < DSRCount; ++I) {
2190 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2191 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
2192 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2193
2194 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2195 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2196 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2197 }
2198
2199 // Phase 2a: Loop carried dependency with V_PERM
2200 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2201 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2202 for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
2203 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2204 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2205 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2206 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2207
2208 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2209 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2210 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2211 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2212
2213 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2214 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2215 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2216 1, TII, SG->getSGID(), true));
2217 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2218 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2219
2220 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2221 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2222 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2223
2224 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2225 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2226 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2227 3, TII, SG->getSGID(), true));
2228 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2229 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2230
2231 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2232 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2233 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2234 }
2235
2236 // Phase 2b: Loop carried dependency without V_PERM
2237 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
2238 // Interleave MFMA to keep XDL unit busy throughout.
2239 for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
2240 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2241 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2242 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2243
2244 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2245 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2246 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2247 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2248
2249 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2250 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2251 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2252 }
2253
2254 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
2255 // ultimately used by two DS_WRITE
2256 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2257 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2258
2259 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
2260 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2261 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2262 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2263 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2264
2265 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2266 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2267 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2268 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2269
2270 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2271 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2272 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2273
2274 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2275 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2276 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2277 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2278
2279 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2280 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2281 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2282 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2283
2284 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2285 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2286 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2287
2288 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2289 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2290 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2291 2, TII, SG->getSGID(), true));
2292 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2293 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2294
2295 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2296 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2297 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2298
2299 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2300 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2301 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2302 4, TII, SG->getSGID(), true));
2303 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2304 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2305
2306 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2307 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2308 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2309 }
2310
2311 return true;
2312}
2313
2314static std::unique_ptr<IGLPStrategy>
2315createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2316 const SIInstrInfo *TII) {
2317 switch (ID) {
2318 case MFMASmallGemmOptID:
2319 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
2321 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2323 return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2325 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
2326 }
2327
2328 llvm_unreachable("Unknown IGLPStrategyID");
2329}
2330
// DAG mutation that implements SCHED_BARRIER, SCHED_GROUP_BARRIER and
// IGLP_OPT handling by adding artificial edges between SchedGroups.
class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Organize lists of SchedGroups by their SyncID. SchedGroups /
  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
  // between them.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  // Used to track instructions that can be mapped to multiple sched groups
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER. This is used for the base
  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
  // SCHED_BARRIER will always block all instructions that can be classified
  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
  // and may only synchronize with some SchedGroups. Returns the inverse of
  // Mask. SCHED_BARRIER's mask describes which instruction types should be
  // allowed to be scheduled across it. Invert the mask to get the
  // SchedGroupMask of instructions that should be barred.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  // Create and apply the strategy requested by an IGLP_OPT instruction.
  bool initIGLPOpt(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  // The order in which the PipelineSolver should process the candidate
  // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
  // created SchedGroup first, and will consider that as the ultimate
  // predecessor group when linking. TOP_DOWN instead links and processes the
  // first created SchedGroup first.
  bool IsBottomUp = true;

  // The scheduling phase this application of IGLP corresponds with.
  AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;

  IGroupLPDAGMutation() = default;
  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
};
2381
2382unsigned SchedGroup::NumSchedGroups = 0;
2383
2384bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
2385 return A != B && DAG->addEdge(B, SDep(A, SDep::Artificial));
2386}
2387
// Classify \p MI against this group's mask (SGMask). Returns true if the
// instruction can legally be placed in this SchedGroup. Note this is a
// first-match if/else chain over instruction categories; the order of the
// checks is significant (e.g. VALU explicitly excludes MFMA/TRANS).
bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  // Meta instructions are never classified into any group.
  if (MI.isMetaInstruction())
    Result = false;

  else if (MI.isInlineAsm()) {
    // Inline asm carries no opcode class, so build a guessed SchedGroupMask
    // from the register classes it touches and its load/store flags, then
    // intersect with this group's mask.
    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    auto &MRI = MI.getParent()->getParent()->getRegInfo();
    bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
         VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),
         MayStore = MI.mayStore();
    for (const MachineOperand &Operand : MI.operands())
      if (Operand.isReg()) {
        const TargetRegisterClass &RegClass =
            *TRI.getRegClassForOperandReg(MRI, Operand);
        if (TRI.hasVGPRs(&RegClass)) {
          VGPR_used = true;
          if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
            VReg32_used = true;
        }
        // > 128 bit registers are usually only used by MFMA instructions, so
        // we're using that as a heuristic to guess the schedule group mask of
        // the inline asm.
        if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
          VMFMA_used = true;
        if (TRI.hasSGPRs(&RegClass))
          SGPR_used = true;
        if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
          SGPR_big_def = true;
      }

    typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
    SGMask_t InlineAsmMask = 0;
    if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
    if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
    if (VMFMA_used)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
    // 32-bit VGPR use is taken as a hint that the asm accesses LDS rather
    // than global memory.
    if (VGPR_used && MayLoad)
      InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
                                              : SchedGroupMask::VMEM_READ);
    if (VGPR_used && MayStore)
      InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
                                              : SchedGroupMask::VMEM_WRITE);
    if (SGPR_big_def)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
    // Propagate the specific bits to their umbrella categories.
    if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
        InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
    if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
        InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
    if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
        InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;

    Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
  }

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
            TII->isTRANS(MI)))
    Result = !MI.mayLoadOrStore();

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI) &&
           !TII->isLDSDMA(MI)) {
    // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
    // For our purposes, these shall not be classified as VALU as this results
    // in unexpected behavior.
    Result = !MI.mayLoadOrStore();
  }

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
           TII->isSALU(MI))
    Result = !MI.mayLoadOrStore();

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
           (TII->isVMEM(MI) || TII->isLDSDMA(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isVMEM(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isVMEM(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
           (TII->isDS(MI) || TII->isLDSDMA(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
           TII->isTRANS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::LDSDMA) != SchedGroupMask::NONE) &&
           TII->isLDSDMA(MI))
    Result = true;

  LLVM_DEBUG(
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);

  return Result;
}
2508
2509int SchedGroup::link(SUnit &SU, bool MakePred,
2510 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2511 int MissedEdges = 0;
2512 for (auto *A : Collection) {
2513 SUnit *B = &SU;
2514 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2515 continue;
2516 if (MakePred)
2517 std::swap(A, B);
2518
2519 if (DAG->IsReachable(B, A))
2520 continue;
2521
2522 // tryAddEdge returns false if there is a dependency that makes adding
2523 // the A->B edge impossible, otherwise it returns true;
2524 bool Added = tryAddEdge(A, B);
2525 if (Added)
2526 AddedEdges.emplace_back(A, B);
2527 else
2528 ++MissedEdges;
2529 }
2530
2531 return MissedEdges;
2532}
2533
2534void SchedGroup::link(SUnit &SU, bool MakePred) {
2535 for (auto *A : Collection) {
2536 SUnit *B = &SU;
2537 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2538 continue;
2539 if (MakePred)
2540 std::swap(A, B);
2541
2542 tryAddEdge(A, B);
2543 }
2544}
2545
2546void SchedGroup::link(SUnit &SU,
2547 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
2548 for (auto *A : Collection) {
2549 SUnit *B = &SU;
2550 if (P(A, B))
2551 std::swap(A, B);
2552
2553 tryAddEdge(A, B);
2554 }
2555}
2556
// Link every SUnit of \p OtherGroup as a successor of this group's members.
void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
    link(*B);
}
2561
// Returns true if \p SU can be placed in this group. A BUNDLE is accepted
// only if every instruction inside the bundle is individually acceptable.
bool SchedGroup::canAddSU(SUnit &SU) const {
  MachineInstr &MI = *SU.getInstr();
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  // Special case for bundled MIs.
  const MachineBasicBlock *MBB = MI.getParent();
  // Note: `E = ++B` advances B as well, so both B and E start at the first
  // instruction after the BUNDLE header; E is then walked to the bundle end.
  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
  while (E != MBB->end() && E->isBundledWithPred())
    ++E;

  // Return true if all of the bundled MIs can be added to this group.
  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
}
2576
2577template <class T>
2578void SchedGroup::findCandidateSUnits(T Begin, T End,
2579 SUnitsToCandidateSGsMap &SyncedInstrs) {
2580 for (SUnit &SU : make_range(Begin, End)) {
2581 if (canAddSU(SU))
2582 SyncedInstrs[&SU].push_back(SGID);
2583 }
2584}
2585
// Convenience overload: scan the whole region's SUnits in reverse order.
void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) {
  findCandidateSUnits(DAG->SUnits.rbegin(), DAG->SUnits.rend(), SyncedInstrs);
}
2589
// Entry point of the mutation: scan the region bottom-up for scheduling
// directives (SCHED_BARRIER, SCHED_GROUP_BARRIER, IGLP_OPT), build the
// corresponding SchedGroups, and let PipelineSolver add the edges.
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  // Reset per-region state; this mutation may run once per region.
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool FoundSB = false;
  bool FoundIGLP = false;
  bool ShouldApplyIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      FoundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      FoundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      // Only the first IGLP_OPT is honored, and only if no barrier was seen.
      if (!FoundSB && !FoundIGLP) {
        FoundIGLP = true;
        ShouldApplyIGLP = initIGLPOpt(*R);
      }
    }
  }

  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
    // PipelineSolver performs the mutation by adding the edges it
    // determined as the best
    PS.solve();
    return;
  }
}
2629
2630void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2631 MachineInstr &MI = *SchedBarrier.getInstr();
2632 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2633 LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2634 << MI.getOperand(0).getImm() << "\n");
2635 auto InvertedMask =
2636 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
2637 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2638
2639 for (SUnit &SU : DAG->SUnits)
2640 if (SG.canAddSU(SU))
2641 SG.add(SU);
2642
2643 // Preserve original instruction ordering relative to the SCHED_BARRIER.
2644 SG.link(
2645 SchedBarrier,
2646 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2647 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
2648}
2649
// Compute the mask of instruction classes that a SCHED_BARRIER blocks.
// The barrier's operand lists what may cross it, so the result is the
// bitwise inverse, with umbrella/member bits kept mutually consistent.
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert mask and erase bits for types of instructions that are implied to be
  // allowed past the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA, TRANS.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
  // VALU, SALU, MFMA, TRANS implies ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE, LDSDMA.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE &
                    ~SchedGroupMask::LDSDMA;
  // VMEM_READ, VMEM_WRITE, LDSDMA implies VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) ==
               SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::LDSDMA) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE, LDSDMA.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE &
                    ~SchedGroupMask::LDSDMA;
  // DS_READ, DS_WRITE implies DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
                    << "\n");

  return InvertedMask;
}
2692
2693void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2694 std::vector<SUnit>::reverse_iterator RIter) {
2695 MachineInstr &SGB = *RIter->getInstr();
2696 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2697 int32_t SGMask = SGB.getOperand(0).getImm();
2698 int32_t Size = SGB.getOperand(1).getImm();
2699 int32_t SyncID = SGB.getOperand(2).getImm();
2700
2701 Size++; // Make room for the SCHED_GROUP_BARRIER instruction
2702 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2703 Size, SyncID, DAG, TII);
2704 SG.add(*RIter);
2705 SG.findCandidateSUnits(RIter, SG.DAG->SUnits.rend(),
2706 SyncedInstrs[SG.getSyncID()]);
2707}
2708
2709bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2710 IGLPStrategyID StrategyID =
2712 auto S = createIGLPStrategy(StrategyID, DAG, TII);
2713 if (!S->shouldApplyStrategy(DAG, Phase))
2714 return false;
2715
2716 IsBottomUp = S->IsBottomUp;
2717 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2718}
2719
2720} // namespace
2721
2722/// \p Phase specifes whether or not this is a reentry into the
2723/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2724/// same scheduling region (e.g. pre and post-RA scheduling / multiple
2725/// scheduling "phases"), we can reenter this mutation framework more than once
2726/// for a given region.
2727std::unique_ptr<ScheduleDAGMutation>
2729 return std::make_unique<IGroupLPDAGMutation>(Phase);
2730}
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock & MBB
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the DenseMap class.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
#define T
#define P(N)
Interface definition for SIInstrInfo.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
unsigned size() const
Definition DenseMap.h:110
const HexagonRegisterInfo & getRegisterInfo() const
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
int64_t getImm() const
Scheduling dependency.
Definition ScheduleDAG.h:51
SUnit * getSUnit() const
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
Scheduling unit. This is a node in the scheduling DAG.
unsigned NodeNum
Entry # of node in the node vector.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
std::vector< SUnit > SUnits
The scheduling units.
MachineFunction & MF
Machine function.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
IGLPStrategyID
Operand 0 immediate for IGLP_OPT pseudo instructions.
@ MFMASmallGemmSingleWaveOptID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
Definition Format.h:191
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1448