#define DEBUG_TYPE "igrouplp"
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "

    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver"));

    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
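// Editor's note (hedged usage sketch): the descriptions above belong to this
// pass's solver-related cl::opt flags. Only the exact-solver switch is named
// in the text itself (amdgpu-igrouplp-exact-solver); assuming it is registered
// as an ordinary backend cl::opt, it could be toggled from llc, for example:
//
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-igrouplp-exact-solver kernel.ll
//
// The cutoff, max-branches, and cost-heuristic options are the knobs the
// PipelineSolver below consults; their exact flag names are elided here, and
// the command line above is illustrative only.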
enum class SchedGroupMask {
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
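// Editor's note (hedged sketch): SchedGroupMask is used as a bitmask, so the
// categories compose with bitwise OR and membership is tested against NONE,
// exactly as the classification code later in this file does. For example:
//
//   SchedGroupMask M = SchedGroupMask::DS | SchedGroupMask::DS_READ;
//   bool AllowsDSRead = (M & SchedGroupMask::DS_READ) != SchedGroupMask::NONE;
//
// The individual enumerators ORed into ALL above are elided in this fragment;
// the snippet only restates how the mask is combined and queried.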
class InstructionRule {
  std::optional<SmallVector<SUnit *, 4>> Cache;

  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)

  virtual ~InstructionRule() = default;

class SchedGroup {
  SchedGroupMask SGMask;

  std::optional<unsigned> MaxSize;

  static unsigned NumSchedGroups;
  bool canAddSU(SUnit &SU) const;

  void link(SUnit &SU, bool MakePred = false);

  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  void link(SchedGroup &OtherGroup);

  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  void addRule(std::shared_ptr<InstructionRule> NewRule) {

  bool allowedByRules(const SUnit *SU,
    for (auto &Rule : Rules) {
      if (!Rule->apply(SU, Collection, SyncPipe))

  void add(SUnit &SU) {
              << format_hex((int)SGMask, 10, true) << " adding "

  void pop() { Collection.pop_back(); }

  void initSchedGroup();

  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
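// Editor's note (hedged usage sketch, mirroring the pattern the strategies in
// this file use): a pipeline stage is built by appending a SchedGroup to the
// per-SyncID list, optionally attaching InstructionRules, and seeding it with
// candidate SUnits:
//
//   SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
//       SchedGroupMask::MFMA, /*MaxSize=*/1, PipelineSyncID, DAG, TII);
//   SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
//   SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
//
// This restates calls that appear verbatim later in the file; it is not an
// additional API.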
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;

class PipelineSolver {
  bool NeedsSolver = false;

  unsigned computeProblemSize();

  int CurrConflInstNo = 0;
  int CurrSyncGroupIdx = 0;
  int BeginSyncGroupIdx = 0;

  bool IsBottomUp = true;

  void advancePosition();
  void retreatPosition();

  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,

  template <typename T>

  template <typename T> void linkSchedGroups(T I, T E);
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);

  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  void convertSyncMapsToArrays();

      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {

      SG.Collection.clear();

      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);

  CurrSyncGroupIdx = BeginSyncGroupIdx;
void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));

      auto *SortPosition = PipelineInstrs[PipelineIDx].begin();

      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)

      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
template <typename T>
void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {

    for (auto J = std::next(I); J != E; ++J) {
void PipelineSolver::makePipeline() {

  for (auto &SyncPipeline : BestPipeline) {

    for (auto &SG : SyncPipeline) {

      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

      SG.link(*SGBarr, false);

  for (auto &SyncPipeline : BestPipeline) {
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,

  bool MakePred = false;

    if (I->getSGID() == SGID) {

    AddedCost += Group.link(*SU, MakePred, AddedEdges);

int PipelineSolver::addEdges(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {

  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {

  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
void PipelineSolver::advancePosition() {

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {

    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {

  if (CurrConflInstNo == 0) {

    if (CurrSyncGroupIdx == BeginSyncGroupIdx)

    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
void PipelineSolver::populateReadyList(

  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;

      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);

      ReadyList.push_back(std::pair(*I, -1));

  assert(ReadyList.size() == CurrSU.second.size());
bool PipelineSolver::solveExact() {

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
                                 CurrSU.second.rend())
             : populateReadyList(ReadyList, CurrSU.second.begin(),
                                 CurrSU.second.end());

  auto *I = ReadyList.begin();
  auto *E = ReadyList.end();
  for (; I != E; ++I) {

    if (BestCost != -1 && (CurrCost + I->second > BestCost))

    int CandSGID = I->first;

    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];

    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)

      if (!Match->allowedByRules(CurrSU.first, SyncPipeline))

                        << (int)Match->getMask() << " and ID " << CandSGID
      Match->add(*CurrSU.first);
      AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
      CurrCost += AddedCost;

      bool FinishedExploring = false;

      if (CurrCost < BestCost || BestCost == -1) {

          FinishedExploring = BestCost != 0;
          if (!FinishedExploring)

      CurrCost -= AddedCost;
      removeEdges(AddedEdges);

      CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
      if (FinishedExploring)

    CurrCost += MissPenalty;

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {

      bool FinishedExploring = BestCost != 0;
      if (!FinishedExploring)

  CurrCost -= MissPenalty;
  return FinishedExploring;
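  // Editor's note (hedged summary of the recursion above): solveExact() walks
  // one conflicted SUnit at a time, tries each candidate SchedGroup from the
  // ready list, prunes any branch whose running cost already exceeds BestCost,
  // and finally explores leaving the SUnit unassigned at MissPenalty:
  //
  //   for each (CandSGID, Cost) in ReadyList:
  //     assign SU to CandSGID, CurrCost += Cost, recurse, then undo
  //   CurrCost += MissPenalty   // the "skip this SU" branch
  //   recurse, then undo
  //
  // This is a plain branch-and-bound restatement of the code, not extra logic.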
template <typename T>
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  int BestNodeCost = -1;

  SchedGroup *BestGroup = nullptr;
  int BestGroupID = -1;
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");

    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);

    if (TempCost < BestNodeCost || BestNodeCost == -1) {

      BestNodeCost = TempCost;
      BestGroupID = CandSGID;

    removeEdges(AddedEdges);
    if (BestNodeCost == 0)

  if (BestGroupID != -1) {
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                      << (int)BestGroup->getMask() << "\n");
    BestCost += TempCost;

    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
bool PipelineSolver::solveGreedy() {

  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());

  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
void PipelineSolver::solve() {

  unsigned ProblemSize = computeProblemSize();

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;
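  // Editor's note (worked example of the arithmetic above): with a problem
  // size of, say, 7 conflicted instructions, MissPenalty = (7 / 2) + 1 = 4,
  // so skipping an instruction costs more than a typical single missed edge.
  // The exact solver is only attempted when its flag is set or the problem
  // size is at or below the cutoff; otherwise the greedy result stands.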
  if (EnableExactSolver || BelowCutoff) {

    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");

      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
  MFMAExpInterleaveID = 2,
  MFMAExpSimpleInterleaveID = 3

  virtual bool applyIGLPStrategy(

  bool IsBottomUp = true;

  virtual ~IGLPStrategy() = default;
class MFMASmallGemmOpt final : public IGLPStrategy {

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

bool MFMASmallGemmOpt::applyIGLPStrategy(

  unsigned MFMACount = 0;

    if (TII->isMFMAorWMMA(I))

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
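// Editor's note (hedged illustration): the loop above emits, per iteration,
// one DS group of size 2 followed by one MFMA group of size 1, repeated
// MFMACount * 3 times within sync pipeline 0, i.e. a requested interleaving
// of roughly DS, DS, MFMA, DS, DS, MFMA, ... This only restates the group
// sizes visible in the code; it is not a claim about the final schedule the
// solver will produce.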
class MFMAExpInterleaveOpt final : public IGLPStrategy {

  static unsigned TransPipeCount;
  static unsigned MFMAPipeCount;
  static unsigned AddPipeCount;
  static unsigned MFMAEnablement;
  static unsigned ExpRequirement;
  static unsigned MFMAChains;
  static unsigned MFMAChainLength;

  static bool HasChainBetweenCvt;
  static std::optional<unsigned> FirstPipeDSR;
  class IsPipeExp final : public InstructionRule {

      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.rbegin();
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);

      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
        return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));

    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class EnablesNthMFMA final : public InstructionRule {

      bool FoundTrans = false;
      unsigned Counter = 1;
      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.begin();
        auto E = DAG->SUnits.end();
        for (; I != E; I++) {
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {

              Cache->push_back(&*I);

          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));

                   bool NeedsCache = false)
  class EnablesNthMFMAInChain final : public InstructionRule {

      auto *DAG = SyncPipe[0].DAG;

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;

          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();

        Cache->push_back(TempSU);

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));

    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
                          bool NeedsCache = false)
          ChainSeed(ChainSeed) {}
  class LessThanNSuccs final : public InstructionRule {

    bool HasIntermediary = false;

      if (!SyncPipe.size())

        return Succ.getKind() == SDep::Data;

      if (SuccSize >= Size)

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {

            return SuccSucc.getKind() == SDep::Data;

          if (SuccSize >= Size)

                   bool HasIntermediary = false, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  class GreaterThanOrEqualToNSuccs final : public InstructionRule {

    bool HasIntermediary = false;

      if (!SyncPipe.size())

        return Succ.getKind() == SDep::Data;

      if (SuccSize >= Size)

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {

            return SuccSucc.getKind() == SDep::Data;

          if (SuccSize >= Size)

                                 unsigned SGID, bool HasIntermediary = false,
                                 bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  class IsCvt final : public InstructionRule {

      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
             Opc == AMDGPU::V_CVT_I32_F32_e32;

    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsFMA final : public InstructionRule {

    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsPipeAdd final : public InstructionRule {

    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class IsSuccOfPrevNthGroup final : public InstructionRule {

    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)

                          unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  class IsReachableFromPrevNthGroup final : public InstructionRule {

    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      auto *DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))

    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  class OccursAtOrAfterNode final : public InstructionRule {

                        bool NeedsCache = false)

  class IsExactMFMA final : public InstructionRule {

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;

          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();

        Cache->push_back(TempSU);

      return (*Cache)[0] == SU;

                unsigned SGID, bool NeedsCache = false)
          ChainSeed(ChainSeed) {}
  class OccursAfterExp final : public InstructionRule {

      auto *DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
            Cache->push_back(&SU);

      return SU->NodeNum > (*Cache)[0]->NodeNum;

                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  for (SUnit &SU : DAG->SUnits) {

    if (SU.Succs.size() >= 7)

    for (auto &Succ : SU.Succs) {
      if (Succ.getSUnit()->Succs.size() >= 7)

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;

  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {

  if (!(TempExp && TempMFMA))

  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());

  for (auto &SuccSU : MFMAPipeCands) {
    if (MFMAPipeSUs.size() &&
        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
          return PotentialMatch->NodeNum == SuccSU->NodeNum;

    for (auto &PredSU : ExpPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);

  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {
    if (DAG->IsReachable(SuccSU, *TempExp)) {

  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {
      if (DAG->IsReachable(SuccSU, *TempCvt)) {

  for (auto &MFMAPipeSU : MFMAPipeSUs) {
          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
      MFMAChainSeeds.push_back(MFMAPipeSU);

  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;

  MFMAChainLength = MFMAPipeCount / MFMAChains;

  unsigned PackSuccCount =
        return DAG->IsReachable(VPack, *TempExp);

  unsigned PackPredCount =
        auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
        return isBitPack(Opc);

    auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
    return isBitPack(Opc);

  if (PackPred == (*TempMFMA)->Preds.end())

    return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());

  MFMAEnablement *= PackSuccCount;

    return DAG->IsReachable(PackPred->getSUnit(), ExpBase);

  ExpRequirement *= PackPredCount;

  MFMAChainSeeds.clear();
bool MFMAExpInterleaveOpt::applyIGLPStrategy(

  bool IsSmallKernelType =
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
  bool IsLargeKernelType =
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;

  if (!(IsSmallKernelType || IsLargeKernelType))

  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;

  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;
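  // Editor's note (worked example of the chain-position arithmetic above,
  // using illustrative values): with MFMAEnablement == 2 and MFMAChains == 4,
  // each incrementTransPosition() advances CurrMFMAForTransPosition by 2, so
  // successive calls yield (PositionInChain, MFMAChain) = (0, 2), (1, 0),
  // (1, 2), (2, 0), ... The TRANS groups therefore walk round-robin across
  // the MFMA chains while slowly moving down each chain.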
  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;

  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);

  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains) {
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),

        std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains) {
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        getNextTransPositionInChain(),
        MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));

    SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                 SG->getSGID(), true));
  SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));

    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  incrementTransPosition();
  for (unsigned I = 0; I < ExpRequirement; I++) {

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
    if (HasChainBetweenCvt)
      SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
          1 + (2 + UsesFMA) * I, TII, SG->getSGID()));

      SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
          1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));

      SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                   TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),

      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;

  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)
  unsigned ExpLoopCount = RemainingExp / ExpRatio;

  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
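  // Editor's note (worked example of the loop sizing above, using the "small
  // kernel" values checked earlier: MFMAEnablement == 2, ExpRequirement == 4,
  // TransPipeCount == 32): MFMARatio = 1, ExpRatio = 4 / 2 = 2, RemainingExp =
  // 32 - 8 = 24, ExpLoopCount = 24 / 2 = 12; assuming for illustration that
  // MFMAPipeCount == 16, MFMAInLoop = 16 - 4 = 12, MFMALoopCount = 12, and
  // LoopSize = min(12, 12) = 12 main-loop iterations. The MFMAPipeCount value
  // is illustrative, not taken from the source.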
  for (unsigned I = 0; I < LoopSize; I++) {
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));

      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
        auto DSROffset = I / 4 + 1;
        auto MaxDSROffset = MaxMFMAOffset / 4;

        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
        if (HasChainBetweenCvt)
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
              CurrentOffset, TII, SG->getSGID()));

          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),

        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),

        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(

  unsigned MFMACount = 0;

    if (TII->isMFMAorWMMA(I))

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
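// Editor's note (hedged illustration): the simple-interleave strategy above
// alternates one TRANS group of size 1 with one MFMA group of size 1,
// repeated MFMACount * 3 times, i.e. it requests roughly TRANS, MFMA, TRANS,
// MFMA, ... with no extra InstructionRules, in contrast to the rule-heavy
// MFMAExpInterleaveOpt.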
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {

  class EnablesInitialMFMA final : public InstructionRule {

      if (!SyncPipe.size())

      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {

              Cache->push_back(&Elt);

      auto *DAG = SyncPipe[0].DAG;
      for (auto &Elt : *Cache) {

                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class IsPermForDSW final : public InstructionRule {

      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)

      bool FitsInGroup = false;

      if (!Collection.size()) {
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();

            Cache->push_back(SuccUnit);

        return ThisSucc.getSUnit() == Elt;

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class IsSuccOfPrevGroup final : public InstructionRule {

      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
        return any_of(Elt->Succs,
                      [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });

                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class VMEMSize final : public InstructionRule {

      if (MI->getOpcode() == TargetOpcode::BUNDLE)

      if (!Collection.size())

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getMF()->getRegInfo();
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));

      if (NumBits < 128) {

        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class SharesPredWithPrevNthGroup final : public InstructionRule {

    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      if (!Cache->size()) {

        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;

        if (!OtherGroup->Collection.size())

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());

      auto *DAG = SyncPipe[0].DAG;

    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,

  unsigned MFMACount = 0;
  unsigned DSRCount = 0;

  bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;

  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
         "DSWCounters should be zero in pre-RA scheduling!");

  for (auto &SU : DAG->SUnits) {
    auto *I = SU.getInstr();
    if (TII->isMFMAorWMMA(*I))

    else if (TII->isDS(*I)) {

      else if (I->mayStore() && IsInitial) {

        for (auto Pred : SU.Preds) {
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
              AMDGPU::V_PERM_B32_e64) {

    DSWWithPermCount = DSWithPerms.size();
    auto *I = DSWithPerms.begin();
    auto *E = DSWithPerms.end();

    DenseMap<MachineInstr *, SUnit *> VMEMLookup;

    for (; I != E; I++) {
      SUnit *Cand = nullptr;
      bool MissedAny = false;
      for (auto &Pred : (*I)->Preds) {
        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)

        for (auto &Succ : Pred.getSUnit()->Succs) {
          auto *MI = Succ.getSUnit()->getInstr();
          if (!TII->isVMEM(*MI) || !MI->mayLoad())

          if (MissedAny || !VMEMLookup.size()) {

            VMEMLookup[MI] = *I;

      if (!MissedAny && Cand) {
        DSWWithSharedVMEMCount += 2;

  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
  unsigned PipelineSyncID = 0;

  if (DSWWithPermCount) {
    for (unsigned I = 0; I < MFMACount; I++) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 4; I < DSRCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = DSWWithSharedVMEMCount; I < DSWWithPermCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        3, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = DSWWithPermCount; I < DSWCount; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        2, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        4, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
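// Editor's note (hedged illustration): each iteration of the loop above asks
// for the repeating shape
//   VALU(4, V_PERM) -> DS_WRITE(1, succ of prev group) -> MFMA(1)
// twice, followed by VMEM_READ(4) groups that must share a V_PERM predecessor
// with the group 2 (respectively 4) positions back, each fenced by an MFMA(1)
// group. This restates the group parameters visible in the code above.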
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {

  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  case MFMAExpInterleaveID:
    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
  case MFMAExpSimpleInterleaveID:
    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
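// Editor's note (hedged mapping): per the IGLPStrategyID enumerators above,
// strategy selection is by integer ID: 0 -> MFMASmallGemmOpt,
// 1 -> MFMASmallGemmSingleWaveOpt, 2 -> MFMAExpInterleaveOpt,
// 3 -> MFMAExpSimpleInterleaveOpt. The ID is read off the IGLP_OPT
// instruction in initIGLPOpt() below; how that operand is produced upstream
// is outside this fragment.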
class IGroupLPDAGMutation : public ScheduleDAGMutation {

  const SIInstrInfo *TII;

  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  void addSchedBarrierEdges(SUnit &SU);

  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  bool initIGLPOpt(SUnit &SU);

  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  bool IsBottomUp = true;

  IGroupLPDAGMutation() = default;

unsigned SchedGroup::NumSchedGroups = 0;

bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
bool SchedGroup::canAddMI(const MachineInstr &MI) const {

  if (MI.isMetaInstruction())

  else if (MI.isInlineAsm()) {

    auto &MRI = MI.getParent()->getParent()->getRegInfo();
    bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
         VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),

    for (const MachineOperand &Operand : MI.operands())
      if (Operand.isReg()) {
        const TargetRegisterClass &RegClass =
            *TRI.getRegClassForOperandReg(MRI, Operand);
        if (TRI.hasVGPRs(&RegClass)) {

          if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)

        if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)

        if (TRI.hasSGPRs(&RegClass))

        if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
          SGPR_big_def = true;

    typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
    SGMask_t InlineAsmMask = 0;
    if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
    if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;

      InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
    if (VGPR_used && MayLoad)
      InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
                                              : SchedGroupMask::VMEM_READ);
    if (VGPR_used && MayStore)
      InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
                                              : SchedGroupMask::VMEM_WRITE);

      InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
    if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
        InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
    if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
        InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
    if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
        InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
      InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;

    Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
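    // Editor's note (hedged worked example of the classification above): an
    // inline-asm block that only touches 32-bit VGPRs and may load sets
    // VGPR_used, VReg32_used, and MayLoad, so InlineAsmMask becomes
    // DS_READ | DS, and the statement is accepted by any SchedGroup whose
    // SGMask overlaps those bits. This restates the bit-setting logic shown
    // here; it adds no extra rule.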
  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isVMEM(MI))

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isVMEM(MI))

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))

  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&

      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {

    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

    bool Added = tryAddEdge(A, B);

      AddedEdges.emplace_back(A, B);

void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {

    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
bool SchedGroup::canAddSU(SUnit &SU) const {

  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  const MachineBasicBlock *MBB = MI.getParent();

  while (E != MBB->end() && E->isBundledWithPred())

  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {

void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {

      SyncedInstrs[&SU].push_back(SGID);

void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();

  for (; I != E; ++I) {

      SyncedInstrs[&SU].push_back(SGID);
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())

  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool FoundSB = false;
  bool FoundIGLP = false;
  bool ShouldApplyIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();

    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);

    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);

    } else if (Opc == AMDGPU::IGLP_OPT) {
      if (!FoundSB && !FoundIGLP) {

        ShouldApplyIGLP = initIGLPOpt(*R);

  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {

  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);

  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                    << MI.getOperand(0).getImm() << "\n");

      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();

          (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
              const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {

  SchedGroupMask InvertedMask = ~Mask;

  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;

  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;

  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;

  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask

  return InvertedMask;
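// Editor's note (worked example of the inversion above): if a SCHED_BARRIER
// mask allows only VALU instructions to cross, InvertedMask = ~Mask has the
// VALU bit cleared but the generic ALU bit still set; the second branch then
// clears ALU as well, so the barrier SchedGroup built from the inverted mask
// cannot capture VALU work through its ALU parent category. The VMEM and DS
// blocks above enforce the same parent/child consistency in both directions.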
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {

  MachineInstr &SGB = *RIter->getInstr();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);

bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =

  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (!S->shouldApplyStrategy(DAG, Phase))

  IsBottomUp = S->IsBottomUp;
  return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);

std::unique_ptr<ScheduleDAGMutation>
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {

  return std::make_unique<IGroupLPDAGMutation>(Phase);
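// Editor's note (hedged usage sketch): createIGroupLPDAGMutation() returns a
// ScheduleDAGMutation, so a scheduler would typically attach it to its DAG
// via the standard hook, e.g. something like
//
//   DAG->addMutation(
//       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
//
// ScheduleDAGMI::addMutation() is the usual entry point for this; the exact
// call sites in the AMDGPU schedulers are outside this fragment.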