32#define DEBUG_TYPE "igrouplp"
38 cl::desc(
"Whether to use the exponential time solver to fit "
39 "the instructions to the pipeline as closely as "
45 cl::desc(
"The maximum number of scheduling group conflicts "
46 "which we attempt to solve with the exponential time "
47 "exact solver. Problem sizes greater than this will"
48 "be solved by the less accurate greedy algorithm. Selecting "
49 "solver by size is superseded by manually selecting "
50 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
54 cl::desc(
"The amount of branches that we are willing to explore with"
55 "the exact algorithm before giving up."));
59 cl::desc(
"Whether to use the cost heuristic to make choices as we "
60 "traverse the search space using the exact solver. Defaulted "
61 "to on, and if turned off, we will use the node order -- "
62 "attempting to put the later nodes in the later sched groups. "
63 "Experimentally, results are mixed, so this should be set on a "
64 "case-by-case basis."));
68enum class SchedGroupMask {
82 ALL = ALU | VALU |
SALU |
MFMA |
VMEM | VMEM_READ | VMEM_WRITE |
DS |
83 DS_READ | DS_WRITE |
TRANS | LDSDMA,
92class InstructionRule {
98 std::optional<SmallVector<SUnit *, 4>> Cache;
108 bool NeedsCache =
false)
115 virtual ~InstructionRule() =
default;
128 SchedGroupMask SGMask;
131 std::optional<unsigned> MaxSize;
144 static unsigned NumSchedGroups;
161 bool canAddSU(
SUnit &SU)
const;
166 void link(
SUnit &SU,
bool MakePred =
false);
170 int link(
SUnit &SU,
bool MakePred,
171 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
180 void link(SchedGroup &OtherGroup);
183 bool isFull()
const {
return MaxSize && Collection.
size() >= *MaxSize; }
189 void addRule(std::shared_ptr<InstructionRule> NewRule) {
194 bool allowedByRules(
const SUnit *SU,
196 for (
auto &Rule : Rules) {
197 if (!Rule->apply(SU, Collection, SyncPipe))
204 void add(
SUnit &SU) {
206 <<
format_hex((
int)SGMask, 10,
true) <<
" adding "
212 void pop() { Collection.
pop_back(); }
215 void findCandidateSUnits(
T Begin,
T End,
216 SUnitsToCandidateSGsMap &SyncedInstrs);
221 void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs);
223 int getSyncID() {
return SyncID; }
225 int getSGID() {
return SGID; }
227 SchedGroupMask
getMask() {
return SGMask; }
229 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
231 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG),
TII(
TII) {
232 SGID = NumSchedGroups++;
235 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
int SyncID,
237 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG),
TII(
TII) {
238 SGID = NumSchedGroups++;
// Pairs a scheduling unit with the IDs of the candidate SchedGroups it could
// be assigned to; the solver picks one assignment per SUnit from this list.
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
254class PipelineSolver {
267 bool NeedsSolver =
false;
271 unsigned computeProblemSize();
282 int CurrConflInstNo = 0;
284 int CurrSyncGroupIdx = 0;
286 int BeginSyncGroupIdx = 0;
292 bool IsBottomUp =
true;
295 void advancePosition();
298 void retreatPosition();
307 template <
typename T>
308 void greedyFind(std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
313 template <
typename T>
320 template <
typename T>
void linkSchedGroups(
T I,
T E);
324 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
328 template <
typename T>
329 int linkSUnit(
SUnit *SU,
int SGID,
330 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
332 void removeEdges(
const std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
334 void convertSyncMapsToArrays();
346 : DAG(DAG), SyncedInstrs(SyncedInstrs),
347 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
349 for (
auto &PipelineInstrs : SyncedInstrs) {
350 if (!PipelineInstrs.second.
empty()) {
359 convertSyncMapsToArrays();
361 CurrPipeline = BestPipeline;
363 while (
static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.
size() &&
364 PipelineInstrs[BeginSyncGroupIdx].
empty())
367 if (
static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.
size())
372void PipelineSolver::reset() {
374 for (
auto &SyncPipeline : CurrPipeline) {
375 for (
auto &SG : SyncPipeline) {
377 SG.Collection.
clear();
381 if (SchedBarr != TempCollection.
end())
382 SG.Collection.push_back(*SchedBarr);
386 CurrSyncGroupIdx = BeginSyncGroupIdx;
391void PipelineSolver::convertSyncMapsToArrays() {
392 for (
auto &SyncPipe : SyncedSchedGroups) {
393 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
396 int PipelineIDx = SyncedInstrs.size() - 1;
397 PipelineInstrs.resize(SyncedInstrs.size());
398 for (
auto &SyncInstrMap : SyncedInstrs) {
399 for (
auto &SUsToCandSGs : SyncInstrMap.second) {
400 if (PipelineInstrs[PipelineIDx].empty()) {
401 PipelineInstrs[PipelineIDx].push_back(
402 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
405 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
408 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
409 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
411 PipelineInstrs[PipelineIDx].insert(
412 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
418template <
typename T>
void PipelineSolver::linkSchedGroups(
T I,
T E) {
419 for (;
I !=
E; ++
I) {
421 for (
auto J = std::next(
I); J !=
E; ++J) {
428void PipelineSolver::makePipeline() {
430 for (
auto &SyncPipeline : BestPipeline) {
432 for (
auto &SG : SyncPipeline) {
435 SUnit *SGBarr =
nullptr;
436 for (
auto &SU : SG.Collection) {
437 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
444 SG.link(*SGBarr,
false);
448 for (
auto &SyncPipeline : BestPipeline) {
449 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
450 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
455int PipelineSolver::linkSUnit(
456 SUnit *SU,
int SGID, std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
458 bool MakePred =
false;
461 if (
I->getSGID() == SGID) {
466 AddedCost += Group.link(*SU, MakePred, AddedEdges);
472int PipelineSolver::addEdges(
474 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
484 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
rbegin(),
486 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
begin(),
490void PipelineSolver::removeEdges(
491 const std::list<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
494 for (
auto &PredSuccPair : EdgesToRemove) {
495 SUnit *Pred = PredSuccPair.first;
496 SUnit *Succ = PredSuccPair.second;
499 Succ->
Preds, [&Pred](
SDep &
P) { return P.getSUnit() == Pred; });
500 if (Match != Succ->
Preds.end()) {
501 assert(Match->isArtificial());
507void PipelineSolver::advancePosition() {
510 if (
static_cast<size_t>(CurrConflInstNo) >=
511 PipelineInstrs[CurrSyncGroupIdx].
size()) {
515 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
516 PipelineInstrs[CurrSyncGroupIdx].empty())
521void PipelineSolver::retreatPosition() {
522 assert(CurrConflInstNo >= 0);
523 assert(CurrSyncGroupIdx >= 0);
525 if (CurrConflInstNo > 0) {
530 if (CurrConflInstNo == 0) {
533 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
538 while (PipelineInstrs[CurrSyncGroupIdx].empty())
541 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
545bool PipelineSolver::checkOptimal() {
546 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
547 if (BestCost == -1 || CurrCost < BestCost) {
548 BestPipeline = CurrPipeline;
555 bool DoneExploring =
false;
556 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
557 DoneExploring =
true;
559 return (DoneExploring || BestCost == 0);
563void PipelineSolver::populateReadyList(
565 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
566 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
567 assert(CurrSU.second.size() >= 1);
569 for (;
I !=
E; ++
I) {
570 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
572 SchedGroup *Match =
llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
573 return SG.getSGID() == CandSGID;
578 if (Match->isFull()) {
579 ReadyList.push_back(std::pair(*
I, MissPenalty));
583 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
584 ReadyList.push_back(std::pair(*
I, TempCost));
585 removeEdges(AddedEdges);
587 ReadyList.push_back(std::pair(*
I, -1));
593 assert(ReadyList.size() == CurrSU.second.size());
596bool PipelineSolver::solveExact() {
600 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
603 assert(
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
604 assert(
static_cast<size_t>(CurrConflInstNo) <
605 PipelineInstrs[CurrSyncGroupIdx].
size());
606 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
608 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
613 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.
rbegin(),
614 CurrSU.second.rend())
615 : populateReadyList(ReadyList, CurrSU.second.
begin(),
616 CurrSU.second.end());
618 auto *
I = ReadyList.
begin();
619 auto *
E = ReadyList.
end();
620 for (;
I !=
E; ++
I) {
624 if (BestCost != -1 && (CurrCost +
I->second > BestCost))
627 int CandSGID =
I->first;
629 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
630 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
632 for (
auto &SG : SyncPipeline) {
633 if (SG.getSGID() == CandSGID)
640 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
644 << (
int)Match->getMask() <<
"and ID " << CandSGID
646 Match->add(*CurrSU.first);
647 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
648 LLVM_DEBUG(
dbgs() <<
"Cost of Assignment: " << AddedCost <<
"\n");
649 CurrCost += AddedCost;
652 bool FinishedExploring =
false;
655 if (CurrCost < BestCost || BestCost == -1) {
657 FinishedExploring = BestCost != 0;
658 if (!FinishedExploring)
664 CurrCost -= AddedCost;
665 removeEdges(AddedEdges);
667 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
668 if (FinishedExploring)
675 CurrCost += MissPenalty;
678 LLVM_DEBUG(
dbgs() <<
"NOT Assigned (" << CurrSU.first->NodeNum <<
")\n");
680 bool FinishedExploring =
false;
681 if (CurrCost < BestCost || BestCost == -1) {
683 bool FinishedExploring = BestCost != 0;
684 if (!FinishedExploring)
690 CurrCost -= MissPenalty;
691 return FinishedExploring;
695void PipelineSolver::greedyFind(
696 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E) {
697 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
701 std::list<std::pair<SUnit *, SUnit *>> Edges;
704 std::optional<GroupInfo> Best;
706 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
708 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
714 for (;
I !=
E; ++
I) {
716 SchedGroup *Match =
llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
717 return SG.getSGID() == CandSGID;
721 LLVM_DEBUG(
dbgs() <<
"Trying SGID # " << CandSGID <<
" with Mask "
722 << (
int)Match->getMask() <<
"\n");
724 if (Match->isFull()) {
728 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
729 LLVM_DEBUG(
dbgs() <<
"SGID # " << CandSGID <<
" has conflicting rule\n");
733 std::list<std::pair<SUnit *, SUnit *>> TempEdges;
734 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, TempEdges);
737 if (!Best || TempCost < Best->Cost) {
738 Best = {Match, TempEdges, TempCost};
743 removeEdges(TempEdges);
747 SchedGroup *SG = Best->SG;
748 std::list<std::pair<SUnit *, SUnit *>> &Edges = Best->Edges;
750 SG->add(*CurrSU.first);
751 if (AddedEdges.empty())
754 AddedEdges.splice(std::prev(AddedEdges.cend()), Edges);
756 for (
const std::pair<SUnit *, SUnit *> &
E : Edges) {
757 if (!SG->tryAddEdge(
E.first,
E.second))
761 LLVM_DEBUG(
dbgs() <<
"Best Group has ID: " << SG->getSGID() <<
" and Mask"
762 << (
int)SG->getMask() <<
"\n");
763 BestCost += Best->Cost;
765 BestCost += MissPenalty;
768bool PipelineSolver::solveGreedy() {
770 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
772 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
773 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
775 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
776 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
779 BestPipeline = CurrPipeline;
780 removeEdges(AddedEdges);
784unsigned PipelineSolver::computeProblemSize() {
785 unsigned ProblemSize = 0;
786 for (
auto &PipeConflicts : PipelineInstrs) {
787 ProblemSize += PipeConflicts.size();
793void PipelineSolver::solve() {
797 unsigned ProblemSize = computeProblemSize();
800 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
801 MissPenalty = (ProblemSize / 2) + 1;
804 if (EnableExactSolver || BelowCutoff) {
808 LLVM_DEBUG(
dbgs() <<
"Greedy produced best cost of " << BestCost <<
"\n");
812 LLVM_DEBUG(
dbgs() <<
"Exact produced best cost of " << BestCost <<
"\n");
817 LLVM_DEBUG(
dbgs() <<
"Greedy produced best cost of " << BestCost <<
"\n");
834 virtual bool applyIGLPStrategy(
843 bool IsBottomUp =
true;
848 virtual ~IGLPStrategy() =
default;
851class MFMASmallGemmOpt final :
public IGLPStrategy {
854 bool applyIGLPStrategy(
865 : IGLPStrategy(DAG,
TII) {
870bool MFMASmallGemmOpt::applyIGLPStrategy(
875 unsigned MFMACount = 0;
877 if (
TII->isMFMAorWMMA(
I))
880 const unsigned PipelineSyncID = 0;
881 SchedGroup *SG =
nullptr;
882 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
883 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
884 SchedGroupMask::DS, 2, PipelineSyncID, DAG,
TII);
885 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
887 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
888 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
889 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
895class MFMAExpInterleaveOpt final :
public IGLPStrategy {
898 static unsigned TransPipeCount;
900 static unsigned MFMAPipeCount;
902 static unsigned AddPipeCount;
904 static unsigned MFMAEnablement;
906 static unsigned ExpRequirement;
908 static unsigned MFMAChains;
913 static bool HasChainBetweenCvt;
915 static std::optional<unsigned> FirstPipeDSR;
924 class IsPipeExp final :
public InstructionRule {
929 auto *DAG = SyncPipe[0].DAG;
931 if (Cache->empty()) {
932 auto I = DAG->SUnits.rbegin();
933 auto E = DAG->SUnits.rend();
934 for (;
I !=
E;
I++) {
935 if (
TII->isMFMAorWMMA(*
I->getInstr()))
936 Cache->push_back(&*
I);
942 auto Reaches =
any_of(*Cache, [&SU, &DAG](
SUnit *TargetSU) {
943 return DAG->IsReachable(TargetSU,
const_cast<SUnit *
>(SU));
948 IsPipeExp(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
949 : InstructionRule(
TII, SGID, NeedsCache) {}
954 class EnablesNthMFMA final :
public InstructionRule {
961 bool FoundTrans =
false;
962 unsigned Counter = 1;
963 auto *DAG = SyncPipe[0].DAG;
965 if (Cache->empty()) {
966 auto I = DAG->SUnits.begin();
967 auto E = DAG->SUnits.end();
968 for (;
I !=
E;
I++) {
969 if (FoundTrans &&
TII->isMFMAorWMMA(*
I->getInstr())) {
971 Cache->push_back(&*
I);
976 if (!FoundTrans &&
TII->isTRANS(
I->getInstr()->getOpcode()))
983 return DAG->IsReachable((*Cache)[0],
const_cast<SUnit *
>(SU));
987 bool NeedsCache =
false)
993 class EnablesNthMFMAInChain final :
public InstructionRule {
1001 auto *DAG = SyncPipe[0].DAG;
1003 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1006 if (Cache->empty()) {
1007 auto *TempSU = ChainSeed;
1012 for (
auto &Succ : TempSU->Succs) {
1013 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1014 TempSU = Succ.getSUnit();
1023 Cache->push_back(TempSU);
1029 return DAG->IsReachable((*Cache)[0],
const_cast<SUnit *
>(SU));
1032 EnablesNthMFMAInChain(
unsigned Number,
SUnit *ChainSeed,
1034 bool NeedsCache =
false)
1036 ChainSeed(ChainSeed) {}
1042 class LessThanNSuccs final :
public InstructionRule {
1045 bool HasIntermediary =
false;
1050 if (!SyncPipe.
size())
1054 return Succ.getKind() == SDep::Data;
1056 if (SuccSize >=
Size)
1059 if (HasIntermediary) {
1060 for (
auto Succ : SU->
Succs) {
1063 return SuccSucc.getKind() == SDep::Data;
1065 if (SuccSize >=
Size)
1073 bool HasIntermediary =
false,
bool NeedsCache =
false)
1074 : InstructionRule(
TII, SGID, NeedsCache),
Size(
Size),
1075 HasIntermediary(HasIntermediary) {}
1082 class GreaterThanOrEqualToNSuccs final :
public InstructionRule {
1085 bool HasIntermediary =
false;
1090 if (!SyncPipe.
size())
1094 return Succ.getKind() == SDep::Data;
1096 if (SuccSize >=
Size)
1099 if (HasIntermediary) {
1100 for (
auto Succ : SU->
Succs) {
1103 return SuccSucc.getKind() == SDep::Data;
1105 if (SuccSize >=
Size)
1113 unsigned SGID,
bool HasIntermediary =
false,
1114 bool NeedsCache =
false)
1115 : InstructionRule(
TII, SGID, NeedsCache),
Size(
Size),
1116 HasIntermediary(HasIntermediary) {}
1120 class IsCvt final :
public InstructionRule {
1125 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1126 Opc == AMDGPU::V_CVT_I32_F32_e32;
1128 IsCvt(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1129 : InstructionRule(
TII, SGID, NeedsCache) {}
1133 class IsFMA final :
public InstructionRule {
1140 IsFMA(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1141 : InstructionRule(
TII, SGID, NeedsCache) {}
1145 class IsPipeAdd final :
public InstructionRule {
1151 IsPipeAdd(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1152 : InstructionRule(
TII, SGID, NeedsCache) {}
1157 class IsSuccOfPrevNthGroup final :
public InstructionRule {
1159 unsigned Distance = 1;
1164 SchedGroup *OtherGroup =
nullptr;
1165 if (!SyncPipe.
size())
1168 for (
auto &PipeSG : SyncPipe) {
1169 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1170 OtherGroup = &PipeSG;
1175 if (!OtherGroup->Collection.size())
1178 for (
auto &OtherEle : OtherGroup->Collection) {
1179 for (
auto &Succ : OtherEle->Succs) {
1180 if (Succ.getSUnit() == SU && Succ.getKind() ==
SDep::Data)
1188 unsigned SGID,
bool NeedsCache =
false)
1189 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1194 class IsReachableFromPrevNthGroup final :
public InstructionRule {
1196 unsigned Distance = 1;
1201 SchedGroup *OtherGroup =
nullptr;
1202 if (!SyncPipe.
size())
1205 for (
auto &PipeSG : SyncPipe) {
1206 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1207 OtherGroup = &PipeSG;
1212 if (!OtherGroup->Collection.size())
1215 auto *DAG = SyncPipe[0].DAG;
1217 for (
auto &OtherEle : OtherGroup->Collection)
1218 if (DAG->IsReachable(
const_cast<SUnit *
>(SU), OtherEle))
1223 IsReachableFromPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
1224 unsigned SGID,
bool NeedsCache =
false)
1225 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1229 class OccursAtOrAfterNode final :
public InstructionRule {
1240 bool NeedsCache =
false)
1246 class IsExactMFMA final :
public InstructionRule {
1254 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1257 if (Cache->empty()) {
1258 auto *TempSU = ChainSeed;
1263 for (
auto &Succ : TempSU->Succs) {
1264 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1265 TempSU = Succ.getSUnit();
1274 Cache->push_back(TempSU);
1280 return (*Cache)[0] == SU;
1284 unsigned SGID,
bool NeedsCache =
false)
1286 ChainSeed(ChainSeed) {}
1292 class OccursAfterExp final :
public InstructionRule {
1297 auto *DAG = SyncPipe[0].DAG;
1298 if (Cache->empty()) {
1299 for (
auto &SU : DAG->SUnits)
1301 Cache->push_back(&SU);
1308 return SU->
NodeNum > (*Cache)[0]->NodeNum;
1312 bool NeedsCache =
false)
1313 : InstructionRule(
TII, SGID, NeedsCache) {}
1317 bool applyIGLPStrategy(
1326 : IGLPStrategy(DAG,
TII) {
// Out-of-class definitions for MFMAExpInterleaveOpt's static analysis state.
// These are populated by DAG analysis and reused across strategy invocations;
// all counters start at zero and the flags/optional start unset.
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
// NodeNum of the first DS read feeding the pipeline, if one was found —
// std::nullopt until analysis discovers it.
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1348 auto isBitPack = [](
unsigned Opc) {
1349 return Opc == AMDGPU::V_PACK_B32_F16_e64 ||
Opc == AMDGPU::V_PERM_B32_e64;
1352 auto isCvt = [](
unsigned Opc) {
1353 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
Opc == AMDGPU::V_CVT_I32_F32_e32;
1356 auto isAdd = [](
unsigned Opc) {
return Opc == AMDGPU::V_ADD_F32_e32; };
1359 for (
SUnit &SU : DAG->SUnits) {
1363 if (SU.
Succs.size() >= 7)
1365 for (
auto &Succ : SU.
Succs) {
1366 if (Succ.getSUnit()->Succs.size() >= 7)
1385 if (!(PackSUs.
size() && MFMAPipeCands.
size() && ExpPipeCands.
size()))
1390 std::optional<SUnit *> TempMFMA;
1391 std::optional<SUnit *> TempExp;
1393 for (
auto &PredSU : ExpPipeCands) {
1394 for (
auto &SuccSU : MFMAPipeCands) {
1395 if (DAG->IsReachable(SuccSU, PredSU)) {
1407 if (!(TempExp && TempMFMA))
1410 HasChainBetweenCvt =
none_of((*TempExp)->Succs, [&isCvt](
SDep &Succ) {
1411 return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1415 for (
auto &SuccSU : MFMAPipeCands) {
1416 if (MFMAPipeSUs.
size() &&
1417 any_of(MFMAPipeSUs, [&SuccSU](
SUnit *PotentialMatch) {
1418 return PotentialMatch->
NodeNum == SuccSU->NodeNum;
1422 for (
auto &PredSU : ExpPipeCands) {
1423 if (DAG->IsReachable(SuccSU, PredSU)) {
1430 MFMAPipeCount = MFMAPipeSUs.
size();
1432 assert(TempExp && TempMFMA);
1433 assert(MFMAPipeCount > 0);
1435 std::optional<SUnit *> TempCvt;
1436 for (
auto &SuccSU : CvtSUs) {
1437 if (DAG->IsReachable(SuccSU, *TempExp)) {
1444 if (TempCvt.has_value()) {
1445 for (
auto &SuccSU : MFMAPipeSUs) {
1446 if (DAG->IsReachable(SuccSU, *TempCvt)) {
1454 for (
auto &MFMAPipeSU : MFMAPipeSUs) {
1458 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1460 MFMAChainSeeds.push_back(MFMAPipeSU);
1468 for (
auto Pred : MFMAChainSeeds[0]->Preds) {
1469 if (
TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1470 Pred.getSUnit()->getInstr()->mayLoad())
1471 FirstPipeDSR = Pred.getSUnit()->NodeNum;
1475 unsigned PackSuccCount =
1477 return DAG->IsReachable(VPack, *TempExp);
1481 unsigned PackPredCount =
1483 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1484 return isBitPack(Opc);
1488 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1489 return isBitPack(Opc);
1492 if (PackPred == (*TempMFMA)->Preds.end())
1500 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1504 MFMAEnablement *= PackSuccCount;
1509 return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1512 ExpRequirement *= PackPredCount;
1522 MFMAChainSeeds.clear();
1529bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1534 bool IsSmallKernelType =
1535 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1536 bool IsLargeKernelType =
1537 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1539 if (!(IsSmallKernelType || IsLargeKernelType))
1545 unsigned PipelineSyncID = 0;
1546 SchedGroup *SG =
nullptr;
1548 unsigned MFMAChain = 0;
1549 unsigned PositionInChain = 0;
1550 unsigned CurrMFMAForTransPosition = 0;
1552 auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1553 &CurrMFMAForTransPosition]() {
1554 CurrMFMAForTransPosition += MFMAEnablement;
1555 PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1556 MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1559 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1560 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1561 return (TempMFMAForTrans / MFMAChains);
1564 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1565 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1566 return TempMFMAForTrans % MFMAChains;
1569 unsigned CurrMFMAPosition = 0;
1570 unsigned MFMAChainForMFMA = 0;
1571 unsigned PositionInChainForMFMA = 0;
1573 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1574 &PositionInChainForMFMA]() {
1576 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1577 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1581 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1583 bool UsesFMA = IsSmallKernelType || !IsPostRA;
1584 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1585 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1586 bool UsesVALU = IsSmallKernelType;
1591 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1592 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1593 if (!IsPostRA && MFMAChains) {
1594 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1595 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1599 std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1600 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1601 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1604 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1605 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1606 if (!IsPostRA && MFMAChains) {
1607 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1608 getNextTransPositionInChain(),
1609 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1611 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1612 SG->getSGID(),
true));
1613 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1614 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1618 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1619 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1620 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1622 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1626 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1627 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG,
TII);
1628 if (!IsPostRA && MFMAChains)
1629 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1630 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
true));
1632 SG->addRule(std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1633 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1634 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1635 HasChainBetweenCvt));
1636 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1638 incrementTransPosition();
1641 for (
unsigned I = 0;
I < ExpRequirement;
I++) {
1644 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1645 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1646 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1647 if (HasChainBetweenCvt)
1648 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1649 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1651 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1652 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1653 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1658 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1659 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1660 if (!IsPostRA && MFMAChains) {
1661 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1662 getNextTransPositionInChain(),
1663 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1665 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1666 TII, SG->getSGID(),
true));
1667 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1668 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1672 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1673 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1674 if (!IsPostRA && MFMAChains)
1675 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1676 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1679 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1680 SG->getSGID(),
true));
1681 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1682 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1683 HasChainBetweenCvt));
1684 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1689 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1690 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1691 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1692 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1693 8,
TII, SG->getSGID(), HasChainBetweenCvt));
1694 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1699 unsigned MFMARatio =
1700 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1703 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1705 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1706 ? TransPipeCount - (2 * ExpRequirement)
1708 unsigned ExpLoopCount = RemainingExp / ExpRatio;
1710 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1711 ? MFMAPipeCount - (MFMAEnablement * 2)
1713 unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1715 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1716 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1718 for (
unsigned I = 0;
I < LoopSize;
I++) {
1719 if (!(
I * ExpRatio % ExpRequirement))
1720 incrementTransPosition();
1723 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1724 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG,
TII);
1725 if (!IsPostRA && MFMAChains)
1726 SG->addRule(std::make_shared<IsExactMFMA>(
1727 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA],
TII,
1728 SG->getSGID(),
true));
1730 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1731 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1732 incrementMFMAPosition();
1735 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1736 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG,
TII);
1737 SG->addRule(std::make_shared<IsPipeAdd>(
TII, SG->getSGID()));
1738 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1741 if (UsesDSRead && !(
I % 4)) {
1742 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1743 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1744 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1746 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1750 for (
unsigned J = 0; J < ExpRatio; J++) {
1751 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (
I + 1);
1752 auto MaxMFMAOffset =
1753 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1757 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1758 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1759 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1760 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1761 auto DSROffset =
I / 4 + 1;
1762 auto MaxDSROffset = MaxMFMAOffset / 4;
1764 auto ExpOffset =
I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1765 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1766 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1768 if (HasChainBetweenCvt)
1769 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1770 CurrentOffset,
TII, SG->getSGID()));
1772 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset,
TII,
1774 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1779 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1780 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1781 if (!IsPostRA && MFMAChains)
1782 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1783 getNextTransPositionInChain(),
1784 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
1787 SG->addRule(std::make_shared<EnablesNthMFMA>(
1788 (((
I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1789 TII, SG->getSGID(),
true));
1790 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1791 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1795 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1796 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1797 if (!IsPostRA && MFMAChains)
1798 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1799 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1802 SG->addRule(std::make_shared<EnablesNthMFMA>(
1803 (((
I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1804 TII, SG->getSGID(),
true));
1805 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1806 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1807 HasChainBetweenCvt));
1808 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1813 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1814 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG,
TII);
1815 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1816 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1820class MFMAExpSimpleInterleaveOpt final :
public IGLPStrategy {
1822 bool applyIGLPStrategy(
1833 : IGLPStrategy(DAG,
TII) {
1838bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
1843 unsigned MFMACount = 0;
1845 if (
TII->isMFMAorWMMA(
I))
1848 const unsigned PipelineSyncID = 0;
1849 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
1850 SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1851 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1852 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1854 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1855 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
1856 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1862class MFMASmallGemmSingleWaveOpt final :
public IGLPStrategy {
1865 class EnablesInitialMFMA final :
public InstructionRule {
1869 if (!SyncPipe.
size())
1872 if (!Cache->size()) {
1873 for (
auto &Elt : SyncPipe[0].DAG->
SUnits) {
1874 if (
TII->isMFMAorWMMA(*Elt.getInstr())) {
1878 Cache->push_back(&Elt);
1883 auto *DAG = SyncPipe[0].DAG;
1884 for (
auto &Elt : *Cache) {
1892 bool NeedsCache =
false)
1893 : InstructionRule(
TII, SGID, NeedsCache) {}
1897 class IsPermForDSW final :
public InstructionRule {
1902 if (
MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1905 bool FitsInGroup =
false;
1907 if (!Collection.
size()) {
1908 for (
auto &Succ : SU->
Succs) {
1909 SUnit *SuccUnit = Succ.getSUnit();
1912 Cache->push_back(SuccUnit);
1923 return ThisSucc.getSUnit() == Elt;
1928 IsPermForDSW(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1929 : InstructionRule(
TII, SGID, NeedsCache) {}
1933 class IsSuccOfPrevGroup final :
public InstructionRule {
1937 SchedGroup *OtherGroup =
nullptr;
1938 for (
auto &PipeSG : SyncPipe) {
1939 if ((
unsigned)PipeSG.getSGID() == SGID - 1) {
1940 OtherGroup = &PipeSG;
1946 if (!OtherGroup->Collection.size())
1950 return any_of(OtherGroup->Collection, [&SU](
SUnit *Elt) {
1951 return any_of(Elt->Succs,
1952 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
1956 bool NeedsCache =
false)
1957 : InstructionRule(
TII, SGID, NeedsCache) {}
1961 class VMEMSize final :
public InstructionRule {
1966 if (
MI->getOpcode() == TargetOpcode::BUNDLE)
1968 if (!Collection.
size())
1973 auto TRI =
TII->getRegisterInfo();
1974 auto &MRI =
MI->getMF()->getRegInfo();
1975 for (
auto &Elt : Collection) {
1976 auto Op = Elt->getInstr()->getOperand(0);
1978 TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(MRI,
Op));
1982 if (NumBits < 128) {
1984 if (NumBits +
TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(
1985 MRI,
MI->getOperand(0))) <=
1993 VMEMSize(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1994 : InstructionRule(
TII, SGID, NeedsCache) {}
1999 class SharesPredWithPrevNthGroup final :
public InstructionRule {
2001 unsigned Distance = 1;
2006 SchedGroup *OtherGroup =
nullptr;
2007 if (!SyncPipe.
size())
2010 if (!Cache->size()) {
2012 for (
auto &PipeSG : SyncPipe) {
2013 if ((
unsigned)PipeSG.getSGID() == SGID - Distance) {
2014 OtherGroup = &PipeSG;
2020 if (!OtherGroup->Collection.size())
2023 for (
auto &OtherEle : OtherGroup->Collection) {
2024 for (
auto &Pred : OtherEle->Preds) {
2025 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2026 AMDGPU::V_PERM_B32_e64)
2027 Cache->push_back(Pred.getSUnit());
2036 auto *DAG = SyncPipe[0].DAG;
2043 SharesPredWithPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
2044 unsigned SGID,
bool NeedsCache =
false)
2045 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
2049 bool applyIGLPStrategy(
2060 : IGLPStrategy(DAG,
TII) {
2065static unsigned DSWCount = 0;
2066static unsigned DSWWithPermCount = 0;
2067static unsigned DSWWithSharedVMEMCount = 0;
2069bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2070 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2073 unsigned MFMACount = 0;
2074 unsigned DSRCount = 0;
2076 bool IsInitial =
Phase == AMDGPU::SchedulingPhase::Initial;
2078 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2079 DSWWithSharedVMEMCount == 0)) &&
2080 "DSWCounters should be zero in pre-RA scheduling!");
2082 for (
auto &SU : DAG->SUnits) {
2083 auto *
I = SU.getInstr();
2084 if (
TII->isMFMAorWMMA(*
I))
2086 else if (
TII->isDS(*
I)) {
2089 else if (
I->mayStore() && IsInitial) {
2091 for (
auto Pred : SU.Preds) {
2092 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2093 AMDGPU::V_PERM_B32_e64) {
2103 DSWWithPermCount = DSWithPerms.
size();
2104 auto *
I = DSWithPerms.
begin();
2105 auto *
E = DSWithPerms.
end();
2113 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2115 for (;
I !=
E;
I++) {
2116 SUnit *Cand =
nullptr;
2117 bool MissedAny =
false;
2118 for (
auto &Pred : (*I)->Preds) {
2119 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2125 for (
auto &Succ : Pred.getSUnit()->Succs) {
2126 auto *
MI = Succ.getSUnit()->getInstr();
2127 if (!
TII->isVMEM(*
MI) || !
MI->mayLoad())
2130 if (MissedAny || !VMEMLookup.
size()) {
2132 VMEMLookup[
MI] = *
I;
2149 if (!MissedAny && Cand) {
2150 DSWWithSharedVMEMCount += 2;
2157 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2159 unsigned PipelineSyncID = 0;
2161 if (DSWWithPermCount) {
2162 for (
unsigned I = 0;
I < MFMACount;
I++) {
2163 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2164 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2165 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2167 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2168 SchedGroupMask::VALU, 2, PipelineSyncID, DAG,
TII);
2169 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2179 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2180 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG,
TII);
2181 SG->addRule(std::make_shared<EnablesInitialMFMA>(
TII, SG->getSGID(),
true));
2182 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2184 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2185 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2186 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2189 for (
unsigned I = 4;
I < DSRCount; ++
I) {
2190 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2191 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG,
TII);
2192 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2194 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2195 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2196 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2202 for (
unsigned I = DSWWithSharedVMEMCount;
I < DSWWithPermCount; ++
I) {
2203 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2204 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2205 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2206 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2208 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2209 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2210 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2211 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2213 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2214 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2215 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2216 1,
TII, SG->getSGID(),
true));
2217 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2218 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2220 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2221 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2222 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2224 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2225 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2226 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2227 3,
TII, SG->getSGID(),
true));
2228 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2229 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2231 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2232 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2233 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2239 for (
unsigned I = DSWWithPermCount;
I < DSWCount;
I++) {
2240 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2241 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2242 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2244 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2245 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2246 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2247 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2249 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2250 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2251 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2259 for (
unsigned I = 0;
I < DSWWithSharedVMEMCount; ++
I) {
2260 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2261 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2262 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2263 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2265 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2266 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2267 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2268 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2270 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2271 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2272 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2274 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2275 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2276 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2277 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2279 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2280 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2281 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2282 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2284 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2285 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2286 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2288 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2289 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2290 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2291 2,
TII, SG->getSGID(),
true));
2292 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2293 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2295 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2296 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2297 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2299 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2300 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2301 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2302 4,
TII, SG->getSGID(),
true));
2303 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2304 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2306 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2307 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2308 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2314static std::unique_ptr<IGLPStrategy>
2316 const SIInstrInfo *
TII) {
2319 return std::make_unique<MFMASmallGemmOpt>(DAG,
TII);
2321 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG,
TII);
2323 return std::make_unique<MFMAExpInterleaveOpt>(DAG,
TII);
2325 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG,
TII);
2331class IGroupLPDAGMutation :
public ScheduleDAGMutation {
2333 const SIInstrInfo *
TII;
2340 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2343 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2346 void addSchedBarrierEdges(SUnit &SU);
2357 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask)
const;
2360 void initSchedGroupBarrierPipelineStage(
2361 std::vector<SUnit>::reverse_iterator RIter);
2363 bool initIGLPOpt(SUnit &SU);
2366 void apply(ScheduleDAGInstrs *DAGInstrs)
override;
2373 bool IsBottomUp =
true;
2378 IGroupLPDAGMutation() =
default;
2382unsigned SchedGroup::NumSchedGroups = 0;
2384bool SchedGroup::tryAddEdge(SUnit *
A, SUnit *
B) {
2388bool SchedGroup::canAddMI(
const MachineInstr &
MI)
const {
2390 if (
MI.isMetaInstruction())
2393 else if (
MI.isInlineAsm()) {
2395 auto &MRI =
MI.getParent()->getParent()->getRegInfo();
2396 bool SGPR_used =
false, SGPR_big_def =
false, VGPR_used =
false,
2397 VMFMA_used =
false, VReg32_used =
false,
MayLoad =
MI.mayLoad(),
2399 for (
const MachineOperand &Operand :
MI.operands())
2400 if (Operand.isReg()) {
2401 const TargetRegisterClass &RegClass =
2402 *
TRI.getRegClassForOperandReg(MRI, Operand);
2403 if (
TRI.hasVGPRs(&RegClass)) {
2405 if (Operand.isUse() &&
TRI.getRegSizeInBits(RegClass) == 32)
2411 if (
TRI.hasAGPRs(&RegClass) ||
TRI.getRegSizeInBits(RegClass) > 128)
2413 if (
TRI.hasSGPRs(&RegClass))
2415 if (
TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
2416 SGPR_big_def =
true;
2419 typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
2420 SGMask_t InlineAsmMask = 0;
2421 if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2422 InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
2423 if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
2424 InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
2426 InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
2427 if (VGPR_used && MayLoad)
2428 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
2429 : SchedGroupMask::VMEM_READ);
2430 if (VGPR_used && MayStore)
2431 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
2432 : SchedGroupMask::VMEM_WRITE);
2434 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
2435 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
2436 InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
2437 InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
2438 if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
2439 InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
2440 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
2441 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
2442 InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
2443 InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
2445 Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
2448 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2453 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2455 !
TII->isLDSDMA(
MI)) {
2462 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2466 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2467 TII->isMFMAorWMMA(
MI))
2470 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2474 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2475 MI.mayLoad() &&
TII->isVMEM(
MI))
2478 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2479 MI.mayStore() &&
TII->isVMEM(
MI))
2482 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2486 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2487 MI.mayLoad() &&
TII->isDS(
MI))
2490 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2491 MI.mayStore() &&
TII->isDS(
MI))
2494 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2498 else if (((SGMask & SchedGroupMask::LDSDMA) != SchedGroupMask::NONE) &&
2503 dbgs() <<
"For SchedGroup with mask " <<
format_hex((
int)SGMask, 10,
true)
2504 << (Result ?
" could classify " :
" unable to classify ") <<
MI);
2509int SchedGroup::link(SUnit &SU,
bool MakePred,
2510 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2511 int MissedEdges = 0;
2512 for (
auto *
A : Collection) {
2514 if (
A ==
B ||
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2524 bool Added = tryAddEdge(
A,
B);
2526 AddedEdges.emplace_back(
A,
B);
2534void SchedGroup::link(SUnit &SU,
bool MakePred) {
2535 for (
auto *
A : Collection) {
2537 if (
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2546void SchedGroup::link(SUnit &SU,
2547 function_ref<
bool(
const SUnit *
A,
const SUnit *
B)>
P) {
2548 for (
auto *
A : Collection) {
2557void SchedGroup::link(SchedGroup &OtherGroup) {
2558 for (
auto *
B : OtherGroup.Collection)
2562bool SchedGroup::canAddSU(SUnit &SU)
const {
2564 if (
MI.getOpcode() != TargetOpcode::BUNDLE)
2565 return canAddMI(
MI);
2568 const MachineBasicBlock *
MBB =
MI.getParent();
2570 while (
E !=
MBB->
end() &&
E->isBundledWithPred())
2574 return std::all_of(
B,
E, [
this](MachineInstr &
MI) {
return canAddMI(
MI); });
2578void SchedGroup::findCandidateSUnits(
T Begin,
T End,
2579 SUnitsToCandidateSGsMap &SyncedInstrs) {
2582 SyncedInstrs[&SU].push_back(SGID);
2586void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) {
2587 findCandidateSUnits(DAG->
SUnits.rbegin(), DAG->
SUnits.rend(), SyncedInstrs);
2590void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2591 const TargetSchedModel *TSchedModel = DAGInstrs->
getSchedModel();
2592 if (!TSchedModel || DAGInstrs->
SUnits.empty())
2597 TII =
ST.getInstrInfo();
2598 DAG =
static_cast<ScheduleDAGMI *
>(DAGInstrs);
2599 SyncedSchedGroups.clear();
2600 SyncedInstrs.clear();
2601 bool FoundSB =
false;
2602 bool FoundIGLP =
false;
2603 bool ShouldApplyIGLP =
false;
2604 for (
auto R = DAG->
SUnits.rbegin(),
E = DAG->
SUnits.rend(); R !=
E; ++R) {
2605 unsigned Opc =
R->getInstr()->getOpcode();
2607 if (
Opc == AMDGPU::SCHED_BARRIER) {
2608 addSchedBarrierEdges(*R);
2610 }
else if (
Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2611 initSchedGroupBarrierPipelineStage(R);
2613 }
else if (
Opc == AMDGPU::IGLP_OPT) {
2614 if (!FoundSB && !FoundIGLP) {
2616 ShouldApplyIGLP = initIGLPOpt(*R);
2621 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2622 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2630void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2632 assert(
MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2633 LLVM_DEBUG(
dbgs() <<
"Building SchedGroup for SchedBarrier with Mask: "
2634 <<
MI.getOperand(0).getImm() <<
"\n");
2636 invertSchedBarrierMask((SchedGroupMask)
MI.getOperand(0).getImm());
2637 SchedGroup SG(InvertedMask, std::nullopt, DAG,
TII);
2639 for (SUnit &SU : DAG->
SUnits)
2640 if (SG.canAddSU(SU))
2646 (function_ref<
bool(
const SUnit *
A,
const SUnit *
B)>)[](
2647 const SUnit *
A,
const SUnit *
B) {
return A->NodeNum >
B->NodeNum; });
2651IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask)
const {
2654 SchedGroupMask InvertedMask = ~Mask;
2657 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2658 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask
::SALU &
2661 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2662 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2663 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2664 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2665 InvertedMask &= ~SchedGroupMask::ALU;
2668 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2669 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE &
2670 ~SchedGroupMask::LDSDMA;
2672 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2673 (InvertedMask & SchedGroupMask::VMEM_WRITE) ==
2674 SchedGroupMask::NONE ||
2675 (InvertedMask & SchedGroupMask::LDSDMA) == SchedGroupMask::NONE)
2676 InvertedMask &= ~SchedGroupMask
::VMEM;
2679 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2680 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE &
2681 ~SchedGroupMask::LDSDMA;
2683 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2684 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2685 InvertedMask &= ~SchedGroupMask
::DS;
2687 LLVM_DEBUG(
dbgs() <<
"After Inverting, SchedGroup Mask: " << (
int)InvertedMask
2690 return InvertedMask;
2693void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2694 std::vector<SUnit>::reverse_iterator RIter) {
2695 MachineInstr &SGB = *RIter->getInstr();
2702 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2705 SG.findCandidateSUnits(RIter, SG.DAG->
SUnits.rend(),
2706 SyncedInstrs[SG.getSyncID()]);
2709bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2712 auto S = createIGLPStrategy(StrategyID, DAG,
TII);
2713 if (!S->shouldApplyStrategy(DAG,
Phase))
2716 IsBottomUp = S->IsBottomUp;
2717 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups,
Phase);
2727std::unique_ptr<ScheduleDAGMutation>
2729 return std::make_unique<IGroupLPDAGMutation>(
Phase);
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the DenseMap class.
const HexagonInstrInfo * TII
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
Register const TargetRegisterInfo * TRI
Interface definition for SIInstrInfo.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
const HexagonRegisterInfo & getRegisterInfo() const
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
@ Data
Regular data dependence (aka true-dependence).
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Scheduling unit. This is a node in the scheduling DAG.
unsigned NodeNum
Entry # of node in the node vector.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
std::vector< SUnit > SUnits
The scheduling units.
MachineFunction & MF
Machine function.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
IGLPStrategyID
Operand 0 immediate for IGLP_OPT pseudo instructions.
@ MFMASmallGemmSingleWaveOptID
@ MFMAExpSimpleInterleaveID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
LLVM_ABI void link(std::unique_ptr< LinkGraph > G, std::unique_ptr< JITLinkContext > Ctx)
Link the given graph.
This is an optimization pass for GlobalISel generic memory operations.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
@ LLVM_MARK_AS_BITMASK_ENUM
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Function object to check whether the second component of a container supported by std::get (like std:...