30#define DEBUG_TYPE "igrouplp"
36 cl::desc(
"Whether to use the exponential time solver to fit "
37 "the instructions to the pipeline as closely as "
43 cl::desc(
"The maximum number of scheduling group conflicts "
44 "which we attempt to solve with the exponential time "
45 "exact solver. Problem sizes greater than this will"
46 "be solved by the less accurate greedy algorithm. Selecting "
47 "solver by size is superseded by manually selecting "
48 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
52 cl::desc(
"The amount of branches that we are willing to explore with"
53 "the exact algorithm before giving up."));
57 cl::desc(
"Whether to use the cost heuristic to make choices as we "
58 "traverse the search space using the exact solver. Defaulted "
59 "to on, and if turned off, we will use the node order -- "
60 "attempting to put the later nodes in the later sched groups. "
61 "Experimentally, results are mixed, so this should be set on a "
62 "case-by-case basis."));
66enum class SchedGroupMask {
79 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
80 DS_READ | DS_WRITE | TRANS,
89class InstructionRule {
95 std::optional<SmallVector<SUnit *, 4>> Cache;
105 bool NeedsCache =
false)
112 virtual ~InstructionRule() =
default;
125 SchedGroupMask SGMask;
128 std::optional<unsigned> MaxSize;
141 static unsigned NumSchedGroups;
158 bool canAddSU(
SUnit &SU)
const;
163 void link(
SUnit &SU,
bool MakePred =
false);
167 int link(
SUnit &SU,
bool MakePred,
168 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
177 void link(SchedGroup &OtherGroup);
180 bool isFull()
const {
return MaxSize && Collection.
size() >= *MaxSize; }
186 void addRule(std::shared_ptr<InstructionRule> NewRule) {
191 bool allowedByRules(
const SUnit *SU,
195 for (
size_t I = 0;
I < Rules.
size();
I++) {
196 auto TheRule = Rules[
I].get();
197 if (!TheRule->apply(SU, Collection, SyncPipe)) {
205 void add(
SUnit &SU) {
207 <<
format_hex((
int)SGMask, 10,
true) <<
" adding "
213 void pop() { Collection.
pop_back(); }
216 void initSchedGroup();
223 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
224 SUnitsToCandidateSGsMap &SyncedInstrs);
226 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
228 int getSyncID() {
return SyncID; }
230 int getSGID() {
return SGID; }
232 SchedGroupMask getMask() {
return SGMask; }
234 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
236 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG),
TII(
TII) {
237 SGID = NumSchedGroups++;
240 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
int SyncID,
242 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG),
TII(
TII) {
243 SGID = NumSchedGroups++;
253 while (!SU.
Preds.empty())
257 while (!SU.
Succs.empty())
258 for (
auto &S : SU.
Succs)
259 for (
auto &SP : S.getSUnit()->Preds)
260 if (SP.getSUnit() == &SU)
261 S.getSUnit()->removePred(SP);
264typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
276class PipelineSolver {
289 bool NeedsSolver =
false;
293 unsigned computeProblemSize();
304 int CurrConflInstNo = 0;
306 int CurrSyncGroupIdx = 0;
308 int BeginSyncGroupIdx = 0;
317 void advancePosition();
320 void retreatPosition();
329 template <
typename T>
330 void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
336 template <
typename T>
343 template <
typename T>
void linkSchedGroups(
T I,
T E);
347 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
351 template <
typename T>
352 int linkSUnit(
SUnit *SU,
int SGID,
353 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
355 void removeEdges(
const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
357 void convertSyncMapsToArrays();
369 : DAG(DAG), SyncedInstrs(SyncedInstrs),
370 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
372 for (
auto &PipelineInstrs : SyncedInstrs) {
373 if (PipelineInstrs.second.
size() > 0) {
382 convertSyncMapsToArrays();
384 CurrPipeline = BestPipeline;
386 while (
static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.
size() &&
387 PipelineInstrs[BeginSyncGroupIdx].
size() == 0)
390 if (
static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.
size())
395void PipelineSolver::reset() {
397 for (
auto &SyncPipeline : CurrPipeline) {
398 for (
auto &SG : SyncPipeline) {
400 SG.Collection.
clear();
404 if (SchedBarr != TempCollection.
end())
405 SG.Collection.push_back(*SchedBarr);
409 CurrSyncGroupIdx = BeginSyncGroupIdx;
414void PipelineSolver::convertSyncMapsToArrays() {
415 for (
auto &SyncPipe : SyncedSchedGroups) {
416 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
419 int PipelineIDx = SyncedInstrs.size() - 1;
420 PipelineInstrs.resize(SyncedInstrs.size());
421 for (
auto &SyncInstrMap : SyncedInstrs) {
422 for (
auto &SUsToCandSGs : SyncInstrMap.second) {
423 if (PipelineInstrs[PipelineIDx].
size() == 0) {
424 PipelineInstrs[PipelineIDx].push_back(
425 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
428 auto SortPosition = PipelineInstrs[PipelineIDx].begin();
431 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
432 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
434 PipelineInstrs[PipelineIDx].insert(
435 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
441template <
typename T>
void PipelineSolver::linkSchedGroups(
T I,
T E) {
442 for (;
I != E; ++
I) {
444 for (
auto J = std::next(
I); J != E; ++J) {
451void PipelineSolver::makePipeline() {
453 for (
auto &SyncPipeline : BestPipeline) {
455 for (
auto &SG : SyncPipeline) {
458 SUnit *SGBarr =
nullptr;
459 for (
auto &SU : SG.Collection) {
467 resetEdges(*SGBarr, DAG);
468 SG.link(*SGBarr,
false);
472 for (
auto &SyncPipeline : BestPipeline) {
473 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
474 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
479int PipelineSolver::linkSUnit(
480 SUnit *SU,
int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
482 bool MakePred =
false;
485 if (
I->getSGID() == SGID) {
490 AddedCost += Group.link(*SU, MakePred, AddedEdges);
496int PipelineSolver::addEdges(
498 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
508 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
rbegin(),
510 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
begin(),
514void PipelineSolver::removeEdges(
515 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
518 for (
auto &PredSuccPair : EdgesToRemove) {
519 SUnit *Pred = PredSuccPair.first;
520 SUnit *Succ = PredSuccPair.second;
523 Succ->
Preds, [&Pred](
SDep &
P) { return P.getSUnit() == Pred; });
531void PipelineSolver::advancePosition() {
534 if (
static_cast<size_t>(CurrConflInstNo) >=
535 PipelineInstrs[CurrSyncGroupIdx].
size()) {
539 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
540 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
545void PipelineSolver::retreatPosition() {
546 assert(CurrConflInstNo >= 0);
547 assert(CurrSyncGroupIdx >= 0);
549 if (CurrConflInstNo > 0) {
554 if (CurrConflInstNo == 0) {
557 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
562 while (PipelineInstrs[CurrSyncGroupIdx].
size() == 0)
565 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
569bool PipelineSolver::checkOptimal() {
570 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
571 if (BestCost == -1 || CurrCost < BestCost) {
572 BestPipeline = CurrPipeline;
579 bool DoneExploring =
false;
580 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
581 DoneExploring =
true;
583 return (DoneExploring || BestCost == 0);
587void PipelineSolver::populateReadyList(
589 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
590 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
591 assert(CurrSU.second.size() >= 1);
593 for (;
I != E; ++
I) {
594 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
597 return SG.getSGID() == CandSGID;
602 if (
Match->isFull()) {
603 ReadyList.push_back(std::pair(*
I, MissPenalty));
607 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
608 ReadyList.push_back(std::pair(*
I, TempCost));
609 removeEdges(AddedEdges);
611 ReadyList.push_back(std::pair(*
I, -1));
615 std::sort(ReadyList.begin(), ReadyList.end(),
616 [](std::pair<int, int>
A, std::pair<int, int>
B) {
617 return A.second < B.second;
621 assert(ReadyList.size() == CurrSU.second.size());
624bool PipelineSolver::solveExact() {
628 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
631 assert(
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
632 assert(
static_cast<size_t>(CurrConflInstNo) <
633 PipelineInstrs[CurrSyncGroupIdx].
size());
634 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
636 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
641 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.
rbegin(),
642 CurrSU.second.rend())
643 : populateReadyList(ReadyList, CurrSU.second.
begin(),
644 CurrSU.second.end());
646 auto I = ReadyList.
begin();
647 auto E = ReadyList.
end();
648 for (;
I != E; ++
I) {
652 if (BestCost != -1 && (CurrCost +
I->second > BestCost))
655 int CandSGID =
I->first;
657 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
658 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
660 for (
auto &SG : SyncPipeline) {
661 if (SG.getSGID() == CandSGID)
668 if (!
Match->allowedByRules(CurrSU.first, SyncPipeline))
672 << (
int)
Match->getMask() <<
"and ID " << CandSGID
674 Match->add(*CurrSU.first);
675 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
676 LLVM_DEBUG(
dbgs() <<
"Cost of Assignment: " << AddedCost <<
"\n");
677 CurrCost += AddedCost;
680 bool FinishedExploring =
false;
683 if (CurrCost < BestCost || BestCost == -1) {
685 FinishedExploring = BestCost != 0;
686 if (!FinishedExploring)
692 CurrCost -= AddedCost;
693 removeEdges(AddedEdges);
695 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
696 if (FinishedExploring)
703 CurrCost += MissPenalty;
706 LLVM_DEBUG(
dbgs() <<
"NOT Assigned (" << CurrSU.first->NodeNum <<
")\n");
708 bool FinishedExploring =
false;
709 if (CurrCost < BestCost || BestCost == -1) {
711 bool FinishedExploring = BestCost != 0;
712 if (!FinishedExploring)
718 CurrCost -= MissPenalty;
719 return FinishedExploring;
723void PipelineSolver::greedyFind(
724 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E) {
725 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
726 int BestNodeCost = -1;
728 SchedGroup *BestGroup =
nullptr;
729 int BestGroupID = -1;
730 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
732 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
738 for (;
I != E; ++
I) {
739 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
742 return SG.getSGID() == CandSGID;
746 LLVM_DEBUG(
dbgs() <<
"Trying SGID # " << CandSGID <<
" with Mask "
747 << (
int)
Match->getMask() <<
"\n");
749 if (
Match->isFull()) {
753 if (!
Match->allowedByRules(CurrSU.first, SyncPipeline)) {
754 LLVM_DEBUG(
dbgs() <<
"SGID # " << CandSGID <<
" has conflicting rule\n");
757 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
759 if (TempCost < BestNodeCost || BestNodeCost == -1) {
761 BestNodeCost = TempCost;
762 BestGroupID = CandSGID;
764 removeEdges(AddedEdges);
765 if (BestNodeCost == 0)
769 if (BestGroupID != -1) {
770 BestGroup->add(*CurrSU.first);
771 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
772 LLVM_DEBUG(
dbgs() <<
"Best Group has ID: " << BestGroupID <<
" and Mask"
773 << (
int)BestGroup->getMask() <<
"\n");
774 BestCost += TempCost;
776 BestCost += MissPenalty;
778 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
781bool PipelineSolver::solveGreedy() {
783 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
785 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
786 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
788 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
789 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
792 BestPipeline = CurrPipeline;
793 removeEdges(AddedEdges);
797unsigned PipelineSolver::computeProblemSize() {
798 unsigned ProblemSize = 0;
799 for (
auto &PipeConflicts : PipelineInstrs) {
800 ProblemSize += PipeConflicts.size();
806void PipelineSolver::solve() {
810 unsigned ProblemSize = computeProblemSize();
813 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
814 MissPenalty = (ProblemSize / 2) + 1;
817 if (EnableExactSolver || BelowCutoff) {
821 LLVM_DEBUG(
dbgs() <<
"Greedy produced best cost of " << BestCost <<
"\n");
825 LLVM_DEBUG(
dbgs() <<
"Exact produced best cost of " << BestCost <<
"\n");
837enum IGLPStrategyID :
int {
838 MFMASmallGemmOptID = 0,
839 MFMASmallGemmSingleWaveOptID = 1,
840 MFMAExpInterleave = 2
852 virtual bool applyIGLPStrategy(
866 virtual ~IGLPStrategy() =
default;
869class MFMASmallGemmOpt final :
public IGLPStrategy {
872 bool applyIGLPStrategy(
883 : IGLPStrategy(DAG,
TII) {
888bool MFMASmallGemmOpt::applyIGLPStrategy(
893 unsigned MFMACount = 0;
895 if (
TII->isMFMAorWMMA(
I))
898 const unsigned PipelineSyncID = 0;
899 SchedGroup *SG =
nullptr;
900 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
901 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
902 SchedGroupMask::DS, 2, PipelineSyncID, DAG,
TII);
903 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
905 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
906 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
907 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
913class MFMAExpInterleaveOpt final :
public IGLPStrategy {
916 static unsigned TransPipeCount;
918 static unsigned MFMAPipeCount;
920 static unsigned AddPipeCount;
922 static unsigned MFMAEnablement;
924 static unsigned ExpRequirement;
926 static unsigned MFMAChains;
928 static unsigned MFMAChainLength;
933 static bool HasChainBetweenCvt;
935 static std::optional<unsigned> FirstPipeDSR;
944 class IsPipeExp final :
public InstructionRule {
949 auto DAG = SyncPipe[0].DAG;
951 if (Cache->empty()) {
953 auto E = DAG->
SUnits.rend();
954 for (;
I != E;
I++) {
955 if (
TII->isMFMAorWMMA(*
I->getInstr()))
956 Cache->push_back(&*
I);
962 auto Reaches = (std::any_of(
963 Cache->begin(), Cache->end(), [&SU, &DAG](
SUnit *TargetSU) {
964 return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
969 IsPipeExp(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
970 : InstructionRule(
TII, SGID, NeedsCache) {}
975 class EnablesNthMFMA final :
public InstructionRule {
982 bool FoundTrans =
false;
983 unsigned Counter = 1;
984 auto DAG = SyncPipe[0].DAG;
986 if (Cache->empty()) {
990 auto E = DAG->
SUnits.end();
991 for (;
I != E;
I++) {
992 if (FoundTrans &&
TII->isMFMAorWMMA(*
I->getInstr())) {
994 Cache->push_back(&*
I);
999 if (!FoundTrans &&
TII->isTRANS(
I->getInstr()->getOpcode()))
1010 bool NeedsCache =
false)
1016 class EnablesNthMFMAInChain final :
public InstructionRule {
1024 auto DAG = SyncPipe[0].DAG;
1026 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1029 if (Cache->empty()) {
1030 auto TempSU = ChainSeed;
1035 for (
auto &Succ : TempSU->Succs) {
1036 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1037 TempSU = Succ.getSUnit();
1046 Cache->push_back(TempSU);
1055 EnablesNthMFMAInChain(
unsigned Number,
SUnit *ChainSeed,
1057 bool NeedsCache =
false)
1059 ChainSeed(ChainSeed) {}
1065 class LessThanNSuccs final :
public InstructionRule {
1068 bool HasIntermediary =
false;
1073 if (!SyncPipe.
size())
1076 auto SuccSize = std::count_if(
1078 [](
const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1079 if (SuccSize >= Size)
1082 if (HasIntermediary) {
1083 for (
auto Succ : SU->
Succs) {
1084 auto SuccSize = std::count_if(
1086 [](
const SDep &SuccSucc) {
1087 return SuccSucc.getKind() == SDep::Data;
1089 if (SuccSize >= Size)
1096 LessThanNSuccs(
unsigned Size,
const SIInstrInfo *
TII,
unsigned SGID,
1097 bool HasIntermediary =
false,
bool NeedsCache =
false)
1098 : InstructionRule(
TII, SGID, NeedsCache), Size(Size),
1099 HasIntermediary(HasIntermediary) {}
1106 class GreaterThanOrEqualToNSuccs final :
public InstructionRule {
1109 bool HasIntermediary =
false;
1114 if (!SyncPipe.
size())
1117 auto SuccSize = std::count_if(
1119 [](
const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1120 if (SuccSize >= Size)
1123 if (HasIntermediary) {
1124 for (
auto Succ : SU->
Succs) {
1125 auto SuccSize = std::count_if(
1127 [](
const SDep &SuccSucc) {
1128 return SuccSucc.getKind() == SDep::Data;
1130 if (SuccSize >= Size)
1137 GreaterThanOrEqualToNSuccs(
unsigned Size,
const SIInstrInfo *
TII,
1138 unsigned SGID,
bool HasIntermediary =
false,
1139 bool NeedsCache =
false)
1140 : InstructionRule(
TII, SGID, NeedsCache), Size(Size),
1141 HasIntermediary(HasIntermediary) {}
1145 class IsCvt final :
public InstructionRule {
1150 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1151 Opc == AMDGPU::V_CVT_I32_F32_e32;
1153 IsCvt(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1154 : InstructionRule(
TII, SGID, NeedsCache) {}
1158 class IsFMA final :
public InstructionRule {
1165 IsFMA(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1166 : InstructionRule(
TII, SGID, NeedsCache) {}
1170 class IsPipeAdd final :
public InstructionRule {
1176 IsPipeAdd(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1177 : InstructionRule(
TII, SGID, NeedsCache) {}
1182 class IsSuccOfPrevNthGroup final :
public InstructionRule {
1184 unsigned Distance = 1;
1189 SchedGroup *OtherGroup =
nullptr;
1190 if (!SyncPipe.
size())
1193 for (
auto &PipeSG : SyncPipe) {
1194 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1195 OtherGroup = &PipeSG;
1200 if (!OtherGroup->Collection.size())
1203 for (
auto &OtherEle : OtherGroup->Collection) {
1204 for (
auto &Succ : OtherEle->Succs) {
1205 if (Succ.getSUnit() == SU && Succ.getKind() ==
SDep::Data)
1213 unsigned SGID,
bool NeedsCache =
false)
1214 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1219 class IsReachableFromPrevNthGroup final :
public InstructionRule {
1221 unsigned Distance = 1;
1226 SchedGroup *OtherGroup =
nullptr;
1227 if (!SyncPipe.
size())
1230 for (
auto &PipeSG : SyncPipe) {
1231 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1232 OtherGroup = &PipeSG;
1237 if (!OtherGroup->Collection.size())
1240 auto DAG = SyncPipe[0].DAG;
1242 for (
auto &OtherEle : OtherGroup->Collection)
1248 IsReachableFromPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
1249 unsigned SGID,
bool NeedsCache =
false)
1250 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1254 class OccursAtOrAfterNode final :
public InstructionRule {
1265 bool NeedsCache =
false)
1271 class IsExactMFMA final :
public InstructionRule {
1279 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1282 if (Cache->empty()) {
1283 auto TempSU = ChainSeed;
1288 for (
auto &Succ : TempSU->Succs) {
1289 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1290 TempSU = Succ.getSUnit();
1299 Cache->push_back(TempSU);
1305 return (*Cache)[0] == SU;
1309 unsigned SGID,
bool NeedsCache =
false)
1311 ChainSeed(ChainSeed) {}
1317 class OccursAfterExp final :
public InstructionRule {
1323 auto DAG = SyncPipe[0].DAG;
1324 if (Cache->empty()) {
1325 for (
auto &SU : DAG->
SUnits)
1334 return SU->
NodeNum > (*Cache)[0]->NodeNum;
1338 bool NeedsCache =
false)
1339 : InstructionRule(
TII, SGID, NeedsCache) {}
1343 bool applyIGLPStrategy(
1352 : IGLPStrategy(DAG,
TII) {
// Out-of-line definitions of MFMAExpInterleaveOpt's static DAG-analysis state.
// NOTE(review): these appear to be populated by analyzeDAG() and then reused
// across scheduling phases (applyIGLPStrategy clears/recomputes them outside
// the PostRA phase) -- confirm against the full definitions, which are not
// fully visible in this chunk.
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
// NodeNum of the first DS-read feeding the pipeline, if any (set from
// MFMAChainSeeds[0]'s DS-load predecessor; see the analyzeDAG fragment).
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1375 auto isBitPack = [](
unsigned Opc) {
1376 return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
1379 auto isCvt = [](
unsigned Opc) {
1380 return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
1383 auto isAdd = [](
unsigned Opc) {
return Opc == AMDGPU::V_ADD_F32_e32; };
1388 if (
TII->isTRANS(Opc)) {
1390 if (SU.
Succs.size() >= 7)
1392 for (
auto &Succ : SU.
Succs) {
1393 if (Succ.getSUnit()->Succs.size() >= 7)
1412 if (!(PackSUs.
size() && MFMAPipeCands.
size() && ExpPipeCands.
size()))
1417 std::optional<SUnit *> TempMFMA;
1418 std::optional<SUnit *> TempExp;
1420 for (
auto &PredSU : ExpPipeCands) {
1421 for (
auto &SuccSU : MFMAPipeCands) {
1434 if (!(TempExp && TempMFMA))
1437 HasChainBetweenCvt =
1438 std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
1439 [&isCvt](
SDep &Succ) {
1440 return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1441 }) == (*TempExp)->Succs.end();
1444 for (
auto &SuccSU : MFMAPipeCands) {
1445 if (MFMAPipeSUs.
size() &&
1446 std::find_if(MFMAPipeSUs.
begin(), MFMAPipeSUs.
end(),
1447 [&SuccSU](
SUnit *PotentialMatch) {
1448 return PotentialMatch->NodeNum == SuccSU->NodeNum;
1449 }) != MFMAPipeSUs.
end())
1452 for (
auto &PredSU : ExpPipeCands) {
1460 MFMAPipeCount = MFMAPipeSUs.
size();
1462 assert(TempExp && TempMFMA);
1463 assert(MFMAPipeCount > 0);
1465 std::optional<SUnit *> TempCvt;
1466 for (
auto &SuccSU : CvtSUs) {
1474 if (TempCvt.has_value()) {
1475 for (
auto &SuccSU : MFMAPipeSUs) {
1484 for (
auto &MFMAPipeSU : MFMAPipeSUs) {
1485 if (MFMAChainSeeds.size() &&
1486 std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) !=
1487 MFMAChainSeeds.end())
1489 if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
1491 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1493 MFMAChainSeeds.push_back(MFMAPipeSU);
1501 for (
auto Pred : MFMAChainSeeds[0]->Preds) {
1502 if (
TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1503 Pred.getSUnit()->getInstr()->mayLoad())
1504 FirstPipeDSR = Pred.getSUnit()->NodeNum;
1507 MFMAChainLength = MFMAPipeCount / MFMAChains;
1510 unsigned PackSuccCount = std::count_if(
1511 PackSUs.
begin(), PackSUs.
end(), [
this, &TempExp](
SUnit *VPack) {
1512 return DAG->IsReachable(VPack, *TempExp);
1516 unsigned PackPredCount =
1517 std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1518 [&isBitPack](
SDep &Pred) {
1519 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1520 return isBitPack(Opc);
1524 std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1525 [&isBitPack](
SDep &Pred) {
1526 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1527 return isBitPack(Opc);
1530 if (PackPred == (*TempMFMA)->Preds.end())
1537 std::count_if(PackPred->getSUnit()->Succs.begin(),
1538 PackPred->getSUnit()->Succs.end(), [&
TII](
SDep &Succ) {
1539 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1543 MFMAEnablement *= PackSuccCount;
1547 std::count_if(ExpPipeCands.
begin(), ExpPipeCands.
end(),
1548 [
this, &PackPred](
SUnit *ExpBase) {
1549 return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1552 ExpRequirement *= PackPredCount;
1561 if (
Phase != AMDGPU::SchedulingPhase::PostRA)
1562 MFMAChainSeeds.clear();
1563 if (
Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(
TII))
1569bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1574 bool IsSmallKernelType =
1575 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1576 bool IsLargeKernelType =
1577 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1579 if (!(IsSmallKernelType || IsLargeKernelType))
1585 unsigned PipelineSyncID = 0;
1586 SchedGroup *SG =
nullptr;
1588 unsigned MFMAChain = 0;
1589 unsigned PositionInChain = 0;
1590 unsigned CurrMFMAForTransPosition = 0;
1592 auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1593 &CurrMFMAForTransPosition]() {
1594 CurrMFMAForTransPosition += MFMAEnablement;
1595 PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1596 MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1599 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1600 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1601 return (TempMFMAForTrans / MFMAChains);
1604 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1605 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1606 return TempMFMAForTrans % MFMAChains;
1609 unsigned CurrMFMAPosition = 0;
1610 unsigned MFMAChainForMFMA = 0;
1611 unsigned PositionInChainForMFMA = 0;
1613 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1614 &PositionInChainForMFMA]() {
1616 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1617 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1620 bool IsPostRA =
Phase == AMDGPU::SchedulingPhase::PostRA;
1621 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1623 bool UsesFMA = IsSmallKernelType || !IsPostRA;
1624 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1625 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1626 bool UsesVALU = IsSmallKernelType;
1631 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1632 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1633 if (!IsPostRA && MFMAChains) {
1634 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1635 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1639 std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1640 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1641 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1644 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1645 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1646 if (!IsPostRA && MFMAChains) {
1647 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1648 getNextTransPositionInChain(),
1649 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1651 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1652 SG->getSGID(),
true));
1653 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1654 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1658 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1659 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1660 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1662 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1666 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1667 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG,
TII);
1668 if (!IsPostRA && MFMAChains)
1669 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1670 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
true));
1672 SG->addRule(std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1673 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1674 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1675 HasChainBetweenCvt));
1676 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1678 incrementTransPosition();
1681 for (
unsigned I = 0;
I < ExpRequirement;
I++) {
1684 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1685 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1686 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1687 if (HasChainBetweenCvt)
1688 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1689 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1691 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1692 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1693 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1698 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1699 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1700 if (!IsPostRA && MFMAChains) {
1701 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1702 getNextTransPositionInChain(),
1703 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1705 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1706 TII, SG->getSGID(),
true));
1707 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1708 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1712 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1713 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1714 if (!IsPostRA && MFMAChains)
1715 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1716 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1719 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1720 SG->getSGID(),
true));
1721 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1722 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1723 HasChainBetweenCvt));
1724 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1729 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1730 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1731 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1732 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1733 8,
TII, SG->getSGID(), HasChainBetweenCvt));
1734 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1739 unsigned MFMARatio =
1740 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1743 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1745 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1746 ? TransPipeCount - (2 * ExpRequirement)
1748 unsigned ExpLoopCount = RemainingExp / ExpRatio;
1750 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1751 ? MFMAPipeCount - (MFMAEnablement * 2)
1753 unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1755 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1756 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1758 for (
unsigned I = 0;
I < LoopSize;
I++) {
1759 if (!(
I * ExpRatio % ExpRequirement))
1760 incrementTransPosition();
1763 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1764 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG,
TII);
1765 if (!IsPostRA && MFMAChains)
1766 SG->addRule(std::make_shared<IsExactMFMA>(
1767 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA],
TII,
1768 SG->getSGID(),
true));
1770 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1771 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1772 incrementMFMAPosition();
1775 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1776 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG,
TII);
1777 SG->addRule(std::make_shared<IsPipeAdd>(
TII, SG->getSGID()));
1778 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1781 if (UsesDSRead && !(
I % 4)) {
1782 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1783 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1784 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1786 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1790 for (
unsigned J = 0; J < ExpRatio; J++) {
1791 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (
I + 1);
1792 auto MaxMFMAOffset =
1793 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1797 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1798 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1799 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1800 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1801 auto DSROffset =
I / 4 + 1;
1802 auto MaxDSROffset = MaxMFMAOffset / 4;
1804 auto ExpOffset =
I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1805 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1806 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1808 if (HasChainBetweenCvt)
1809 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1810 CurrentOffset,
TII, SG->getSGID()));
1812 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset,
TII,
1814 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1819 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1820 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1821 if (!IsPostRA && MFMAChains)
1822 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1823 getNextTransPositionInChain(),
1824 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
1827 SG->addRule(std::make_shared<EnablesNthMFMA>(
1828 (((
I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1829 TII, SG->getSGID(),
true));
1830 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1831 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1835 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1836 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1837 if (!IsPostRA && MFMAChains)
1838 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1839 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1842 SG->addRule(std::make_shared<EnablesNthMFMA>(
1843 (((
I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1844 TII, SG->getSGID(),
true));
1845 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1846 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1847 HasChainBetweenCvt));
1848 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1853 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1854 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG,
TII);
1855 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1856 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1860class MFMASmallGemmSingleWaveOpt final :
public IGLPStrategy {
1863 class EnablesInitialMFMA final :
public InstructionRule {
1867 if (!SyncPipe.
size())
1870 if (!Cache->size()) {
1871 for (
auto &Elt : SyncPipe[0].DAG->
SUnits) {
1872 if (
TII->isMFMAorWMMA(*Elt.getInstr())) {
1876 Cache->push_back(&Elt);
1882 auto DAG = SyncPipe[0].DAG;
1883 for (
auto &Elt : *Cache) {
1891 bool NeedsCache =
false)
1892 : InstructionRule(
TII, SGID, NeedsCache) {}
1896 class IsPermForDSW final :
public InstructionRule {
1901 if (
MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1904 bool FitsInGroup =
false;
1906 if (!Collection.
size()) {
1907 for (
auto &Succ : SU->
Succs) {
1908 SUnit *SuccUnit = Succ.getSUnit();
1911 Cache->push_back(SuccUnit);
1924 return ThisSucc.getSUnit() == Elt;
1929 IsPermForDSW(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1930 : InstructionRule(
TII, SGID, NeedsCache) {}
1934 class IsSuccOfPrevGroup final :
public InstructionRule {
1938 SchedGroup *OtherGroup =
nullptr;
1939 for (
auto &PipeSG : SyncPipe) {
1940 if ((
unsigned)PipeSG.getSGID() == SGID - 1) {
1941 OtherGroup = &PipeSG;
1947 if (!OtherGroup->Collection.size())
1951 return (std::any_of(OtherGroup->Collection.begin(),
1952 OtherGroup->Collection.end(), [&SU](
SUnit *Elt) {
1953 return std::any_of(Elt->Succs.begin(),
1956 return Succ.getSUnit() == SU;
1961 bool NeedsCache =
false)
1962 : InstructionRule(
TII, SGID, NeedsCache) {}
1966 class VMEMSize final :
public InstructionRule {
1971 if (
MI->getOpcode() == TargetOpcode::BUNDLE)
1973 if (!Collection.
size())
1978 auto TRI =
TII->getRegisterInfo();
1979 auto &
MRI =
MI->getParent()->getParent()->getRegInfo();
1980 for (
auto &Elt : Collection) {
1981 auto Op = Elt->getInstr()->getOperand(0);
1983 TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(
MRI,
Op));
1987 if (NumBits < 128) {
1989 if (NumBits +
TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(
1990 MRI,
MI->getOperand(0))) <=
1998 VMEMSize(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1999 : InstructionRule(
TII, SGID, NeedsCache) {}
2004 class SharesPredWithPrevNthGroup final :
public InstructionRule {
2006 unsigned Distance = 1;
2011 SchedGroup *OtherGroup =
nullptr;
2012 if (!SyncPipe.
size())
2015 if (!Cache->size()) {
2017 for (
auto &PipeSG : SyncPipe) {
2018 if ((
unsigned)PipeSG.getSGID() == SGID - Distance) {
2019 OtherGroup = &PipeSG;
2025 if (!OtherGroup->Collection.size())
2028 for (
auto &OtherEle : OtherGroup->Collection) {
2029 for (
auto &Pred : OtherEle->Preds) {
2030 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2031 AMDGPU::V_PERM_B32_e64)
2032 Cache->push_back(Pred.getSUnit());
2041 auto DAG = SyncPipe[0].DAG;
2048 SharesPredWithPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
2049 unsigned SGID,
bool NeedsCache =
false)
2050 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
2054 bool applyIGLPStrategy(
2065 : IGLPStrategy(DAG,
TII) {
2070static unsigned DSWCount = 0;
2071static unsigned DSWWithPermCount = 0;
2072static unsigned DSWWithSharedVMEMCount = 0;
2074bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2078 unsigned MFMACount = 0;
2079 unsigned DSRCount = 0;
2081 bool IsInitial =
Phase == AMDGPU::SchedulingPhase::Initial;
2083 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2084 DSWWithSharedVMEMCount == 0)) &&
2085 "DSWCounters should be zero in pre-RA scheduling!");
2087 for (
auto &SU : DAG->SUnits) {
2088 auto I = SU.getInstr();
2089 if (
TII->isMFMAorWMMA(*
I))
2091 else if (
TII->isDS(*
I)) {
2094 else if (
I->mayStore() && IsInitial) {
2096 for (
auto Pred : SU.Preds) {
2097 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2098 AMDGPU::V_PERM_B32_e64) {
2108 DSWWithPermCount = DSWithPerms.
size();
2109 auto I = DSWithPerms.
begin();
2110 auto E = DSWithPerms.
end();
2120 for (;
I != E;
I++) {
2121 SUnit *Cand =
nullptr;
2122 bool MissedAny =
false;
2123 for (
auto &Pred : (*I)->Preds) {
2124 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2130 for (
auto &Succ : Pred.getSUnit()->Succs) {
2131 auto MI = Succ.getSUnit()->getInstr();
2132 if (!
TII->isVMEM(*
MI) || !
MI->mayLoad())
2135 if (MissedAny || !VMEMLookup.
size()) {
2137 VMEMLookup[
MI] = *
I;
2143 VMEMLookup[
MI] = *
I;
2147 Cand = VMEMLookup[
MI];
2154 if (!MissedAny && Cand) {
2155 DSWWithSharedVMEMCount += 2;
2162 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2164 unsigned PipelineSyncID = 0;
2166 if (DSWWithPermCount) {
2167 for (
unsigned I = 0;
I < MFMACount;
I++) {
2168 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2169 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2170 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2172 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2173 SchedGroupMask::VALU, 2, PipelineSyncID, DAG,
TII);
2174 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2184 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2185 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG,
TII);
2186 SG->addRule(std::make_shared<EnablesInitialMFMA>(
TII, SG->getSGID(),
true));
2187 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2189 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2190 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2191 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2194 for (
unsigned I = 0;
I < DSRCount - 4; ++
I) {
2195 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2196 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG,
TII);
2197 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2199 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2200 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2201 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2207 for (
unsigned I = 0;
I < DSWWithPermCount - DSWWithSharedVMEMCount; ++
I) {
2208 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2209 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2210 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2211 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2213 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2214 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2215 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2216 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2218 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2219 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2220 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2221 1,
TII, SG->getSGID(),
true));
2222 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2223 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2225 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2226 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2227 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2229 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2230 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2231 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2232 3,
TII, SG->getSGID(),
true));
2233 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2234 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2236 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2237 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2238 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2244 for (
unsigned I = 0;
I < DSWCount - DSWWithPermCount;
I++) {
2245 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2246 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2247 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2249 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2250 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2251 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2252 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2254 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2255 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2256 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2264 for (
unsigned I = 0;
I < DSWWithSharedVMEMCount; ++
I) {
2265 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2266 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2267 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2268 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2270 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2271 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2272 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2273 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2275 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2276 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2277 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2279 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2280 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2281 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2282 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2284 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2285 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2286 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2287 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2289 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2290 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2291 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2293 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2294 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2295 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2296 2,
TII, SG->getSGID(),
true));
2297 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2298 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2300 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2301 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2302 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2304 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2305 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2306 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2307 4,
TII, SG->getSGID(),
true));
2308 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2309 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2311 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2312 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2313 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2319static std::unique_ptr<IGLPStrategy>
2323 case MFMASmallGemmOptID:
2324 return std::make_unique<MFMASmallGemmOpt>(DAG,
TII);
2325 case MFMASmallGemmSingleWaveOptID:
2326 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG,
TII);
2327 case MFMAExpInterleave:
2328 return std::make_unique<MFMAExpInterleaveOpt>(DAG,
TII);
2349 void addSchedBarrierEdges(
SUnit &SU);
2360 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask)
const;
2363 void initSchedGroupBarrierPipelineStage(
2364 std::vector<SUnit>::reverse_iterator RIter);
2366 bool initIGLPOpt(
SUnit &SU);
2376 bool IsBottomUp = 1;
2381 IGroupLPDAGMutation() =
default;
2385unsigned SchedGroup::NumSchedGroups = 0;
2397 if (
MI.isMetaInstruction())
2400 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2405 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2409 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2413 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2414 TII->isMFMAorWMMA(
MI))
2417 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2421 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2426 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2431 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2435 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2436 MI.mayLoad() &&
TII->isDS(
MI))
2439 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2440 MI.mayStore() &&
TII->isDS(
MI))
2443 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2448 dbgs() <<
"For SchedGroup with mask " <<
format_hex((
int)SGMask, 10,
true)
2449 << (Result ?
" could classify " :
" unable to classify ") <<
MI);
2454int SchedGroup::link(
SUnit &SU,
bool MakePred,
2455 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2456 int MissedEdges = 0;
2457 for (
auto *
A : Collection) {
2459 if (
A ==
B ||
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2469 bool Added = tryAddEdge(
A,
B);
2471 AddedEdges.push_back(std::pair(
A,
B));
2479void SchedGroup::link(
SUnit &SU,
bool MakePred) {
2480 for (
auto *
A : Collection) {
2482 if (
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2491void SchedGroup::link(
SUnit &SU,
2493 for (
auto *
A : Collection) {
2502void SchedGroup::link(SchedGroup &OtherGroup) {
2503 for (
auto *
B : OtherGroup.Collection)
2507bool SchedGroup::canAddSU(
SUnit &SU)
const {
2509 if (
MI.getOpcode() != TargetOpcode::BUNDLE)
2510 return canAddMI(
MI);
2515 while (E !=
MBB->
end() && E->isBundledWithPred())
2522void SchedGroup::initSchedGroup() {
2523 for (
auto &SU : DAG->
SUnits) {
2532void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
2533 SUnitsToCandidateSGsMap &SyncedInstrs) {
2534 SUnit &InitSU = *RIter;
2535 for (
auto E = DAG->
SUnits.rend(); RIter != E; ++RIter) {
2541 SyncedInstrs[&SU].push_back(SGID);
2549void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
2550 auto I = DAG->
SUnits.rbegin();
2551 auto E = DAG->
SUnits.rend();
2552 for (;
I != E; ++
I) {
2557 SyncedInstrs[&SU].push_back(SGID);
2563 if (!TSchedModel || DAGInstrs->
SUnits.empty())
2568 TII =
ST.getInstrInfo();
2570 SyncedSchedGroups.clear();
2571 SyncedInstrs.clear();
2572 bool FoundSB =
false;
2573 bool FoundIGLP =
false;
2574 bool ShouldApplyIGLP =
false;
2575 for (
auto R = DAG->
SUnits.rbegin(), E = DAG->
SUnits.rend();
R != E; ++
R) {
2576 unsigned Opc =
R->getInstr()->getOpcode();
2578 if (Opc == AMDGPU::SCHED_BARRIER) {
2579 addSchedBarrierEdges(*R);
2581 }
else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2582 initSchedGroupBarrierPipelineStage(R);
2584 }
else if (Opc == AMDGPU::IGLP_OPT) {
2585 resetEdges(*R, DAG);
2586 if (!FoundSB && !FoundIGLP) {
2588 ShouldApplyIGLP = initIGLPOpt(*R);
2593 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2594 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2602void IGroupLPDAGMutation::addSchedBarrierEdges(
SUnit &SchedBarrier) {
2604 assert(
MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2607 resetEdges(SchedBarrier, DAG);
2608 LLVM_DEBUG(
dbgs() <<
"Building SchedGroup for SchedBarrier with Mask: "
2609 <<
MI.getOperand(0).getImm() <<
"\n");
2611 invertSchedBarrierMask((SchedGroupMask)
MI.getOperand(0).getImm());
2612 SchedGroup SG(InvertedMask, std::nullopt, DAG,
TII);
2613 SG.initSchedGroup();
2619 const SUnit *
A,
const SUnit *
B) {
return A->NodeNum >
B->NodeNum; });
2623IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask)
const {
2626 SchedGroupMask InvertedMask = ~Mask;
2629 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2630 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2631 ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2633 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2634 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2635 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2636 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2637 InvertedMask &= ~SchedGroupMask::ALU;
2640 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2641 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2643 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2644 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2645 InvertedMask &= ~SchedGroupMask::VMEM;
2648 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2649 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2651 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2652 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2653 InvertedMask &= ~SchedGroupMask::DS;
2655 LLVM_DEBUG(
dbgs() <<
"After Inverting, SchedGroup Mask: " << (
int)InvertedMask
2658 return InvertedMask;
2661void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2662 std::vector<SUnit>::reverse_iterator RIter) {
2665 resetEdges(*RIter, DAG);
2672 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2675 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
2678bool IGroupLPDAGMutation::initIGLPOpt(
SUnit &SU) {
2679 IGLPStrategyID StrategyID =
2681 auto S = createIGLPStrategy(StrategyID, DAG,
TII);
2682 if (!S->shouldApplyStrategy(DAG,
Phase))
2685 IsBottomUp = S->IsBottomUp;
2686 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups,
Phase);
2698std::unique_ptr<ScheduleDAGMutation>
2700 return std::make_unique<IGroupLPDAGMutation>(
Phase);
unsigned const MachineRegisterInfo * MRI
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file defines the DenseMap class.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
This class represents an Operation in the Expression.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
@ Data
Regular data dependence (aka true-dependence).
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Scheduling unit. This is a node in the scheduling DAG.
unsigned NodeNum
Entry # of node in the node vector.
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU)
True if an edge can be added from PredSU to SuccSU without creating a cycle.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Mutate the DAG as a postpass after normal DAG building.
virtual void apply(ScheduleDAGInstrs *DAG)=0
std::vector< SUnit > SUnits
The scheduling units.
MachineFunction & MF
Machine function.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Provide an instruction scheduling machine model to CodeGen passes.
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
initializer< Ty > init(const Ty &Val)
void link(std::unique_ptr< LinkGraph > G, std::unique_ptr< JITLinkContext > Ctx)
Link the given graph.
This is an optimization pass for GlobalISel generic memory operations.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.