42#define DEBUG_TYPE "si-insert-waitcnts"
45 "Force emit s_waitcnt expcnt(0) instrs");
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
49 "Force emit s_waitcnt vmcnt(0) instrs");
52 "amdgpu-waitcnt-forcezero",
53 cl::desc(
"Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
67 SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
70 NUM_EXTENDED_INST_CNTS,
71 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
89using RegInterval = std::pair<int, int>;
91struct HardwareLimits {
96 unsigned SamplecntMax;
101struct RegisterEncoding {
111 VMEM_SAMPLER_READ_ACCESS,
112 VMEM_BVH_READ_ACCESS,
114 SCRATCH_WRITE_ACCESS,
134enum RegisterMapping {
135 SQ_MAX_PGM_VGPRS = 512,
137 SQ_MAX_PGM_SGPRS = 256,
145 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
183 assert(updateVMCntOnly(Inst));
186 return VMEM_NOSAMPLER;
190 return BaseInfo->
BVH ? VMEM_BVH
191 : BaseInfo->
Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
203 return Wait.StoreCnt;
205 return Wait.SampleCnt;
216 unsigned &WC = getCounterRef(
Wait,
T);
217 WC = std::min(WC, Count);
221 getCounterRef(
Wait,
T) = ~0
u;
225 return getCounterRef(
Wait,
T);
229InstCounterType eventCounter(
const unsigned *masks, WaitEventType
E) {
230 for (
auto T : inst_counter_types()) {
231 if (masks[
T] & (1 <<
E))
245class WaitcntBrackets {
247 WaitcntBrackets(
const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
248 HardwareLimits Limits, RegisterEncoding Encoding,
249 const unsigned *WaitEventMaskForInst,
250 InstCounterType SmemAccessCounter)
251 :
ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
252 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
253 SmemAccessCounter(SmemAccessCounter) {}
255 unsigned getWaitCountMax(InstCounterType
T)
const {
258 return Limits.LoadcntMax;
260 return Limits.DscntMax;
262 return Limits.ExpcntMax;
264 return Limits.StorecntMax;
266 return Limits.SamplecntMax;
268 return Limits.BvhcntMax;
270 return Limits.KmcntMax;
277 unsigned getScoreLB(InstCounterType
T)
const {
282 unsigned getScoreUB(InstCounterType
T)
const {
287 unsigned getScoreRange(InstCounterType
T)
const {
288 return getScoreUB(
T) - getScoreLB(
T);
291 unsigned getRegScore(
int GprNo, InstCounterType
T)
const {
292 if (GprNo < NUM_ALL_VGPRS) {
293 return VgprScores[
T][GprNo];
295 assert(
T == SmemAccessCounter);
296 return SgprScores[GprNo - NUM_ALL_VGPRS];
305 bool counterOutOfOrder(InstCounterType
T)
const;
307 void simplifyWaitcnt(InstCounterType
T,
unsigned &Count)
const;
310 void applyWaitcnt(InstCounterType
T,
unsigned Count);
315 unsigned hasPendingEvent()
const {
return PendingEvents; }
316 unsigned hasPendingEvent(WaitEventType
E)
const {
317 return PendingEvents & (1 <<
E);
319 unsigned hasPendingEvent(InstCounterType
T)
const {
320 unsigned HasPending = PendingEvents & WaitEventMaskForInst[
T];
321 assert((HasPending != 0) == (getScoreRange(
T) != 0));
325 bool hasMixedPendingEvents(InstCounterType
T)
const {
326 unsigned Events = hasPendingEvent(
T);
328 return Events & (Events - 1);
331 bool hasPendingFlat()
const {
332 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
333 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
334 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
335 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
338 void setPendingFlat() {
339 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
340 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
345 bool hasOtherPendingVmemTypes(
int GprNo, VmemType V)
const {
346 assert(GprNo < NUM_ALL_VGPRS);
347 return VgprVmemTypes[GprNo] & ~(1 <<
V);
350 void clearVgprVmemTypes(
int GprNo) {
351 assert(GprNo < NUM_ALL_VGPRS);
352 VgprVmemTypes[GprNo] = 0;
355 void setStateOnFunctionEntryOrReturn() {
356 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
357 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
374 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
375 unsigned OtherScore);
377 void setScoreLB(InstCounterType
T,
unsigned Val) {
382 void setScoreUB(InstCounterType
T,
unsigned Val) {
389 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
393 void setRegScore(
int GprNo, InstCounterType
T,
unsigned Val) {
394 if (GprNo < NUM_ALL_VGPRS) {
395 VgprUB = std::max(VgprUB, GprNo);
396 VgprScores[
T][GprNo] = Val;
398 assert(
T == SmemAccessCounter);
399 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
400 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
406 unsigned OpNo,
unsigned Val);
409 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
410 HardwareLimits Limits = {};
411 RegisterEncoding Encoding = {};
412 const unsigned *WaitEventMaskForInst;
413 InstCounterType SmemAccessCounter;
414 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
415 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
416 unsigned PendingEvents = 0;
418 unsigned LastFlat[NUM_INST_CNTS] = {0};
423 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
426 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
429 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
441class WaitcntGenerator {
446 InstCounterType MaxCounter;
449 WaitcntGenerator() {}
450 WaitcntGenerator(
const GCNSubtarget *ST, InstCounterType MaxCounter)
466 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
481 virtual const unsigned *getWaitEventMask()
const = 0;
485 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
487 virtual ~WaitcntGenerator() =
default;
490 static constexpr unsigned
491 eventMask(std::initializer_list<WaitEventType> Events) {
493 for (
auto &
E : Events)
500class WaitcntGeneratorPreGFX12 :
public WaitcntGenerator {
502 WaitcntGeneratorPreGFX12() {}
504 : WaitcntGenerator(
ST, NUM_NORMAL_INST_CNTS) {}
507 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
515 const unsigned *getWaitEventMask()
const override {
518 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
519 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
520 VMEM_BVH_READ_ACCESS}),
521 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
522 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
523 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
524 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
529 return WaitEventMaskForInstPreGFX12;
532 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
535class WaitcntGeneratorGFX12Plus :
public WaitcntGenerator {
537 WaitcntGeneratorGFX12Plus() {}
538 WaitcntGeneratorGFX12Plus(
const GCNSubtarget *ST, InstCounterType MaxCounter)
539 : WaitcntGenerator(
ST, MaxCounter) {}
542 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
550 const unsigned *getWaitEventMask()
const override {
553 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
554 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
555 eventMask({LDS_ACCESS, GDS_ACCESS}),
556 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
557 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
558 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
559 eventMask({VMEM_SAMPLER_READ_ACCESS}),
560 eventMask({VMEM_BVH_READ_ACCESS}),
561 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
563 return WaitEventMaskForInstGFX12Plus;
566 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
583 std::unique_ptr<WaitcntBrackets>
Incoming;
587 InstCounterType SmemAccessCounter;
593 bool ForceEmitZeroWaitcnts;
594 bool ForceEmitWaitcnt[NUM_INST_CNTS];
601 WaitcntGeneratorPreGFX12 WCGPreGFX12;
602 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
604 WaitcntGenerator *WCG =
nullptr;
610 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
616 (void)ForceExpCounter;
617 (void)ForceLgkmCounter;
618 (void)ForceVMCounter;
621 bool shouldFlushVmCnt(
MachineLoop *
ML, WaitcntBrackets &Brackets);
623 WaitcntBrackets &ScoreBrackets);
628 return "SI insert wait instructions";
640 bool isForceEmitWaitcnt()
const {
641 for (
auto T : inst_counter_types())
642 if (ForceEmitWaitcnt[
T])
647 void setForceEmitWaitcnt() {
653 ForceEmitWaitcnt[
EXP_CNT] =
true;
655 ForceEmitWaitcnt[
EXP_CNT] =
false;
660 ForceEmitWaitcnt[DS_CNT] =
true;
661 ForceEmitWaitcnt[KM_CNT] =
true;
663 ForceEmitWaitcnt[DS_CNT] =
false;
664 ForceEmitWaitcnt[KM_CNT] =
false;
669 ForceEmitWaitcnt[LOAD_CNT] =
true;
670 ForceEmitWaitcnt[SAMPLE_CNT] =
true;
671 ForceEmitWaitcnt[BVH_CNT] =
true;
673 ForceEmitWaitcnt[LOAD_CNT] =
false;
674 ForceEmitWaitcnt[SAMPLE_CNT] =
false;
675 ForceEmitWaitcnt[BVH_CNT] =
false;
682 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
684 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
685 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
696 return SCRATCH_WRITE_ACCESS;
697 return VMEM_WRITE_ACCESS;
700 return VMEM_READ_ACCESS;
701 return VmemReadMapping[getVmemType(Inst)];
708 WaitcntBrackets &ScoreBrackets,
712 WaitcntBrackets &ScoreBrackets,
719 WaitcntBrackets *ScoreBrackets);
721 WaitcntBrackets &ScoreBrackets);
726RegInterval WaitcntBrackets::getRegInterval(
const MachineInstr *
MI,
729 unsigned OpNo)
const {
731 if (!
TRI->isInAllocatableClass(
Op.getReg()))
743 if (
TRI->isVectorRegister(*
MRI,
Op.getReg())) {
744 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
747 Result.first += AGPR_OFFSET;
749 }
else if (
TRI->isSGPRReg(*
MRI,
Op.getReg())) {
750 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
751 Result.first =
Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
753 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
761 unsigned Size =
TRI->getRegSizeInBits(*RC);
773 assert(
TRI->isVectorRegister(*
MRI,
MI->getOperand(OpNo).getReg()));
775 setRegScore(RegNo, EXP_CNT, Val);
783 InstCounterType
T = eventCounter(WaitEventMaskForInst,
E);
785 unsigned UB = getScoreUB(
T);
786 unsigned CurrScore = UB + 1;
792 PendingEvents |= 1 <<
E;
793 setScoreUB(
T, CurrScore);
803 if (AddrOpIdx != -1) {
804 setExpScore(&Inst,
TII,
TRI,
MRI, AddrOpIdx, CurrScore);
817 AMDGPU::OpName::data1),
822 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
823 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
826 if (
Op.isReg() && !
Op.isDef() &&
827 TRI->isVectorRegister(*
MRI,
Op.getReg())) {
828 setExpScore(&Inst,
TII,
TRI,
MRI,
I, CurrScore);
832 }
else if (
TII->isFLAT(Inst)) {
844 }
else if (
TII->isMIMG(Inst)) {
846 setExpScore(&Inst,
TII,
TRI,
MRI, 0, CurrScore);
853 }
else if (
TII->isMTBUF(Inst)) {
855 setExpScore(&Inst,
TII,
TRI,
MRI, 0, CurrScore);
857 }
else if (
TII->isMUBUF(Inst)) {
859 setExpScore(&Inst,
TII,
TRI,
MRI, 0, CurrScore);
866 }
else if (
TII->isLDSDIR(Inst)) {
873 if (
TII->isEXP(Inst)) {
892 setExpScore(&Inst,
TII,
TRI,
MRI,
I, CurrScore);
897 }
else if (Inst.
getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
898 Inst.
getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
899 Inst.
getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
905 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
912 if (!
Op.isReg() || !
Op.isDef())
915 if (
T == LOAD_CNT ||
T == SAMPLE_CNT ||
T == BVH_CNT) {
916 if (
Interval.first >= NUM_ALL_VGPRS)
918 if (updateVMCntOnly(Inst)) {
923 VmemType
V = getVmemType(Inst);
925 VgprVmemTypes[RegNo] |= 1 <<
V;
929 setRegScore(RegNo,
T, CurrScore);
933 (
TII->isDS(Inst) ||
TII->mayWriteLDSThroughDMA(Inst))) {
938 if (!
MemOp->isStore() ||
943 auto AAI =
MemOp->getAAInfo();
951 if (!AAI || !AAI.Scope)
953 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E && !Slot; ++
I) {
954 for (
const auto *
MemOp : LDSDMAStores[
I]->memoperands()) {
955 if (
MemOp->isStore() && AAI ==
MemOp->getAAInfo()) {
961 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
963 LDSDMAStores.push_back(&Inst);
964 Slot = LDSDMAStores.size();
967 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot,
T, CurrScore);
969 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS,
T, CurrScore);
976 for (
auto T : inst_counter_types(MaxCounter)) {
977 unsigned SR = getScoreRange(
T);
981 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
985 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
989 OS <<
" EXP_CNT(" << SR <<
"): ";
992 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
996 OS <<
" SAMPLE_CNT(" << SR <<
"): ";
999 OS <<
" BVH_CNT(" << SR <<
"): ";
1002 OS <<
" KM_CNT(" << SR <<
"): ";
1005 OS <<
" UNKNOWN(" << SR <<
"): ";
1011 unsigned LB = getScoreLB(
T);
1013 for (
int J = 0; J <= VgprUB; J++) {
1014 unsigned RegScore = getRegScore(J,
T);
1017 unsigned RelScore = RegScore - LB - 1;
1018 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1019 OS << RelScore <<
":v" << J <<
" ";
1021 OS << RelScore <<
":ds ";
1025 if (
T == SmemAccessCounter) {
1026 for (
int J = 0; J <= SgprUB; J++) {
1027 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS,
T);
1030 unsigned RelScore = RegScore - LB - 1;
1031 OS << RelScore <<
":s" << J <<
" ";
1043 simplifyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1044 simplifyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1045 simplifyWaitcnt(DS_CNT,
Wait.DsCnt);
1046 simplifyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1047 simplifyWaitcnt(SAMPLE_CNT,
Wait.SampleCnt);
1048 simplifyWaitcnt(BVH_CNT,
Wait.BvhCnt);
1049 simplifyWaitcnt(KM_CNT,
Wait.KmCnt);
1052void WaitcntBrackets::simplifyWaitcnt(InstCounterType
T,
1053 unsigned &Count)
const {
1057 if (Count >= getScoreRange(
T))
1061void WaitcntBrackets::determineWait(InstCounterType
T,
int RegNo,
1063 unsigned ScoreToWait = getRegScore(RegNo,
T);
1067 const unsigned LB = getScoreLB(
T);
1068 const unsigned UB = getScoreUB(
T);
1069 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1070 if ((
T == LOAD_CNT ||
T == DS_CNT) && hasPendingFlat() &&
1071 !
ST->hasFlatLgkmVMemCountInOrder()) {
1075 addWait(
Wait,
T, 0);
1076 }
else if (counterOutOfOrder(
T)) {
1080 addWait(
Wait,
T, 0);
1084 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(
T) - 1);
1085 addWait(
Wait,
T, NeededWait);
1091 applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1092 applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1093 applyWaitcnt(DS_CNT,
Wait.DsCnt);
1094 applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1095 applyWaitcnt(SAMPLE_CNT,
Wait.SampleCnt);
1096 applyWaitcnt(BVH_CNT,
Wait.BvhCnt);
1097 applyWaitcnt(KM_CNT,
Wait.KmCnt);
1100void WaitcntBrackets::applyWaitcnt(InstCounterType
T,
unsigned Count) {
1101 const unsigned UB = getScoreUB(
T);
1105 if (counterOutOfOrder(
T))
1107 setScoreLB(
T, std::max(getScoreLB(
T), UB - Count));
1110 PendingEvents &= ~WaitEventMaskForInst[
T];
1116bool WaitcntBrackets::counterOutOfOrder(InstCounterType
T)
const {
1118 if (
T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1120 return hasMixedPendingEvents(
T);
1130char SIInsertWaitcnts::
ID = 0;
1135 return new SIInsertWaitcnts();
1145 if (NewEnc == MO.
getImm())
1156 case AMDGPU::S_WAIT_LOADCNT:
1158 case AMDGPU::S_WAIT_EXPCNT:
1160 case AMDGPU::S_WAIT_STORECNT:
1162 case AMDGPU::S_WAIT_SAMPLECNT:
1164 case AMDGPU::S_WAIT_BVHCNT:
1166 case AMDGPU::S_WAIT_DSCNT:
1168 case AMDGPU::S_WAIT_KMCNT:
1175bool WaitcntGenerator::promoteSoftWaitCnt(
MachineInstr *Waitcnt)
const {
1189bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1190 WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
1193 assert(isNormalMode(MaxCounter));
1201 if (II.isMetaInstruction())
1205 bool IsSoft = Opcode != II.getOpcode();
1209 if (Opcode == AMDGPU::S_WAITCNT) {
1210 unsigned IEnc = II.getOperand(0).getImm();
1213 ScoreBrackets.simplifyWaitcnt(OldWait);
1217 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && IsSoft)) {
1218 II.eraseFromParent();
1223 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1224 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1227 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1229 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1230 Wait.StoreCnt = std::min(
Wait.StoreCnt, OldVSCnt);
1232 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && IsSoft)) {
1233 II.eraseFromParent();
1236 WaitcntVsCntInstr = &II;
1243 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1245 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1246 ScoreBrackets.applyWaitcnt(EXP_CNT,
Wait.ExpCnt);
1247 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1254 <<
"applyPreexistingWaitcnt\n"
1255 <<
"New Instr at block end: " << *WaitcntInstr <<
'\n'
1256 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1257 <<
"Old Instr: " << *It
1258 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1261 if (WaitcntVsCntInstr) {
1263 AMDGPU::OpName::simm16,
Wait.StoreCnt);
1264 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1266 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1267 Wait.StoreCnt = ~0
u;
1270 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1271 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1273 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1274 <<
"Old Instr: " << *It
1275 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1283bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1287 assert(isNormalMode(MaxCounter));
1294 if (
Wait.hasWaitExceptStoreCnt()) {
1296 [[maybe_unused]]
auto SWaitInst =
1301 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1302 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1305 if (
Wait.hasWaitStoreCnt()) {
1308 [[maybe_unused]]
auto SWaitInst =
1315 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1316 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1323WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1328WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1336bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1337 WaitcntBrackets &ScoreBrackets,
MachineInstr &OldWaitcntInstr,
1340 assert(!isNormalMode(MaxCounter));
1349 if (II.isMetaInstruction())
1358 bool IsSoft = Opcode != II.getOpcode();
1360 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1362 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1365 ScoreBrackets.simplifyWaitcnt(OldWait);
1367 UpdatableInstr = &CombinedLoadDsCntInstr;
1368 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1370 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1373 ScoreBrackets.simplifyWaitcnt(OldWait);
1375 UpdatableInstr = &CombinedStoreDsCntInstr;
1380 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1382 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1383 addWait(
Wait, CT.value(), OldCnt);
1384 UpdatableInstr = &WaitInstrs[CT.value()];
1388 if (!*UpdatableInstr) {
1389 *UpdatableInstr = &II;
1396 if (CombinedLoadDsCntInstr) {
1404 if (
Wait.LoadCnt != ~0u &&
Wait.DsCnt != ~0u) {
1407 AMDGPU::OpName::simm16, NewEnc);
1408 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1409 ScoreBrackets.applyWaitcnt(LOAD_CNT,
Wait.LoadCnt);
1410 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1415 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1416 <<
"New Instr at block end: "
1417 << *CombinedLoadDsCntInstr <<
'\n'
1418 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1419 <<
"Old Instr: " << *It <<
"New Instr: "
1420 << *CombinedLoadDsCntInstr <<
'\n');
1427 if (CombinedStoreDsCntInstr) {
1429 if (
Wait.StoreCnt != ~0u &&
Wait.DsCnt != ~0u) {
1432 AMDGPU::OpName::simm16, NewEnc);
1433 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1434 ScoreBrackets.applyWaitcnt(STORE_CNT,
Wait.StoreCnt);
1435 ScoreBrackets.applyWaitcnt(DS_CNT,
Wait.DsCnt);
1436 Wait.StoreCnt = ~0
u;
1440 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1441 <<
"New Instr at block end: "
1442 << *CombinedStoreDsCntInstr <<
'\n'
1443 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1444 <<
"Old Instr: " << *It <<
"New Instr: "
1445 << *CombinedStoreDsCntInstr <<
'\n');
1458 if (
Wait.DsCnt != ~0u) {
1467 if (
Wait.LoadCnt != ~0u) {
1468 WaitsToErase.
push_back(&WaitInstrs[LOAD_CNT]);
1469 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1470 }
else if (
Wait.StoreCnt != ~0u) {
1471 WaitsToErase.
push_back(&WaitInstrs[STORE_CNT]);
1472 WaitsToErase.
push_back(&WaitInstrs[DS_CNT]);
1479 (*WI)->eraseFromParent();
1485 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1486 if (!WaitInstrs[CT])
1489 unsigned NewCnt = getWait(
Wait, CT);
1490 if (NewCnt != ~0u) {
1492 AMDGPU::OpName::simm16, NewCnt);
1493 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1495 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1496 setNoWait(
Wait, CT);
1499 ?
dbgs() <<
"applyPreexistingWaitcnt\n"
1500 <<
"New Instr at block end: " << *WaitInstrs[CT]
1502 :
dbgs() <<
"applyPreexistingWaitcnt\n"
1503 <<
"Old Instr: " << *It
1504 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
1515bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1519 assert(!isNormalMode(MaxCounter));
1525 if (
Wait.DsCnt != ~0u) {
1528 if (
Wait.LoadCnt != ~0u) {
1536 }
else if (
Wait.StoreCnt != ~0u) {
1543 Wait.StoreCnt = ~0
u;
1551 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1552 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1559 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1560 unsigned Count = getWait(
Wait, CT);
1564 [[maybe_unused]]
auto SWaitInst =
1571 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1572 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1579 unsigned Opc =
MI.getOpcode();
1580 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1581 !
MI.getOperand(1).isUndef();
1611bool SIInsertWaitcnts::generateWaitcntInstBefore(
MachineInstr &
MI,
1612 WaitcntBrackets &ScoreBrackets,
1615 setForceEmitWaitcnt();
1617 if (
MI.isMetaInstruction())
1626 if (
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1627 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1628 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1629 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1630 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1637 if (
MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1638 MI.getOpcode() == AMDGPU::SI_RETURN ||
1639 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1641 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
false));
1649 else if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
1650 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1652 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1653 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1657 else if ((
MI.getOpcode() == AMDGPU::S_SENDMSG ||
1658 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1659 ST->hasLegacyGeometry() &&
1665 else if (
MI.getOpcode() == SC_FENCE) {
1666 const unsigned int group_size =
1667 context->shader_info->GetMaxThreadGroupSize();
1669 const bool group_is_multi_wave =
1670 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1671 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1673 for (
unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1674 SCRegType src_type = Inst->GetSrcType(i);
1677 if (group_is_multi_wave ||
1678 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1679 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1680 ScoreBrackets->getScoreUB(DS_CNT));
1682 if (target_info->HasBufferLoadToLDS()) {
1683 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1684 ScoreBrackets->getScoreUB(LOAD_CNT));
1690 if (group_is_multi_wave || fence_is_global) {
1691 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1692 ScoreBrackets->getScoreUB(EXP_CNT));
1693 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1694 ScoreBrackets->getScoreUB(DS_CNT));
1702 if (group_is_multi_wave || fence_is_global) {
1703 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1704 ScoreBrackets->getScoreUB(EXP_CNT));
1705 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1706 ScoreBrackets->getScoreUB(LOAD_CNT));
1723 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
1726 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1727 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1728 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1729 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1743 if (
MI.getOperand(CallAddrOpIdx).isReg()) {
1744 RegInterval CallAddrOpInterval =
1745 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, CallAddrOpIdx);
1747 for (
int RegNo = CallAddrOpInterval.first;
1748 RegNo < CallAddrOpInterval.second; ++RegNo)
1749 ScoreBrackets.determineWait(SmemAccessCounter, RegNo,
Wait);
1753 if (RtnAddrOpIdx != -1) {
1754 RegInterval RtnAddrOpInterval =
1755 ScoreBrackets.getRegInterval(&
MI,
MRI,
TRI, RtnAddrOpIdx);
1757 for (
int RegNo = RtnAddrOpInterval.first;
1758 RegNo < RtnAddrOpInterval.second; ++RegNo)
1759 ScoreBrackets.determineWait(SmemAccessCounter, RegNo,
Wait);
1778 const Value *
Ptr = Memop->getValue();
1779 if (Memop->isStore() && SLoadAddresses.
count(
Ptr)) {
1780 addWait(
Wait, SmemAccessCounter, 0);
1784 unsigned AS = Memop->getAddrSpace();
1788 if (
TII->mayWriteLDSThroughDMA(
MI))
1792 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1793 bool FoundAliasingStore =
false;
1800 if (
Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1801 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1802 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
1803 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
1804 FoundAliasingStore =
true;
1805 ScoreBrackets.determineWait(LOAD_CNT, RegNo +
I + 1,
Wait);
1809 if (!FoundAliasingStore)
1810 ScoreBrackets.determineWait(LOAD_CNT, RegNo,
Wait);
1811 if (Memop->isStore()) {
1812 ScoreBrackets.determineWait(EXP_CNT, RegNo,
Wait);
1817 for (
unsigned I = 0,
E =
MI.getNumOperands();
I !=
E; ++
I) {
1823 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
1828 const bool IsVGPR =
TRI->isVectorRegister(*
MRI,
Op.getReg());
1835 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
1836 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1838 ScoreBrackets.determineWait(LOAD_CNT, RegNo,
Wait);
1839 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo,
Wait);
1840 ScoreBrackets.determineWait(BVH_CNT, RegNo,
Wait);
1841 ScoreBrackets.clearVgprVmemTypes(RegNo);
1843 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1844 ScoreBrackets.determineWait(EXP_CNT, RegNo,
Wait);
1846 ScoreBrackets.determineWait(DS_CNT, RegNo,
Wait);
1848 ScoreBrackets.determineWait(SmemAccessCounter, RegNo,
Wait);
1859 if (
MI.getOpcode() == AMDGPU::S_BARRIER &&
1860 !
ST->hasAutoWaitcntBeforeBarrier() && !
ST->supportsBackOffBarrier()) {
1861 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
1868 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1874 ScoreBrackets.simplifyWaitcnt(
Wait);
1876 if (ForceEmitZeroWaitcnts)
1877 Wait = WCG->getAllZeroWaitcnt(
false);
1879 if (ForceEmitWaitcnt[LOAD_CNT])
1881 if (ForceEmitWaitcnt[EXP_CNT])
1883 if (ForceEmitWaitcnt[DS_CNT])
1885 if (ForceEmitWaitcnt[SAMPLE_CNT])
1887 if (ForceEmitWaitcnt[BVH_CNT])
1889 if (ForceEmitWaitcnt[KM_CNT])
1893 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1895 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1897 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1901 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
1908 WaitcntBrackets &ScoreBrackets,
1912 unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
1913 unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
1914 unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
1916 if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
1919 if (LoadCntPending != 0)
1921 if (SampleCntPending != 0)
1923 if (BvhCntPending != 0)
1926 return generateWaitcnt(
Wait,
Block.instr_end(),
Block, ScoreBrackets,
1933 WaitcntBrackets &ScoreBrackets,
1937 if (OldWaitcntInstr)
1941 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
1945 ScoreBrackets.applyWaitcnt(
Wait);
1948 if (
Wait.ExpCnt != ~0u && It !=
Block.instr_end() &&
1951 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1959 <<
"Update Instr: " << *It);
1962 if (WCG->createNewWaitcnt(
Block, It,
Wait))
1971bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(
const MachineInstr &
MI)
const {
1979 if (
MI.memoperands_empty())
1988 unsigned AS = Memop->getAddrSpace();
1999bool SIInsertWaitcnts::mayAccessLDSThroughFlat(
const MachineInstr &
MI)
const {
2003 if (!
TII->usesLGKM_CNT(
MI))
2007 if (
ST->isTgSplitEnabled())
2012 if (
MI.memoperands_empty())
2017 unsigned AS = Memop->getAddrSpace();
2027bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2032 if (
TII->isFLATScratch(
MI))
2036 if (
TII->isFLATGlobal(
MI))
2041 if (
MI.memoperands_empty())
2046 unsigned AS = Memop->getAddrSpace();
2047 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2053 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2054 Opc == AMDGPU::GLOBAL_WBINV;
2057void SIInsertWaitcnts::updateEventWaitcntAfter(
MachineInstr &Inst,
2058 WaitcntBrackets *ScoreBrackets) {
2064 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2066 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2067 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_ACCESS, Inst);
2068 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, GDS_GPR_LOCK, Inst);
2070 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2072 }
else if (
TII->isFLAT(Inst)) {
2079 int FlatASCount = 0;
2081 if (mayAccessVMEMThroughFlat(Inst)) {
2083 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2087 if (mayAccessLDSThroughFlat(Inst)) {
2089 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, LDS_ACCESS, Inst);
2098 if (FlatASCount > 1)
2099 ScoreBrackets->setPendingFlat();
2102 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, getVmemWaitEventType(Inst),
2105 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2107 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, VMW_GPR_LOCK, Inst);
2109 }
else if (
TII->isSMRD(Inst)) {
2110 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2111 }
else if (Inst.
isCall()) {
2114 ScoreBrackets->applyWaitcnt(
2115 WCG->getAllZeroWaitcnt(
false));
2116 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2122 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_LDS_ACCESS, Inst);
2123 }
else if (
TII->isVINTERP(Inst)) {
2124 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2125 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2127 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2129 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_PARAM_ACCESS, Inst);
2131 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_POS_ACCESS, Inst);
2133 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, EXP_GPR_LOCK, Inst);
2136 case AMDGPU::S_SENDMSG:
2137 case AMDGPU::S_SENDMSG_RTN_B32:
2138 case AMDGPU::S_SENDMSG_RTN_B64:
2139 case AMDGPU::S_SENDMSGHALT:
2140 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SQ_MESSAGE, Inst);
2142 case AMDGPU::S_MEMTIME:
2143 case AMDGPU::S_MEMREALTIME:
2144 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2145 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2146 case AMDGPU::S_BARRIER_LEAVE:
2147 case AMDGPU::S_GET_BARRIER_STATE_M0:
2148 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2149 ScoreBrackets->updateByEvent(
TII,
TRI,
MRI, SMEM_ACCESS, Inst);
2155bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2156 unsigned OtherScore) {
2157 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2158 unsigned OtherShifted =
2159 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2160 Score = std::max(MyShifted, OtherShifted);
2161 return OtherShifted > MyShifted;
2169bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
2170 bool StrictDom =
false;
2172 VgprUB = std::max(VgprUB,
Other.VgprUB);
2173 SgprUB = std::max(SgprUB,
Other.SgprUB);
2175 for (
auto T : inst_counter_types(MaxCounter)) {
2177 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[
T];
2178 const unsigned OtherEvents =
Other.PendingEvents & WaitEventMaskForInst[
T];
2179 if (OtherEvents & ~OldEvents)
2181 PendingEvents |= OtherEvents;
2184 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2185 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2186 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
2187 if (NewUB < ScoreLBs[
T])
2191 M.OldLB = ScoreLBs[
T];
2192 M.OtherLB =
Other.ScoreLBs[
T];
2193 M.MyShift = NewUB - ScoreUBs[
T];
2194 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2196 ScoreUBs[
T] = NewUB;
2198 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
2200 for (
int J = 0; J <= VgprUB; J++)
2201 StrictDom |= mergeScore(M, VgprScores[
T][J],
Other.VgprScores[
T][J]);
2203 if (
T == SmemAccessCounter) {
2204 for (
int J = 0; J <= SgprUB; J++)
2205 StrictDom |= mergeScore(M, SgprScores[J],
Other.SgprScores[J]);
2209 for (
int J = 0; J <= VgprUB; J++) {
2210 unsigned char NewVmemTypes = VgprVmemTypes[J] |
Other.VgprVmemTypes[J];
2211 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2212 VgprVmemTypes[J] = NewVmemTypes;
2220 return Opcode == AMDGPU::S_WAITCNT ||
2223 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2224 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2231 WaitcntBrackets &ScoreBrackets) {
2235 dbgs() <<
"*** Block" <<
Block.getNumber() <<
" ***";
2236 ScoreBrackets.dump();
2242 bool VCCZCorrect =
true;
2243 if (
ST->hasReadVCCZBug()) {
2246 VCCZCorrect =
false;
2247 }
else if (!
ST->partialVCCWritesUpdateVCCZ()) {
2250 VCCZCorrect =
false;
2264 if (!OldWaitcntInstr)
2265 OldWaitcntInstr = &Inst;
2270 bool FlushVmCnt =
Block.getFirstTerminator() == Inst &&
2271 isPreheaderToFlush(
Block, ScoreBrackets);
2274 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2276 OldWaitcntInstr =
nullptr;
2279 bool RestoreVCCZ = !VCCZCorrect &&
readsVCCZ(Inst);
2282 if (
ST->hasReadVCCZBug() || !
ST->partialVCCWritesUpdateVCCZ()) {
2286 if (!
ST->partialVCCWritesUpdateVCCZ())
2287 VCCZCorrect =
false;
2296 if (
ST->hasReadVCCZBug() &&
2297 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2300 VCCZCorrect =
false;
2308 if (
TII->isSMRD(Inst)) {
2312 if (!Memop->isInvariant()) {
2313 const Value *
Ptr = Memop->getValue();
2317 if (
ST->hasReadVCCZBug()) {
2319 VCCZCorrect =
false;
2323 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2329 if (RequireCheckResourceType(Inst, context)) {
2331 ScoreBrackets->setScoreLB(LOAD_CNT,
2332 ScoreBrackets->getScoreUB(LOAD_CNT));
2338 ScoreBrackets.dump();
2348 TII->get(
ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2358 if (
Block.getFirstTerminator() ==
Block.end() &&
2359 isPreheaderToFlush(
Block, ScoreBrackets))
2360 Modified |= generateWaitcntBlockEnd(
Block, ScoreBrackets, OldWaitcntInstr);
2368 WaitcntBrackets &ScoreBrackets) {
2369 auto [Iterator, IsInserted] = PreheadersToFlush.
try_emplace(&
MBB,
false);
2371 return Iterator->second;
2382 shouldFlushVmCnt(
Loop, ScoreBrackets)) {
2383 Iterator->second =
true;
2390bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
2404 WaitcntBrackets &Brackets) {
2405 bool HasVMemLoad =
false;
2406 bool HasVMemStore =
false;
2407 bool UsesVgprLoadedOutside =
false;
2413 if (isVMEMOrFlatVMEM(
MI)) {
2417 HasVMemStore =
true;
2419 for (
unsigned I = 0;
I <
MI.getNumOperands();
I++) {
2421 if (!
Op.isReg() || !
TRI->isVectorRegister(*
MRI,
Op.getReg()))
2434 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2435 Brackets.getScoreLB(LOAD_CNT) ||
2436 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2437 Brackets.getScoreLB(SAMPLE_CNT) ||
2438 Brackets.getRegScore(RegNo, BVH_CNT) >
2439 Brackets.getScoreLB(BVH_CNT)) {
2440 UsesVgprLoadedOutside =
true;
2446 else if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad() &&
Op.isDef())
2457 if (!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2459 return HasVMemLoad && UsesVgprLoadedOutside;
2464 TII =
ST->getInstrInfo();
2465 TRI = &
TII->getRegisterInfo();
2468 MLI = &getAnalysis<MachineLoopInfo>();
2469 PDT = &getAnalysis<MachinePostDominatorTree>();
2470 if (
auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2471 AA = &AAR->getAAResults();
2475 if (
ST->hasExtendedWaitCounts()) {
2476 MaxCounter = NUM_EXTENDED_INST_CNTS;
2477 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
2478 WCG = &WCGGFX12Plus;
2480 MaxCounter = NUM_NORMAL_INST_CNTS;
2481 WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
2486 for (
auto T : inst_counter_types())
2487 ForceEmitWaitcnt[
T] =
false;
2489 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2491 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2496 HardwareLimits Limits = {};
2497 if (
ST->hasExtendedWaitCounts()) {
2510 unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs();
2511 unsigned NumSGPRsMax =
ST->getAddressableNumSGPRs();
2512 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2513 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2515 RegisterEncoding Encoding = {};
2518 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2521 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2537 I !=
E && (
I->isPHI() ||
I->isMetaInstruction()); ++
I)
2540 if (
ST->hasExtendedWaitCounts()) {
2543 for (
auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2544 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2548 TII->get(instrsForExtendedCounterTypes[CT]))
2555 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2556 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2558 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2559 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2569 std::unique_ptr<WaitcntBrackets> Brackets;
2574 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
2577 BlockInfo &BI = BII->second;
2583 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2585 *Brackets = *BI.Incoming;
2588 Brackets = std::make_unique<WaitcntBrackets>(
2589 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2592 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2593 WaitEventMaskForInst, SmemAccessCounter);
2596 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
2599 if (Brackets->hasPendingEvent()) {
2600 BlockInfo *MoveBracketsToSucc =
nullptr;
2602 auto SuccBII = BlockInfos.
find(Succ);
2603 BlockInfo &SuccBI = SuccBII->second;
2604 if (!SuccBI.Incoming) {
2605 SuccBI.Dirty =
true;
2608 if (!MoveBracketsToSucc) {
2609 MoveBracketsToSucc = &SuccBI;
2611 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2613 }
else if (SuccBI.Incoming->merge(*Brackets)) {
2614 SuccBI.Dirty =
true;
2619 if (MoveBracketsToSucc)
2620 MoveBracketsToSucc->Incoming = std::move(Brackets);
2625 if (
ST->hasScalarStores()) {
2627 bool HaveScalarStores =
false;
2631 if (!HaveScalarStores &&
TII->isScalarStore(
MI))
2632 HaveScalarStores =
true;
2634 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
2635 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2640 if (HaveScalarStores) {
2650 bool SeenDCacheWB =
false;
2654 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
2655 SeenDCacheWB =
true;
2656 else if (
TII->isScalarStore(*
I))
2657 SeenDCacheWB =
false;
2660 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
2661 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2674 if (
ST->requiresNopBeforeDeallocVGPRs()) {
2682 ReleaseVGPRInsts.clear();
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
std::optional< std::vector< StOtherPiece > > Other
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
static bool isCacheInvOrWBInst(MachineInstr &Inst)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is waiting on.
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Provides some synthesis utilities to produce sequences of values.
static const uint32_t IV[8]
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isEntryFunction() const
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
static bool shouldExecute(unsigned CounterName)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
FunctionPass class - This class is used to implement most global optimizations.
bool hasOptNone() const
Do not optimize this function (-O0).
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Represents a single loop in the control flow graph.
const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
unsigned getNumOperands() const
Retuns the total number of operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr fully defines the specified register.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-dominator tree for MachineFunctions.
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isVIMAGE(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
unsigned getStorecntBitMask(const IsaVersion &Version)
IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
char & SIInsertWaitcntsID
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are taken from the machine instruction.
static constexpr bool is_iterable