#define DEBUG_TYPE "si-wqm"
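
// This pass computes where the GPU must run in Whole Quad Mode (WQM, all
// lanes of each pixel quad enabled so implicit derivatives work), in whole
// wave mode (WWM), or in exact mode (only the truly live lanes), and inserts
// the exec-mask manipulation needed to switch between these states. The
// state flags below combine as a bitmask; StateStrict groups both strict
// modes: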
  StateStrict = StateStrictWWM | StateStrictWQM,
  explicit PrintState(int State) : State(State) {}

  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
  char MarkedStates = 0;

  char InitialState = 0;
  bool NeedsLowering = false;
class SIWholeQuadMode {
  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR,
                VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag,
                std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op,
                   char Flag, std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);

  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);
  StringRef getPassName() const override { return "SI Whole Quad Mode"; }
char SIWholeQuadModeLegacy::ID = 0;

  return new SIWholeQuadModeLegacy;
  for (const auto &BII : Blocks) {
    dbgs() << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
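
// Mark an instruction as needing the states in Flag (WQM and/or a strict
// mode), masking out any states that are disabled for it, and requeue it so
// its uses are propagated on a later worklist iteration.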
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  Flag &= ~II.Disabled;

  if ((II.Needs & Flag) == Flag)
    return;

  Worklist.emplace_back(&MI);
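
// Mark the instructions defining the lanes of VRegOrUnit that are read by
// UseMI. The live range is walked backwards, and PHI defs are followed into
// each predecessor (via PhiStack) until every lane in UseLanes is covered by
// a marked definition.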
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               VirtRegOrUnit VRegOrUnit, unsigned SubReg,
                               char Flag, std::vector<WorkItem> &Worklist) {

  const LaneBitmask UseLanes =

    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}

  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;

  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0;

    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Value->isPHIDef()) {

      assert(MBB && "Phi-def has no defining MBB");

      unsigned Idx = NextPredIdx;

      for (; PI != PE && !NextValue; ++PI, ++Idx) {

        if (!Visited.count(VisitKey(VN, DefinedLanes)))

      assert(MI && "Def has no defining instruction");

      for (const MachineOperand &Op : MI->all_defs()) {

        LaneBitmask OpLanes =
            Op.isUndef() ? LaneBitmask::getAll()
                         : TRI->getSubRegIndexLaneMask(Op.getSubReg());
        LaneBitmask Overlap = (UseLanes & OpLanes);

        HasDef |= Overlap.any();

        DefinedLanes |= OpLanes;

      if ((DefinedLanes & UseLanes) != UseLanes) {

        if (const VNInfo *VN = LRQ.valueIn()) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))

        markInstruction(*MI, Flag, Worklist);

      markInstruction(*MI, Flag, Worklist);

    if (!NextValue && !PhiStack.empty()) {

      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
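
// Mark the defs reachable from one use operand. Uses of EXEC itself are
// ignored; virtual registers are traced through their live interval, other
// physical registers per register unit.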
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {

  case AMDGPU::EXEC_LO:

    markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist);

    markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
             Worklist);
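
// Mark all defs feeding any use operand of MI as needing the given state.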
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.all_uses())
    markOperand(MI, Use, Flag, Worklist);
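
// Single pass over all instructions: seed the per-instruction and per-block
// needs (WQM for sampling, StrictWWM/StrictWQM for wave-wide operations,
// Exact for WQM-disabling stores), remember pseudos that must be lowered
// later, and return the union of all states seen as GlobalFlags.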
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;

  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =

  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();

      if (TII->isWQM(Opcode)) {

        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {

          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;

      } else if (Opcode == AMDGPU::WQM) {

        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);

      } else if (Opcode == AMDGPU::STRICT_WWM) {

        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;

      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {

        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {

          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);

          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;

      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {

        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {

        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(4);
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);

        BBI.NeedsLowering = true;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);

        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {

      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {

        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {

      } else if (WQMOutputs) {

        for (const MachineOperand &MO : MI.defs()) {

              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {

          markInstruction(MI, Flags, Worklist);
          GlobalFlags |= Flags;

  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
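
// Re-examine one instruction from the worklist: promote terminators and
// VM_CNT stores whose results feed WQM code to WQM themselves, fold the
// instruction's needs into its block, propagate OutNeeds backwards to the
// previous instruction, and mark the defs of its uses.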
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI];
  BlockInfo &BI = Blocks[MBB];

  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;

  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);

  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);

  markInstructionUses(MI, II.Needs, Worklist);

  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
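
// Propagate a block's InNeeds/OutNeeds across the CFG: into its last
// instruction, into predecessors (which must produce what we need on entry),
// and into successors (which inherit what we need on exit).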
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB];

    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.emplace_back(LastMI);

    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);

    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
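
// Fixpoint driver: scan once, then drain the worklist, where each item is
// either an instruction or a whole block to re-propagate.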
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Restore =
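
// Split the block after TermMI and rewrite the mask-manipulation instruction
// into its *_term variant so it is legal in the terminator position; the
// (post)dominator trees are updated incrementally via DTUpdates.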
void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
  MachineBasicBlock *BB = TermMI->getParent();

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, true, LIS);

  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  case AMDGPU::S_ANDN2_B32:
    NewOpcode = AMDGPU::S_ANDN2_B32_term;
    break;
  case AMDGPU::S_ANDN2_B64:
    NewOpcode = AMDGPU::S_ANDN2_B64_term;
    break;

  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, BB, Succ});

  DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
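
// Lower SI_KILL_F32_COND_IMM_TERMINATOR: the V_CMP opcode is chosen inverted
// relative to the kill condition (e.g. an equality kill lowers to V_CMP_LG),
// so the compare computes the surviving lanes; then the live-mask update,
// early-terminate check, and exec-mask update are built from it.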
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {

  switch (MI.getOperand(2).getImm()) {

    Opcode = AMDGPU::V_CMP_LG_F32_e64;

    Opcode = AMDGPU::V_CMP_GE_F32_e64;

    Opcode = AMDGPU::V_CMP_GT_F32_e64;

    Opcode = AMDGPU::V_CMP_LE_F32_e64;

    Opcode = AMDGPU::V_CMP_LT_F32_e64;

    Opcode = AMDGPU::V_CMP_EQ_F32_e64;

    Opcode = AMDGPU::V_CMP_O_F32_e64;

    Opcode = AMDGPU::V_CMP_U_F32_e64;

    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;

    Opcode = AMDGPU::V_CMP_NLT_F32_e64;

    Opcode = AMDGPU::V_CMP_NLE_F32_e64;

    Opcode = AMDGPU::V_CMP_NGT_F32_e64;

    Opcode = AMDGPU::V_CMP_NGE_F32_e64;

    Opcode = AMDGPU::V_CMP_NLG_F32_e64;

  MachineBasicBlock &MBB = *MI.getParent();

  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);

  MachineInstr *MaskUpdateMI =

  MachineInstr *EarlyTermMI =

  MachineInstr *ExecMaskMI =
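
// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1. Demote only disables the lanes
// for exact-mode execution but keeps them alive as WQM helpers, which is why
// the lowering needs to know whether the block currently runs in WQM.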
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {

  MachineBasicBlock &MBB = *MI.getParent();

  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;

    if (Op.getImm() == KillVal) {

      bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
      if (!IsLastTerminator) {

    TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());

  MachineInstr *EarlyTermMI =

  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;

    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());

  } else if (!IsWQM) {

  if (ComputeKilledMaskMI)
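
// Replace the kill/demote pseudos in a block flagged NeedsLowering, replaying
// the exec-state transitions recorded by processBlock so each pseudo is
// lowered for the state it actually executes in. Lowered kills may require
// splitting the block afterwards (SplitPoints).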
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
  if (!BI.NeedsLowering)
    return;

  SmallVector<MachineInstr *, 4> SplitPoints;

  char State = BI.InitialState;

    auto MIState = StateTransition.find(&MI);
    if (MIState != StateTransition.end())
      State = MIState->second;

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MI);
      break;
    case AMDGPU::ENTER_STRICT_WWM:
      ActiveLanesReg = MI.getOperand(0).getReg();
      break;
    case AMDGPU::EXIT_STRICT_WWM:

    case AMDGPU::V_SET_INACTIVE_B32:
      if (ActiveLanesReg) {
        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);

      assert(State == StateExact || State == StateWQM);

  for (MachineInstr *MI : SplitPoints)
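
// What follows is part of prepareInsertion(): it scans SlotIndexes between
// the first and last candidate points for an exec-mask transition, avoiding
// positions where SCC is live when SaveSCC is requested, and stepping over
// instructions that already define EXEC.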
  SlotIndex FirstIdx = FirstNonDbg != MBBE

  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

    if (Next < FirstIdx)

  assert(EndMI && "Segment does not end on valid instruction");

  bool IsExecDef = false;
  for (const MachineOperand &MO : MBBI->all_defs()) {
    IsExecDef |=
        MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
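
// Enter exact mode by ANDing exec with the live mask; when the old exec must
// be restored later it is saved via the save-exec form of the AND.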
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,

  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {

    if (FirstTerm != MBB.end()) {

      IsTerminator = BeforeIdx > FirstTermIdx;

  StateTransition[MI] = StateExact;
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,

  StateTransition[MI] = StateWQM;
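
// Enter StrictWWM or StrictWQM: the current exec is saved in SaveOrig, then
// every lane of the wave (WWM) or every lane of each live quad (WQM) is
// enabled.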
void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {

  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {

  StateTransition[MI] = StrictStateNeeded;
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {

  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {

  StateTransition[MI] = NonStrictState;
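
// The core state machine: walk MBB once, tracking the current exec state and
// the state each instruction needs, and insert transitions (toExact/toWQM/
// toStrictMode/fromStrictMode) at positions chosen by prepareInsertion.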
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
                                   bool IsEntry) {

  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;

  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  if (II != IE && II->getOpcode() == AMDGPU::COPY &&
      II->getOperand(1).getReg() == LMC.ExecReg)

  BI.InitialState = State;

  for (unsigned Idx = 0;; ++Idx) {

    char Needs = StateExact | StateWQM;

      if (FirstStrict == IE)

    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))

    MachineInstr &MI = *II;

    if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        if (III->second.Needs & StateStrictWWM)
          Needs = StateStrictWWM;
        else if (III->second.Needs & StateStrictWQM)
          Needs = StateStrictWQM;
        else if (III->second.Needs & StateWQM)
          Needs = StateWQM;

        Needs &= ~III->second.Disabled;
        OutNeeds = III->second.OutNeeds;

      Needs = StateExact | StateWQM | StateStrict;

    if (MI.isBranch() && OutNeeds == StateExact)

      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;

    if (!(Needs & State)) {

      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {

        First = FirstStrict;

      bool SaveSCC = false;
      switch (State) {
      case StateStrictWWM:
      case StateStrictWQM:

        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);

        SaveSCC = !(Needs & StateWQM);

      char StartState = State & StateStrict ? NonStrictState : State;
      bool WQMToExact =
          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
                        !(Needs & StateExact);
      bool PreferLast = Needs == StateWQM;

      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {

        if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
          PreferLast = WQMToExact;

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        SavedNonStrictReg = 0;
        State = NonStrictState;

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);

      if (WQMToExact) {
        if (!WQMFromExec && (OutNeeds & StateWQM)) {
          SavedWQMReg = MRI->createVirtualRegister(BoolRC);

        toExact(MBB, Before, SavedWQMReg);

      } else if (ExactToWQM) {
        assert(WQMFromExec == (SavedWQMReg == 0));

        toWQM(MBB, Before, SavedWQMReg);

      if (Needs != (StateExact | StateWQM | StateStrict)) {
        if (Needs != (StateExact | StateWQM))

  assert(!SavedNonStrictReg);
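
// Replace SI_PS_LIVE / SI_LIVE_MASK queries with a copy of the live mask.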
bool SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {

    MachineInstr *Copy =

    MI->eraseFromParent();

  return !LiveMaskQueries.empty();
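
// Turn the WQM/SOFT_WQM/STRICT_* wrapper pseudos into plain COPYs (or V_MOVs
// when a VGPR move is required), dropping their implicit EXEC uses.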
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));

      if (MI->getOperand(0).isEarlyClobber()) {

        MI->getOperand(0).setIsEarlyClobber(false);

      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);

      MI->setDesc(TII->get(AMDGPU::COPY));

  for (MachineInstr *MI : LowerToCopyInstrs) {

    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      LiveInterval *RecomputeLI = nullptr;
      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);

    assert(MI->getNumExplicitOperands() == 2);

    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));

  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
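
// Lower any collected kill pseudos outside of full per-block lowering;
// IsWQM says whether the function runs in WQM. Each lowered kill may still
// force a block split.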
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MI);
      break;

  return !KillInstrs.empty();
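
// Expand the SI_INIT_EXEC family: set exec to an immediate mask, derive it
// from an input SGPR (S_BFE_U32 extracts the wave's thread count, which is
// then compared against the full-wave case), or set up whole-wave functions
// while preserving the entry exec mask.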
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();

  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
    assert(MBB == &MBB->getParent()->front() &&
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());

    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

    MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {

        .addImm(MI.getOperand(0).getImm());

    MI.eraseFromParent();

  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();

  MachineInstr *DefInstr = MRI->getVRegDef(InputReg);

  if (DefInstr != FirstMI) {

  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)

                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);

  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)

  MI.eraseFromParent();

  MI.eraseFromParent();
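
// Find the insertion point after all SI_INIT_EXEC-style instructions in the
// entry block and lower each of them.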
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {

  for (MachineInstr *MI : InitExecInstrs) {

    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());
bool SIWholeQuadMode::run(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");

  Instructions.clear();

  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();

  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();

  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());

  for (MachineInstr *MI : SetInactiveInstrs) {
    if (LowerToCopyInstrs.contains(MI))
      continue;

    auto &Info = Instructions[MI];
    if (Info.MarkedStates & StateStrict) {
      Info.Needs |= StateStrictWWM;
      Info.Disabled &= ~StateStrictWWM;
      Blocks[MI->getParent()].Needs |= StateStrictWWM;

      LowerToCopyInstrs.insert(MI);

  Changed |= lowerLiveMaskQueries();

  if (!HasWaveModes) {

    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {

    lowerKillInstrs(true);
  } else {
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;

    for (auto &BII : Blocks)
      processBlock(*BII.first, BII.second, BII.first == &Entry);

    for (auto &BII : Blocks)
      lowerBlock(*BII.first, BII.second);

  if (LiveMaskReg != LMC.ExecReg)

  if (!KillInstrs.empty() || !InitExecInstrs.empty())
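
// Legacy pass-manager wrapper: fetch LiveIntervals plus the optional
// (post)dominator trees, then delegate to the shared implementation.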
bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  MachinePostDominatorTree *PDT =
      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  return Impl.run(MF);

  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);