33#define DEBUG_TYPE "si-peephole-sdwa"
35STATISTIC(NumSDWAPatternsFound,
"Number of SDWA patterns found.");
37 "Number of instruction converted to SDWA.");
56 SDWAOperandsMap PotentialMatches;
67 bool convertToSDWA(
MachineInstr &
MI,
const SDWAOperandsVector &SDWAOperands);
// Default constructor: forwards the pass's static ID to the
// MachineFunctionPass base and performs no other work.
78 SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}
// Returns the human-readable name of this pass, the fixed string
// "SI Peephole SDWA".
80 StringRef getPassName()
const override {
return "SI Peephole SDWA"; }
82 bool runOnMachineFunction(MachineFunction &MF)
override;
84 void getAnalysisUsage(AnalysisUsage &AU)
const override {
94 MachineOperand *Target;
95 MachineOperand *Replaced;
99 virtual bool canCombineSelections(
const MachineInstr &
MI,
100 const SIInstrInfo *
TII) = 0;
103 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
104 : Target(TargetOp), Replaced(ReplacedOp) {
106 assert(Replaced->isReg());
// Virtual, compiler-generated destructor. SDWAOperand is handled through
// std::unique_ptr<SDWAOperand> while the concrete objects are derived operand
// kinds, so destruction has to dispatch virtually.
109 virtual ~SDWAOperand() =
default;
111 virtual MachineInstr *potentialToConvert(
const SIInstrInfo *
TII,
112 const GCNSubtarget &ST,
113 SDWAOperandsMap *PotentialMatches =
nullptr) = 0;
114 virtual bool convertToSDWA(MachineInstr &
MI,
const SIInstrInfo *
TII) = 0;
// Returns the Target operand pointer that was supplied to the constructor.
116 MachineOperand *getTargetOperand()
const {
return Target; }
// Returns the Replaced operand pointer that was supplied to the constructor
// (the constructor asserts that it is a register operand).
117 MachineOperand *getReplacedOperand()
const {
return Replaced; }
// Returns the machine instruction that the Target operand belongs to.
118 MachineInstr *getParentInst()
const {
return Target->getParent(); }
120 MachineRegisterInfo *getMRI()
const {
121 return &getParentInst()->getMF()->getRegInfo();
124#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
125 virtual void print(raw_ostream& OS)
const = 0;
130class SDWASrcOperand :
public SDWAOperand {
138 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
139 SdwaSel SrcSel_ =
DWORD,
bool Abs_ =
false,
bool Neg_ =
false,
141 : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
142 Neg(Neg_), Sext(Sext_) {}
144 MachineInstr *potentialToConvert(
const SIInstrInfo *
TII,
145 const GCNSubtarget &ST,
146 SDWAOperandsMap *PotentialMatches =
nullptr)
override;
147 bool convertToSDWA(MachineInstr &
MI,
const SIInstrInfo *
TII)
override;
148 bool canCombineSelections(
const MachineInstr &
MI,
149 const SIInstrInfo *
TII)
override;
// Returns the source selection (SrcSel) stored for this source operand.
151 SdwaSel getSrcSel()
const {
return SrcSel; }
// Returns the stored Abs flag.
// NOTE(review): presumably the "absolute value" source modifier - confirm.
152 bool getAbs()
const {
return Abs; }
// Returns the stored Neg flag.
// NOTE(review): presumably the "negate" source modifier - confirm.
153 bool getNeg()
const {
return Neg; }
// Returns the stored Sext flag.
// NOTE(review): presumably requests sign-extension of the selected source
// bits - confirm.
154 bool getSext()
const {
return Sext; }
156 uint64_t getSrcMods(
const SIInstrInfo *
TII,
157 const MachineOperand *SrcOp)
const;
159#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
160 void print(raw_ostream& OS)
const override;
164class SDWADstOperand :
public SDWAOperand {
170 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
172 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
174 MachineInstr *potentialToConvert(
const SIInstrInfo *
TII,
175 const GCNSubtarget &ST,
176 SDWAOperandsMap *PotentialMatches =
nullptr)
override;
177 bool convertToSDWA(MachineInstr &
MI,
const SIInstrInfo *
TII)
override;
178 bool canCombineSelections(
const MachineInstr &
MI,
179 const SIInstrInfo *
TII)
override;
// Returns the destination selection (DstSel) stored for this operand.
181 SdwaSel getDstSel()
const {
return DstSel; }
// Returns the stored dst_unused setting (DstUn) for this operand.
182 DstUnused getDstUnused()
const {
return DstUn; }
184#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
185 void print(raw_ostream& OS)
const override;
189class SDWADstPreserveOperand :
public SDWADstOperand {
191 MachineOperand *Preserve;
194 SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
197 Preserve(PreserveOp) {}
199 bool convertToSDWA(MachineInstr &
MI,
const SIInstrInfo *
TII)
override;
200 bool canCombineSelections(
const MachineInstr &
MI,
201 const SIInstrInfo *
TII)
override;
// Returns the Preserve operand pointer that was supplied to the constructor.
203 MachineOperand *getPreservedOperand()
const {
return Preserve; }
205#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
206 void print(raw_ostream& OS)
const override;
// Out-of-class definition of the legacy pass's static ID member, initialised
// to 0. The constructor hands this ID to the MachineFunctionPass base.
215char SIPeepholeSDWALegacy::
ID = 0;
220 return new SIPeepholeSDWALegacy();
223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
226 case BYTE_0: OS <<
"BYTE_0";
break;
227 case BYTE_1: OS <<
"BYTE_1";
break;
228 case BYTE_2: OS <<
"BYTE_2";
break;
229 case BYTE_3: OS <<
"BYTE_3";
break;
230 case WORD_0: OS <<
"WORD_0";
break;
231 case WORD_1: OS <<
"WORD_1";
break;
232 case DWORD: OS <<
"DWORD";
break;
248 OS <<
"SDWA src: " << *getTargetOperand()
249 <<
" src_sel:" << getSrcSel()
250 <<
" abs:" << getAbs() <<
" neg:" << getNeg()
251 <<
" sext:" << getSext() <<
'\n';
255void SDWADstOperand::print(raw_ostream& OS)
const {
256 OS <<
"SDWA dst: " << *getTargetOperand()
257 <<
" dst_sel:" << getDstSel()
258 <<
" dst_unused:" << getDstUnused() <<
'\n';
262void SDWADstPreserveOperand::print(raw_ostream& OS)
const {
263 OS <<
"SDWA preserve dst: " << *getTargetOperand()
264 <<
" dst_sel:" << getDstSel()
265 <<
" preserve:" << *getPreservedOperand() <<
'\n';
283 return LHS.isReg() &&
285 LHS.getReg() ==
RHS.getReg() &&
286 LHS.getSubReg() ==
RHS.getSubReg();
291 if (!
Reg->isReg() || !
Reg->isDef())
312 if (Sel == SdwaSel::DWORD)
315 if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
318 if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
319 Sel == SdwaSel::BYTE_3)
322 if (OperandSel == SdwaSel::WORD_0)
325 if (OperandSel == SdwaSel::WORD_1) {
326 if (Sel == SdwaSel::BYTE_0)
327 return SdwaSel::BYTE_2;
328 if (Sel == SdwaSel::BYTE_1)
329 return SdwaSel::BYTE_3;
330 if (Sel == SdwaSel::WORD_0)
331 return SdwaSel::WORD_1;
337uint64_t SDWASrcOperand::getSrcMods(
const SIInstrInfo *
TII,
338 const MachineOperand *SrcOp)
const {
341 if (
TII->getNamedOperand(*
MI, AMDGPU::OpName::src0) == SrcOp) {
342 if (
auto *
Mod =
TII->getNamedOperand(*
MI, AMDGPU::OpName::src0_modifiers)) {
343 Mods =
Mod->getImm();
345 }
else if (
TII->getNamedOperand(*
MI, AMDGPU::OpName::src1) == SrcOp) {
346 if (
auto *
Mod =
TII->getNamedOperand(*
MI, AMDGPU::OpName::src1_modifiers)) {
347 Mods =
Mod->getImm();
352 "Float and integer src modifiers can't be set simultaneously");
362MachineInstr *SDWASrcOperand::potentialToConvert(
const SIInstrInfo *
TII,
363 const GCNSubtarget &ST,
364 SDWAOperandsMap *PotentialMatches) {
365 if (PotentialMatches !=
nullptr) {
367 MachineOperand *
Reg = getReplacedOperand();
368 if (!
Reg->isReg() || !
Reg->isDef())
371 for (MachineInstr &
UseMI : getMRI()->use_nodbg_instructions(
Reg->getReg()))
373 if (!isConvertibleToSDWA(
UseMI, ST,
TII) ||
379 for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(
Reg->getReg())) {
383 SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
384 MachineInstr *
UseMI = UseMO.getParent();
385 potentialMatchesMap[
UseMI].push_back(
this);
392 MachineOperand *PotentialMO =
findSingleRegUse(getReplacedOperand(), getMRI());
396 MachineInstr *Parent = PotentialMO->
getParent();
398 return canCombineSelections(*Parent,
TII) ? Parent :
nullptr;
401bool SDWASrcOperand::convertToSDWA(MachineInstr &
MI,
const SIInstrInfo *
TII) {
402 assert((!Sext || !
TII->getSubtarget().zeroesHigh16BitsOfDest(
404 "Cannot use sign-extension with instruction that zeroes high bits");
405 switch (
MI.getOpcode()) {
406 case AMDGPU::V_CVT_F32_FP8_sdwa:
407 case AMDGPU::V_CVT_F32_BF8_sdwa:
408 case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
409 case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
412 case AMDGPU::V_CNDMASK_B32_sdwa:
431 bool IsPreserveSrc =
false;
432 MachineOperand *Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
433 MachineOperand *SrcSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0_sel);
434 MachineOperand *SrcMods =
435 TII->getNamedOperand(
MI, AMDGPU::OpName::src0_modifiers);
436 assert(Src && (Src->isReg() || Src->isImm()));
437 if (!
isSameReg(*Src, *getReplacedOperand())) {
439 Src =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
440 SrcSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1_sel);
441 SrcMods =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1_modifiers);
444 !
isSameReg(*Src, *getReplacedOperand())) {
451 MachineOperand *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
453 TII->getNamedOperand(
MI, AMDGPU::OpName::dst_unused);
456 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
462 TII->getNamedImmOperand(
MI, AMDGPU::OpName::dst_sel));
463 if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
464 getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
465 IsPreserveSrc =
true;
466 auto DstIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
467 AMDGPU::OpName::vdst);
468 auto TiedIdx =
MI.findTiedOperandIdx(DstIdx);
469 Src = &
MI.getOperand(TiedIdx);
478 assert(Src && Src->isReg());
480 if ((
MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
481 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
482 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
483 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
484 !
isSameReg(*Src, *getReplacedOperand())) {
491 (IsPreserveSrc || (SrcSel && SrcMods)));
494 if (!IsPreserveSrc) {
499 getTargetOperand()->setIsKill(
false);
506 AMDGPU::OpName SrcSelOpName,
SdwaSel OpSel) {
519 AMDGPU::OpName SrcOpName,
531bool SDWASrcOperand::canCombineSelections(
const MachineInstr &
MI,
532 const SIInstrInfo *
TII) {
533 if (!
TII->isSDWA(
MI.getOpcode()))
536 using namespace AMDGPU;
539 getReplacedOperand(), getSrcSel()) &&
541 getReplacedOperand(), getSrcSel());
544MachineInstr *SDWADstOperand::potentialToConvert(
const SIInstrInfo *
TII,
545 const GCNSubtarget &ST,
546 SDWAOperandsMap *PotentialMatches) {
549 MachineRegisterInfo *MRI = getMRI();
550 MachineInstr *ParentMI = getParentInst();
558 if (&UseInst != ParentMI)
562 MachineInstr *Parent = PotentialMO->
getParent();
563 return canCombineSelections(*Parent,
TII) ? Parent :
nullptr;
566bool SDWADstOperand::convertToSDWA(MachineInstr &
MI,
const SIInstrInfo *
TII) {
569 if ((
MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
570 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
571 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
572 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
578 MachineOperand *Operand =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
581 isSameReg(*Operand, *getReplacedOperand()));
583 MachineOperand *DstSel=
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel);
589 MachineOperand *
DstUnused=
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_unused);
595 getParentInst()->eraseFromParent();
599bool SDWADstOperand::canCombineSelections(
const MachineInstr &
MI,
600 const SIInstrInfo *
TII) {
601 if (!
TII->isSDWA(
MI.getOpcode()))
607bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &
MI,
608 const SIInstrInfo *
TII) {
612 for (MachineOperand &MO :
MI.uses()) {
615 getMRI()->clearKillFlags(MO.getReg());
619 MI.getParent()->remove(&
MI);
620 getParentInst()->getParent()->insert(getParentInst(), &
MI);
623 MachineInstrBuilder MIB(*
MI.getMF(),
MI);
624 MIB.addReg(getPreservedOperand()->
getReg(),
625 RegState::ImplicitKill,
626 getPreservedOperand()->getSubReg());
629 MI.tieOperands(AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdst),
630 MI.getNumOperands() - 1);
633 return SDWADstOperand::convertToSDWA(
MI,
TII);
636bool SDWADstPreserveOperand::canCombineSelections(
const MachineInstr &
MI,
637 const SIInstrInfo *
TII) {
638 return SDWADstOperand::canCombineSelections(
MI,
TII);
641std::optional<int64_t>
642SIPeepholeSDWA::foldToImm(
const MachineOperand &
Op)
const {
650 for (
const MachineOperand &Def : MRI->
def_operands(
Op.getReg())) {
654 const MachineInstr *DefInst =
Def.getParent();
655 if (!
TII->isFoldableCopy(*DefInst))
658 const MachineOperand &Copied = DefInst->
getOperand(1);
669std::unique_ptr<SDWAOperand>
670SIPeepholeSDWA::matchSDWAOperand(MachineInstr &
MI) {
671 unsigned Opcode =
MI.getOpcode();
673 case AMDGPU::V_LSHRREV_B32_e32:
674 case AMDGPU::V_ASHRREV_I32_e32:
675 case AMDGPU::V_LSHLREV_B32_e32:
676 case AMDGPU::V_LSHRREV_B32_e64:
677 case AMDGPU::V_ASHRREV_I32_e64:
678 case AMDGPU::V_LSHLREV_B32_e64: {
687 MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
688 auto Imm = foldToImm(*Src0);
692 if (*Imm != 16 && *Imm != 24)
695 MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
696 MachineOperand *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
698 Dst->getReg().isPhysical())
701 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
702 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
703 return std::make_unique<SDWADstOperand>(
706 return std::make_unique<SDWASrcOperand>(
708 Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
709 Opcode != AMDGPU::V_LSHRREV_B32_e64);
713 case AMDGPU::V_LSHRREV_B16_e32:
714 case AMDGPU::V_LSHLREV_B16_e32:
715 case AMDGPU::V_LSHRREV_B16_e64:
716 case AMDGPU::V_LSHRREV_B16_opsel_e64:
717 case AMDGPU::V_LSHLREV_B16_opsel_e64:
718 case AMDGPU::V_LSHLREV_B16_e64: {
727 MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
728 auto Imm = foldToImm(*Src0);
729 if (!Imm || *Imm != 8)
732 MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
733 MachineOperand *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
736 Dst->getReg().isPhysical())
739 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
740 Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
741 Opcode == AMDGPU::V_LSHLREV_B16_e64)
743 return std::make_unique<SDWASrcOperand>(Src1, Dst,
BYTE_1,
false,
false,
748 case AMDGPU::V_BFE_I32_e64:
749 case AMDGPU::V_BFE_U32_e64: {
764 MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
765 auto Offset = foldToImm(*Src1);
769 MachineOperand *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2);
770 auto Width = foldToImm(*Src2);
776 if (*
Offset == 0 && *Width == 8)
778 else if (*
Offset == 0 && *Width == 16)
780 else if (*
Offset == 0 && *Width == 32)
782 else if (*
Offset == 8 && *Width == 8)
784 else if (*
Offset == 16 && *Width == 8)
786 else if (*
Offset == 16 && *Width == 16)
788 else if (*
Offset == 24 && *Width == 8)
793 MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
794 MachineOperand *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
797 Dst->getReg().isPhysical())
800 return std::make_unique<SDWASrcOperand>(
801 Src0, Dst, SrcSel,
false,
false, Opcode != AMDGPU::V_BFE_U32_e64);
804 case AMDGPU::V_AND_B32_e32:
805 case AMDGPU::V_AND_B32_e64: {
810 MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
811 MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
813 auto Imm = foldToImm(*Src0);
816 Imm = foldToImm(*Src1);
820 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
823 MachineOperand *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
825 if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
826 Dst->getReg().isPhysical())
829 return std::make_unique<SDWASrcOperand>(
833 case AMDGPU::V_OR_B32_e32:
834 case AMDGPU::V_OR_B32_e64: {
845 std::optional<std::pair<MachineOperand *, MachineOperand *>>;
846 auto CheckOROperandsForSDWA =
847 [&](
const MachineOperand *Op1,
const MachineOperand *Op2) -> CheckRetType {
848 if (!Op1 || !Op1->
isReg() || !Op2 || !Op2->isReg())
849 return CheckRetType(std::nullopt);
853 return CheckRetType(std::nullopt);
855 MachineInstr *Op1Inst = Op1Def->
getParent();
856 if (!
TII->isSDWA(*Op1Inst))
857 return CheckRetType(std::nullopt);
861 return CheckRetType(std::nullopt);
863 return CheckRetType(std::pair(Op1Def, Op2Def));
866 MachineOperand *OrSDWA =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
867 MachineOperand *OrOther =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
868 assert(OrSDWA && OrOther);
869 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
871 OrSDWA =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
872 OrOther =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
873 assert(OrSDWA && OrOther);
874 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
879 MachineOperand *OrSDWADef = Res->first;
880 MachineOperand *OrOtherDef = Res->second;
881 assert(OrSDWADef && OrOtherDef);
883 MachineInstr *SDWAInst = OrSDWADef->
getParent();
884 MachineInstr *OtherInst = OrOtherDef->
getParent();
906 if (!
TII->isSDWA(*OtherInst))
910 TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
912 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
914 bool DstSelAgree =
false;
917 (OtherDstSel ==
BYTE_3) ||
921 (OtherDstSel ==
BYTE_1) ||
925 (OtherDstSel ==
BYTE_2) ||
926 (OtherDstSel ==
BYTE_3) ||
930 (OtherDstSel ==
BYTE_2) ||
931 (OtherDstSel ==
BYTE_3) ||
935 (OtherDstSel ==
BYTE_1) ||
936 (OtherDstSel ==
BYTE_3) ||
940 (OtherDstSel ==
BYTE_1) ||
941 (OtherDstSel ==
BYTE_2) ||
944 default: DstSelAgree =
false;
952 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
953 if (OtherDstUnused != DstUnused::UNUSED_PAD)
957 MachineOperand *OrDst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
960 return std::make_unique<SDWADstPreserveOperand>(
961 OrDst, OrSDWADef, OrOtherDef, DstSel);
966 return std::unique_ptr<SDWAOperand>(
nullptr);
976void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &
MBB) {
977 for (MachineInstr &
MI :
MBB) {
978 if (
auto Operand = matchSDWAOperand(
MI)) {
980 SDWAOperands[&
MI] = std::move(Operand);
981 ++NumSDWAPatternsFound;
1004void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &
MI,
1005 const GCNSubtarget &ST)
const {
1006 int Opc =
MI.getOpcode();
1007 assert((
Opc == AMDGPU::V_ADD_CO_U32_e64 ||
Opc == AMDGPU::V_SUB_CO_U32_e64) &&
1008 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1011 if (!
TII->canShrink(
MI, *MRI))
1015 const MachineOperand *Sdst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst);
1021 MachineInstr &MISucc = *NextOp->
getParent();
1024 MachineOperand *CarryIn =
TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
1027 MachineOperand *CarryOut =
TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
1034 MachineBasicBlock &
MBB = *
MI.getParent();
1042 if (
I->modifiesRegister(AMDGPU::VCC,
TRI))
1048 .
add(*
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst))
1049 .
add(*
TII->getNamedOperand(
MI, AMDGPU::OpName::src0))
1050 .
add(*
TII->getNamedOperand(
MI, AMDGPU::OpName::src1))
1053 MI.eraseFromParent();
1065void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &
MI,
1066 const GCNSubtarget &ST)
const {
1067 assert(
MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1070 if (!
TII->canShrink(
MI, *MRI)) {
1075 const MachineOperand &CarryIn =
1076 *
TII->getNamedOperand(
MI, AMDGPU::OpName::src2);
1078 MachineInstr *CarryDef = MRI->
getVRegDef(CarryReg);
1085 MCRegister
Vcc =
TRI->getVCC();
1086 MachineBasicBlock &
MBB = *
MI.getParent();
1090 LLVM_DEBUG(
dbgs() <<
"VCC not known to be dead before instruction\n");
1098 .
add(*
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst))
1099 .
add(*
TII->getNamedOperand(
MI, AMDGPU::OpName::src0))
1100 .
add(*
TII->getNamedOperand(
MI, AMDGPU::OpName::src1))
1102 TII->fixImplicitOperands(*Converted);
1105 MI.eraseFromParent();
1109bool isConvertibleToSDWA(MachineInstr &
MI,
1110 const GCNSubtarget &ST,
1111 const SIInstrInfo*
TII) {
1113 unsigned Opc =
MI.getOpcode();
1119 if (
Opc == AMDGPU::V_CNDMASK_B32_e64)
1129 if (!
ST.hasSDWAOmod() &&
TII->hasModifiersSet(
MI, AMDGPU::OpName::omod))
1133 if (!
ST.hasSDWASdst()) {
1134 const MachineOperand *SDst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst);
1135 if (SDst && (SDst->
getReg() != AMDGPU::VCC &&
1136 SDst->
getReg() != AMDGPU::VCC_LO))
1140 if (!
ST.hasSDWAOutModsVOPC() &&
1141 (
TII->hasModifiersSet(
MI, AMDGPU::OpName::clamp) ||
1142 TII->hasModifiersSet(
MI, AMDGPU::OpName::omod)))
1145 }
else if (
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst) ||
1146 !
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)) {
1150 if (!
ST.hasSDWAMac() && (
Opc == AMDGPU::V_FMAC_F16_e32 ||
1151 Opc == AMDGPU::V_FMAC_F32_e32 ||
1152 Opc == AMDGPU::V_MAC_F16_e32 ||
1153 Opc == AMDGPU::V_MAC_F32_e32))
1157 if (
TII->pseudoToMCOpcode(
Opc) == -1)
1160 if (MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0)) {
1165 if (MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1)) {
1174MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &
MI) {
1175 unsigned Opcode =
MI.getOpcode();
1179 if (SDWAOpcode == -1)
1181 assert(SDWAOpcode != -1);
1183 const MCInstrDesc &SDWADesc =
TII->get(SDWAOpcode);
1186 MachineInstrBuilder SDWAInst =
1191 MachineOperand *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1195 }
else if ((Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))) {
1200 SDWAInst.
addReg(
TRI->getVCC(), RegState::Define);
1205 MachineOperand *Src0 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0);
1208 if (
auto *
Mod =
TII->getNamedOperand(
MI, AMDGPU::OpName::src0_modifiers))
1212 SDWAInst.
add(*Src0);
1215 MachineOperand *Src1 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1);
1219 if (
auto *
Mod =
TII->getNamedOperand(
MI, AMDGPU::OpName::src1_modifiers))
1223 SDWAInst.
add(*Src1);
1226 if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
1227 SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
1228 SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1229 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1231 MachineOperand *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2);
1233 SDWAInst.
add(*Src2);
1238 MachineOperand *Clamp =
TII->getNamedOperand(
MI, AMDGPU::OpName::clamp);
1240 SDWAInst.
add(*Clamp);
1247 MachineOperand *OMod =
TII->getNamedOperand(
MI, AMDGPU::OpName::omod);
1249 SDWAInst.
add(*OMod);
1257 SDWAInst.
addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1260 SDWAInst.
addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1263 SDWAInst.
addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1267 SDWAInst.
addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1271 MachineInstr *Ret = SDWAInst.
getInstr();
1272 TII->fixImplicitOperands(*Ret);
1276bool SIPeepholeSDWA::convertToSDWA(MachineInstr &
MI,
1277 const SDWAOperandsVector &SDWAOperands) {
1280 MachineInstr *SDWAInst;
1281 if (
TII->isSDWA(
MI.getOpcode())) {
1285 SDWAInst =
MI.getMF()->CloneMachineInstr(&
MI);
1286 MI.getParent()->insert(
MI.getIterator(), SDWAInst);
1288 SDWAInst = createSDWAVersion(
MI);
1292 bool Converted =
false;
1293 for (
auto &Operand : SDWAOperands) {
1305 if (PotentialMatches.count(Operand->getParentInst()) == 0)
1306 Converted |= Operand->convertToSDWA(*SDWAInst,
TII);
1314 ConvertedInstructions.
push_back(SDWAInst);
1315 for (MachineOperand &MO : SDWAInst->
uses()) {
1322 ++NumSDWAInstructionsPeepholed;
1324 MI.eraseFromParent();
1330void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &
MI,
1331 const GCNSubtarget &ST)
const {
1332 const MCInstrDesc &
Desc =
TII->get(
MI.getOpcode());
1333 unsigned ConstantBusCount = 0;
1334 for (MachineOperand &
Op :
MI.explicit_uses()) {
1336 if (
TRI->isVGPR(*MRI,
Op.getReg()))
1339 if (
ST.hasSDWAScalar() && ConstantBusCount == 0) {
1343 }
else if (!
Op.isImm())
1346 unsigned I =
Op.getOperandNo();
1347 const TargetRegisterClass *OpRC =
TII->getRegClass(
Desc,
I);
1348 if (!OpRC || !
TRI->isVSSuperClass(OpRC))
1353 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1355 Copy.addImm(
Op.getImm());
1356 else if (
Op.isReg())
1358 Op.ChangeToRegister(VGPR,
false);
1362bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
1366 return SIPeepholeSDWA().run(MF);
1369bool SIPeepholeSDWA::run(MachineFunction &MF) {
1376 TRI =
ST.getRegisterInfo();
1377 TII =
ST.getInstrInfo();
1381 for (MachineBasicBlock &
MBB : MF) {
1388 matchSDWAOperands(
MBB);
1389 for (
const auto &OperandPair : SDWAOperands) {
1390 const auto &Operand = OperandPair.second;
1391 MachineInstr *PotentialMI = Operand->potentialToConvert(
TII, ST);
1396 case AMDGPU::V_ADD_CO_U32_e64:
1397 case AMDGPU::V_SUB_CO_U32_e64:
1398 pseudoOpConvertToVOP2(*PotentialMI, ST);
1400 case AMDGPU::V_CNDMASK_B32_e64:
1401 convertVcndmaskToVOP2(*PotentialMI, ST);
1405 SDWAOperands.clear();
1408 matchSDWAOperands(
MBB);
1410 for (
const auto &OperandPair : SDWAOperands) {
1411 const auto &Operand = OperandPair.second;
1412 MachineInstr *PotentialMI =
1413 Operand->potentialToConvert(
TII, ST, &PotentialMatches);
1415 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST,
TII))
1416 PotentialMatches[PotentialMI].push_back(Operand.get());
1419 for (
auto &PotentialPair : PotentialMatches) {
1420 MachineInstr &PotentialMI = *PotentialPair.first;
1421 convertToSDWA(PotentialMI, PotentialPair.second);
1424 PotentialMatches.clear();
1425 SDWAOperands.clear();
1431 while (!ConvertedInstructions.
empty())
1432 legalizeScalarOperands(*ConvertedInstructions.
pop_back_val(), ST);
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static std::optional< SdwaSel > combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel)
Combine an SDWA instruction's existing SDWA selection Sel with the SDWA selection OperandSel of its o...
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, AMDGPU::OpName SrcSelOpName, SdwaSel OpSel)
Verify that the SDWA selection operand SrcSelOpName of the SDWA instruction MI can be combined with t...
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
LLVM_ABI void setPreservesCFG()
This function should be called by a pass iff it does not:
Represents analyses that only rely on functions' control flow.
FunctionPass class - This class is used to implement most global optimizations.
bool hasOptNone() const
Do not optimize this function (-O0).
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mop_range uses()
Returns all operands which may be register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI MachineOperand * getOneNonDBGUse(Register RegNo) const
If the register has a single non-Debug use, returns it; otherwise returns nullptr.
MachineOperand * getOneDef(Register Reg) const
Returns the defining operand if there is exactly one operand defining the specified register,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
iterator_range< def_iterator > def_operands(Register Reg) const
This class implements a map that also provides access to all stored values in a deterministic order.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
LLVM_READONLY int32_t getSDWAOp(uint32_t Opcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr RegState getKillRegState(bool B)
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
FunctionPass * createSIPeepholeSDWALegacyPass()
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
char & SIPeepholeSDWALegacyID