Go to the documentation of this file.
21 #define DEBUG_TYPE "si-pre-emit-peephole"
28 "Number of instructions before jumping over divergent control flow"),
61 "SI peephole optimizations",
false,
false)
63 char SIPreEmitPeephole::
ID = 0;
67 bool SIPreEmitPeephole::optimizeVccBranch(
MachineInstr &
MI)
const {
89 const bool IsWave32 =
ST.isWave32();
90 const unsigned CondReg =
TRI->getVCC();
91 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
92 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
93 const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
94 const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
98 bool ReadsCond =
false;
99 unsigned Threshold = 5;
100 for (++A;
A !=
E; ++
A) {
103 if (
A->modifiesRegister(ExecReg,
TRI))
105 if (
A->modifiesRegister(CondReg,
TRI)) {
106 if (!
A->definesRegister(CondReg,
TRI) ||
107 (
A->getOpcode() !=
And &&
A->getOpcode() != AndN2))
111 ReadsCond |=
A->readsRegister(CondReg,
TRI);
119 TII->commuteInstruction(*A);
122 if (Op1.
getReg() != ExecReg)
127 int64_t MaskValue = 0;
131 auto M = std::next(A);
132 bool ReadsSreg =
false;
133 bool ModifiesExec =
false;
134 for (;
M !=
E; ++
M) {
135 if (
M->definesRegister(SReg,
TRI))
137 if (
M->modifiesRegister(SReg,
TRI))
139 ReadsSreg |=
M->readsRegister(SReg,
TRI);
140 ModifiesExec |=
M->modifiesRegister(ExecReg,
TRI);
148 if (
A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
150 A->eraseFromParent();
153 if (!
M->isMoveImmediate() || !
M->getOperand(1).isImm() ||
154 (
M->getOperand(1).getImm() != -1 &&
M->getOperand(1).getImm() != 0))
156 MaskValue =
M->getOperand(1).getImm();
159 if (!ReadsSreg && Op2.
isKill()) {
160 A->getOperand(2).ChangeToImmediate(MaskValue);
161 M->eraseFromParent();
163 }
else if (Op2.
isImm()) {
170 assert(MaskValue == 0 || MaskValue == -1);
171 if (
A->getOpcode() == AndN2)
172 MaskValue = ~MaskValue;
175 if (!
MI.killsRegister(CondReg,
TRI)) {
177 if (MaskValue == 0) {
178 BuildMI(*
A->getParent(), *A,
A->getDebugLoc(),
TII->get(Mov), CondReg)
181 BuildMI(*
A->getParent(), *A,
A->getDebugLoc(),
TII->get(Mov), CondReg)
186 A->eraseFromParent();
189 bool IsVCCZ =
MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
190 if (SReg == ExecReg) {
193 MI.eraseFromParent();
196 MI.setDesc(
TII->get(AMDGPU::S_BRANCH));
197 }
else if (IsVCCZ && MaskValue == 0) {
208 Found =
Term.isIdenticalTo(
MI);
211 assert(Found &&
"conditional branch is not terminator");
214 assert(Dst.isMBB() &&
"destination is not basic block");
216 BranchMI->eraseFromParent();
224 MI.setDesc(
TII->get(AMDGPU::S_BRANCH));
225 }
else if (!IsVCCZ && MaskValue == 0) {
228 assert(Dst.isMBB() &&
"destination is not basic block");
229 MI.getParent()->removeSuccessor(Dst.getMBB());
230 MI.eraseFromParent();
232 }
else if (MaskValue == -1) {
235 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
238 MI.removeOperand(
MI.findRegisterUseOperandIdx(CondReg,
false ,
TRI));
244 bool SIPreEmitPeephole::optimizeSetGPR(
MachineInstr &First,
254 if (!
MI.isIdenticalTo(First))
259 E =
MI.getIterator();
263 switch (
I->getOpcode()) {
264 case AMDGPU::S_SET_GPR_IDX_MODE:
266 case AMDGPU::S_SET_GPR_IDX_OFF:
273 if (IdxReg &&
I->modifiesRegister(IdxReg,
TRI))
278 TRI->isVectorRegister(MRI, MO.getReg());
282 if (!IdxOn || !(
I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
283 I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
289 MI.eraseFromBundle();
291 RI->eraseFromBundle();
295 bool SIPreEmitPeephole::getBlockDestinations(
307 bool SIPreEmitPeephole::mustRetainExeczBranch(
309 unsigned NumInstr = 0;
320 if (
MI.isConditionalBranch())
323 if (
TII->hasUnwantedEffectsWhenEXECEmpty(
MI))
328 TII->isDS(
MI) ||
MI.getOpcode() == AMDGPU::S_WAITCNT)
347 if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB,
Cond))
352 mustRetainExeczBranch(*FalseMBB, *TrueMBB))
356 MI.eraseFromParent();
364 TII =
ST.getInstrInfo();
365 TRI = &
TII->getRegisterInfo();
366 bool Changed =
false;
375 switch (
MI.getOpcode()) {
376 case AMDGPU::S_CBRANCH_VCCZ:
377 case AMDGPU::S_CBRANCH_VCCNZ:
378 Changed |= optimizeVccBranch(
MI);
380 case AMDGPU::S_CBRANCH_EXECZ:
381 Changed |= removeExeczBranch(
MI,
MBB);
386 if (!
ST.hasVGPRIndexMode())
390 const unsigned Threshold = 20;
399 if (Count == Threshold)
404 if (
MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
413 if (optimizeSetGPR(*SetGPRMI,
MI))
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
This is an optimization pass for GlobalISel generic memory operations.
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
LocationClass< Ty > location(Ty &L)
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e.g. it's not currently represented via the standard conditional/unconditional branch forms).
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
ReachingDefAnalysis InstSet & ToRemove
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
char & SIPreEmitPeepholeID
iterator_range< iterator > terminators()
static cl::opt< unsigned, true > SkipThresholdFlag("amdgpu-skip-threshold", cl::Hidden, cl::desc("Number of instructions before jumping over divergent control flow"), cl::location(SkipThreshold), cl::init(12))
unsigned const TargetRegisterInfo * TRI
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
@ And
Bitwise or logical AND of integers.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application startup.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
const HexagonInstrInfo * TII
into llvm powi allowing the code generator to produce balanced multiplication trees First
MachineOperand class - Representation of each machine instruction operand.
unsigned M0(unsigned Val)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Representation of each machine instruction.
initializer< Ty > init(const Ty &Val)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Register getReg() const
getReg - Returns the register number.
instr_iterator instr_begin()
instr_iterator instr_end()
static unsigned SkipThreshold
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFunction yet, in which case this returns -1.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
SmallVector< MachineOperand, 4 > Cond
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned const MachineRegisterInfo * MRI
Wrapper class representing virtual and physical registers.
void initializeSIPreEmitPeepholePass(PassRegistry &)
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Iterator for intrusive lists based on ilist_node.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineBasicBlock * getFallThrough()
Return the fallthrough block if the block can implicitly transfer control to the block after it by falling off the end of it.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on element type.
BlockVerifier::State From
MachineBasicBlock iterator that automatically skips over MIs that are inside bundles (i.e., walk top level MIs only).
void RenumberBlocks(MachineBasicBlock *MBBFrom=nullptr)
RenumberBlocks - This discards all of the MachineBasicBlock numbers and recomputes them.