LLVM 17.0.0git
SIPreEmitPeephole.cpp
//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens when a block that assigns -1 or 0 to a saved mask is combined
  // with a block that consumes that saved mask and then branches.
  //
  // While searching, this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
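  // Walk backward from the branch, looking for the S_AND/S_ANDN2 that
  // defines vcc; give up if exec is clobbered first or the scan exceeds
  // Threshold instructions.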
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

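  // Canonicalize the AND so that exec is the first source operand; bail out
  // if exec is not an operand, or if the other operand is an immediate that
  // is neither -1 nor 0.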
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

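  // Resolve the second operand to a known mask value: either it is already
  // an immediate, or it is a register defined earlier in the block by a move
  // of -1 or 0 (continue the backward scan to find that definition).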
  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used in the AND instruction, fold the immediate into
    // the AND and erase the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for S_ANDN2: exec & ~sreg equals exec & ~MaskValue.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

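  // If SCC is dead and vcc has no other readers, the AND itself is
  // redundant: vcc is known to be either 0 or exec. Materialize that value
  // with a plain move (unless the branch is the last use of vcc) and drop
  // the AND.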
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace the AND with a MOV.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove the AND instruction.
    A->eraseFromParent();
  }

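  // Rewrite the branch itself using the known mask value: it either becomes
  // unconditional, disappears entirely, or turns into an exec-based branch.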
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

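  // The rewritten branch no longer reads vcc: drop the stale vcc use and
  // recompute the implicit operands for the new opcode.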
  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

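  // Only a second S_SET_GPR_IDX_ON that is identical to the first (same
  // index source and same mode) can be removed.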
  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to make
  // sure nothing invalidates reusing the first one.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

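  // The second S_SET_GPR_IDX_ON is redundant: erase it, along with any
  // intervening S_SET_GPR_IDX_OFF that it cancelled out.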
  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

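  // Walk every block that the execz branch would skip over and check whether
  // it is safe and cheap to execute with all lanes disabled.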
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
          TII->isDS(MI) || MI.getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only forward branches.
  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

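  // Renumber blocks so that their numbers reflect layout order; the
  // forward-branch check in removeExeczBranch() relies on this.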
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check the first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second one is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile-time purposes.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI :
         make_range(MBB.instr_begin(), MBB.instr_end())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}