//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations just before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens when a block that assigns -1 or 0 to a saved mask is combined
  // with a block that consumes that saved mask and then branches.
  //
  // While searching, this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
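  // Walk backward from the branch to find the instruction that defines the
  // condition register, giving up after a handful of instructions to bound
  // compile time. Any intervening write to EXEC invalidates the rewrite.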
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

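  // Canonicalize the AND so that EXEC is the first source operand, commuting
  // it if needed.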
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used by the AND instruction, fold the immediate into
    // the AND and erase the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for S_ANDN2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

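  // If SCC is dead and nothing between the AND and the branch reads the
  // condition, the AND only materializes the condition: replace it with a
  // MOV of the known mask value, or erase it outright when the branch is the
  // last user of the condition register.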
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace the AND with a MOV.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove the AND instruction.
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

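    // The unconditional branch also shadows any fallthrough successor.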
    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

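  // The surviving branch no longer reads vcc: drop the use operand and let
  // the target description re-add the correct implicit operands.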
  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

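// Remove a second S_SET_GPR_IDX_ON (MI) that is identical to an earlier one
// (First), together with any intervening S_SET_GPR_IDX_OFF, provided nothing
// in between touches M0, the index register, or indexed vector registers.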
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 16> ToRemove;
  bool IdxOn = true;

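  // The rewrite is only valid when the second S_SET_GPR_IDX_ON selects the
  // same index and mode as the first.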
  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two S_SET_GPR_IDX_ONs for anything
  // that would make the second one necessary.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

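// Returns true when the execz branch must be kept because the region between
// From and To contains an instruction that is unsafe or too expensive to
// execute with EXEC = 0.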
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
          TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

// Returns true if the skip branch instruction is removed.
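// For example, a forward skip of the form
//   s_cbranch_execz .LBB0_2
//   ; short divergent region
// .LBB0_2:
// is removed when executing the skipped region with EXEC = 0 is judged
// cheaper than always paying for the branch.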
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only forward branches.
  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

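  // removeExeczBranch relies on block numbers to recognize forward branches,
  // so recompute the numbering in layout order first.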
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check the first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if the
    // second one is unneeded. The expensive checks are done in
    // optimizeSetGPR(); the scan distance is limited to 20 instructions for
    // compile-time reasons.
    // Note: this needs to work on bundles, as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}