LLVM 23.0.0git
AMDGPUGlobalISelDivergenceLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
11/// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies.
12/// Handles all cases of temporal divergence.
13/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
14/// currently depends on LCSSA to insert phis with one incoming.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPU.h"
20#include "SILowerI1Copies.h"
25
26#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
27
28using namespace llvm;
29
30namespace {
31
// Legacy pass-manager wrapper that drives GlobalISel divergence lowering:
// selects divergent i1 phis as lane mask phis and lowers temporal divergence
// (see the file header comment for the overall strategy).
class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid.

public:
  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Only virtual registers are rewritten; control flow is never changed.
    AU.setPreservesCFG();
    // NOTE(review): the full source also registers required analyses here
    // (dominator/post-dominator trees and uniformity info, which
    // runOnMachineFunction queries below); those lines are elided in this
    // extracted view.
  }
};
53
// Implements the target hooks PhiLoweringHelper needs to run the lane-mask
// merging algorithm on divergent i1 phis, plus the temporal-divergence
// lowering entry points used by runOnMachineFunction.
class DivergenceLoweringHelper : public AMDGPU::PhiLoweringHelper {
public:
  // NOTE(review): the remaining constructor parameters (post-dominator tree,
  // uniformity info — see the definition below) are elided in this extracted
  // view.
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,

private:
  MachineUniformityInfo *MUI = nullptr; // Uniformity analysis; not owned.
  // Emits 'LaneMask = COPY Reg' right after Reg's definition (skipping phis)
  // and returns the new lane mask register.
  Register buildRegCopyToLaneMask(Register Reg);

public:
  // PhiLoweringHelper overrides; see SILowerI1Copies.h for the contracts.
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<AMDGPU::Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(AMDGPU::Incoming &In) override;

  // Temporal divergence lowering entry points; both return false since no
  // instruction is deleted (see the definitions below).
  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};
83
// Initializes the base PhiLoweringHelper, caches the uniformity analysis and
// sets up the MachineIRBuilder on MF.
// NOTE(review): the constructor's parameter-list lines are elided in this
// extracted view.
DivergenceLoweringHelper::DivergenceLoweringHelper(
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
88
89// _(s1) -> SReg_32/64(s1)
90void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
91 assert(MRI->getType(DstReg) == LLT::scalar(1));
92
93 if (MRI->getRegClassOrNull(DstReg)) {
94 if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
95 return;
96 llvm_unreachable("Failed to constrain register class");
97 }
98
99 MRI->setRegClass(DstReg, ST->getBoolRC());
100}
101
102void DivergenceLoweringHelper::getCandidatesForLowering(
103 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
104 LLT S1 = LLT::scalar(1);
105
106 // Add divergent i1 G_PHIs to the list. Only consider G_PHI instructions,
107 // not PHI instructions that may have been created by earlier lowering stages
108 // (e.g., lowerTemporalDivergenceI1).
109 for (MachineBasicBlock &MBB : *MF) {
110 for (MachineInstr &MI : MBB.phis()) {
111 if (MI.getOpcode() != TargetOpcode::G_PHI)
112 continue;
113 Register Dst = MI.getOperand(0).getReg();
114 if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
115 Vreg1Phis.push_back(&MI);
116 }
117 }
118}
119
120void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
121 const MachineInstr *MI,
122 SmallVectorImpl<AMDGPU::Incoming> &Incomings) const {
123 for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
124 Incomings.emplace_back(MI->getOperand(i).getReg(),
125 MI->getOperand(i + 1).getMBB(), Register());
126 }
127}
128
// Make OldReg refer to the value now produced in NewReg by emitting
// 'OldReg = COPY NewReg' right after the phi instructions in MBB.
// NOTE(review): the second signature line (the MachineBasicBlock *MBB
// parameter) is elided in this extracted view.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}
134
135// Copy Reg to new lane mask register, insert a copy after instruction that
136// defines Reg while skipping phis if needed.
137Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
138 Register LaneMask = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
139 MachineInstr *Instr = MRI->getVRegDef(Reg);
140 MachineBasicBlock *MBB = Instr->getParent();
141 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
142 B.buildCopy(LaneMask, Reg);
143 return LaneMask;
144}
145
// bb.previous
// %PrevReg = ...
//
// bb.current
// %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous
// %PrevReg = ...
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
// %CurReg = ...
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
// ...
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
//
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
// NOTE(review): the signature line carrying the insertion block/point
// parameters (MBB, I) used below is elided in this extracted view.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  // Copies placed right after the defining instructions give both inputs
  // lane-mask register attributes before the bitwise merge.
  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = AMDGPU::createLaneMaskReg(MRI, LaneMaskRegAttrs);

  // Emit ANDN2/AND/OR using the opcodes and EXEC register provided by LMC.
  B.setInsertPt(MBB, I);
  B.buildInstr(LMC->AndN2Opc, {PrevMaskedReg}, {PrevRegCopy, LMC->ExecReg});
  B.buildInstr(LMC->AndOpc, {CurMaskedReg}, {LMC->ExecReg, CurRegCopy});
  B.buildInstr(LMC->OrOpc, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}
185
186// GlobalISel has to constrain S1 incoming taken as-is with lane mask register
187// class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
188// Incoming.Reg becomes that new lane mask.
189void DivergenceLoweringHelper::constrainAsLaneMask(AMDGPU::Incoming &In) {
190 B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
191
192 auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
193 MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
194 In.Reg = Copy.getReg(0);
195}
196
197void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
198 Register NewReg) {
199 for (MachineOperand &Op : Inst->operands()) {
200 if (Op.isReg() && Op.getReg() == Reg)
201 Op.setReg(NewReg);
202 }
203}
204
// Non-i1 temporal divergence lowering: registers from the uniformity
// analysis's temporal-divergence list (used outside a cycle with divergent
// exit) are read through a COPY that carries an implicit EXEC use.
// Returns false: no instruction is deleted.
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  // NOTE(review): the full source declares the locals ILMA (lane-mask
  // analyzer) and TDCache (Register -> Register copy cache) here; those
  // lines are elided in this extracted view.

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    // Skip i1 values (handled by lowerTemporalDivergenceI1), values that are
    // already divergent, and s32/s64 registers that are really lane masks.
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    // Reuse a previously created copy of this register if one exists.
    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    // Insert right after the definition, past any phis/labels.
    // NOTE(review): the declaration binding MBB to Inst's parent block is
    // elided in this extracted view.
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    // COPY with an implicit EXEC use — presumably to pin the copy relative
    // to EXEC-modifying instructions; confirm against the full source.
    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(LMC->ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return false;
}
233
// i1 temporal divergence lowering: an i1 lane mask defined inside a cycle and
// used outside must be merged with EXEC in each iteration (see the file
// header and the comments in runOnMachineFunction).
// Returns false: no instruction is deleted.
bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  // All lane masks created here use the boolean RC with an s1 LLT.
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  // NOTE(review): the declarations of SSAUpdater and LRCCache (Register ->
  // (largest relevant cycle, merged mask)) used below are elided in this
  // extracted view.

  // In case of use outside multiple nested cycles or multiple uses we only
  // need to merge lane mask across largest relevant cycle.
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    // Keep, per register, the largest cycle seen so far.
    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC)) {
      CachedLRC = LRC;
    }
  }

  // Build one merged lane mask per register across its largest cycle.
  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.first;
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    // The merged mask is available in the block that defines Reg.
    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);

    // Predecessors of cycle entries that lie outside the cycle contribute an
    // undefined value: the mask is only meaningful once the cycle is entered.
    for (auto Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

    // Merge Reg's active lanes into the running mask at the end of the
    // defining block. NOTE(review): the call's final argument line (likely
    // the SSAUpdater-provided previous value) is elided in this view.
    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,

    CycleMergedMask.second = MergedMask;
  }

  // Rewrite each i1 temporal-divergence use to the merged lane mask.
  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return false;
}
290
291} // End anonymous namespace.
292
// Legacy pass registration.
// NOTE(review): the INITIALIZE_PASS_DEPENDENCY lines between BEGIN and END
// (dominator trees, uniformity analysis) are elided in this extracted view.
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

// Exported pass ID reference.
// NOTE(review): the 'char &llvm::...ID =' declarator line is elided here.
    AMDGPUGlobalISelDivergenceLowering::ID;

// Factory for the legacy pass.
// NOTE(review): the function's signature line is elided in this view; per
// the forward declaration it is llvm::createAMDGPUGlobalISelDivergenceLoweringPass().
  return new AMDGPUGlobalISelDivergenceLowering();
}
309
// Entry point: fetches the required analyses and runs the three lowering
// stages in a fixed order (see the comments below for why the order matters).
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  // NOTE(review): the declarations binding DT, PDT and MUI to these analysis
  // results are elided in this extracted view.
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering needs to inspect list of instructions used
  // outside cycle with divergent exit provided by uniformity analysis. Uniform
  // instructions from the list require lowering, no instruction is deleted.
  // Thus it needs to be run before lowerPhis that deletes phis that require
  // lowering and replaces them with new instructions.

  // Non-i1 temporal divergence lowering.
  Changed |= Helper.lowerTemporalDivergence();
  // This covers both uniform and divergent i1s. Lane masks are in sgpr and need
  // to be updated in each iteration.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Temporal divergence lowering of divergent i1 phi used outside of the cycle
  // could also be handled by lowerPhis but we do it in lowerTempDivergenceI1
  // since in some case lowerPhis does unnecessary lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define DEBUG_TYPE
const HexagonInstrInfo * TII
#define _
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
This file declares the MachineIRBuilder class.
Register Reg
Machine IR instance of the generic uniformity analysis.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Interface definition of the PhiLoweringHelper class that implements lane mask merging algorithm for d...
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
A debug info location.
Definition DebugLoc.h:123
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
iterator_range< TemporalDivergenceTuple * > getTemporalDivergenceList() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Helper class to build MachineInstr.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
mop_range operands()
MachineOperand class - Representation of each machine instruction operand.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineSSAUpdater - This class updates SSA form for a set of virtual registers defined in multiple bl...
Legacy analysis pass which computes a MachineUniformityInfo.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Register createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
CycleInfo::CycleT Cycle
Definition CycleInfo.h:26
DWARFExpression::Operation Op
FunctionPass * createAMDGPUGlobalISelDivergenceLoweringPass()
MachineCycleInfo::CycleT MachineCycle
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
All attributes(register class or bank and low-level type) a virtual register can have.