LLVM 20.0.0git
AMDGPURegBankLegalize.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalize.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Lower G_ instructions that can't be inst-selected with register bank
10/// assignment from AMDGPURegBankSelect based on machine uniformity info.
11/// Given types on all operands, some register bank assignments require lowering
12/// while others do not.
13/// Note: cases where all register bank assignments would require lowering are
14/// lowered in the legalizer.
15/// For example vgpr S64 G_AND requires lowering to S32 while sgpr S64 does not.
16/// Eliminate sgpr S1 by lowering to sgpr S32.
17//
18//===----------------------------------------------------------------------===//
19
20#include "AMDGPU.h"
23#include "GCNSubtarget.h"
30
31#define DEBUG_TYPE "amdgpu-regbanklegalize"
32
33using namespace llvm;
34using namespace AMDGPU;
35
36namespace {
37
/// Machine function pass that performs the register-bank-driven lowering
/// described in the header comment of this file.
38class AMDGPURegBankLegalize : public MachineFunctionPass {
39public:
// Pass identifier; the constructor passes it to the MachineFunctionPass base.
40 static char ID;
41
42public:
43 AMDGPURegBankLegalize() : MachineFunctionPass(ID) {
45 }
46
// Entry point of the pass; defined out of line later in this file.
47 bool runOnMachineFunction(MachineFunction &MF) override;
48
// Human-readable name of the pass.
49 StringRef getPassName() const override {
50 return "AMDGPU Register Bank Legalize";
51 }
52
// NOTE(review): the body of getAnalysisUsage is not visible in this listing.
53 void getAnalysisUsage(AnalysisUsage &AU) const override {
58 }
59
// NOTE(review): the declaration this comment belongs to is not visible in this
// listing; only the NoPHIs property argument is. Presumably the pass clears
// the NoPHIs property - confirm against the full source.
60 // If there were no phis and we do waterfall expansion machine verifier would
61 // fail.
64 MachineFunctionProperties::Property::NoPHIs);
65 }
66};
67
68} // End anonymous namespace.
69
// Pass registration. Following the INITIALIZE_PASS_* macro parameter order
// (passName, arg, name, cfg, analysis): DEBUG_TYPE is the pass argument, the
// string is the displayed name, and both the cfg and analysis flags are false.
70INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE,
71 "AMDGPU Register Bank Legalize", false, false)
75INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE,
76 "AMDGPU Register Bank Legalize", false, false)
77
// Storage for the pass identifier declared in the class.
78char AMDGPURegBankLegalize::ID = 0;
79
// Reference to the pass identifier, exported in the llvm namespace.
80char &llvm::AMDGPURegBankLegalizeID = AMDGPURegBankLegalize::ID;
81
// Returns a newly allocated instance of the pass.
// NOTE(review): ownership is presumably taken over by the caller - confirm.
83 return new AMDGPURegBankLegalize();
84}
85
// Mutex protecting the function-local rule-set cache below.
// NOTE(review): the first line of the CacheForRuleSet declaration (its type)
// is not visible in this listing.
88 static std::mutex GlobalMutex;
90 CacheForRuleSet;
// Held for the remainder of the function, so the lookup, insertion and refresh
// below all happen under the lock.
91 std::lock_guard<std::mutex> Lock(GlobalMutex);
// The cache is keyed by ST.getGeneration(): build the rule set the first time
// a generation is seen and store it in the cache.
92 if (!CacheForRuleSet.contains(ST.getGeneration())) {
93 auto Rules = std::make_unique<RegBankLegalizeRules>(ST, MRI);
94 CacheForRuleSet[ST.getGeneration()] = std::move(Rules);
95 } else {
// Cached entry: hand it the current ST and MRI. Presumably the rule set keeps
// references to both that would otherwise point at a previous function's
// objects - confirm against RegBankLegalizeRules::refreshRefs.
96 CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI);
97 }
// The cache keeps ownership; callers receive a reference to the cached object.
98 return *CacheForRuleSet[ST.getGeneration()];
99}
100
// NOTE(review): the class header and the declarations of the B and MRI members
// are not visible in this listing; both are initialized by the constructor
// below.
104 const SIRegisterInfo &TRI;
// Register banks used by the combines; looked up once in the constructor.
105 const RegisterBank *SgprRB;
106 const RegisterBank *VgprRB;
107 const RegisterBank *VccRB;
108
// Shorthands for the scalar types the combines compare against.
109 static constexpr LLT S1 = LLT::scalar(1);
110 static constexpr LLT S16 = LLT::scalar(16);
111 static constexpr LLT S32 = LLT::scalar(32);
112 static constexpr LLT S64 = LLT::scalar(64);
113
114public:
// Constructor: MRI is obtained from the builder, and the sgpr, vgpr and vcc
// register banks are resolved through RBI by their bank IDs.
116 const RegisterBankInfo &RBI)
117 : B(B), MRI(*B.getMRI()), TRI(TRI),
118 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
119 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
120 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {};
121
// A register counts as a lane mask in either of two cases.
// Case 1: it has a register bank assigned and that bank is the VCC bank.
123 const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
124 if (RB && RB->getID() == AMDGPU::VCCRegBankID)
125 return true;
126
// Case 2: it has a register class assigned, that class is an SGPR class, and
// its type is S1.
127 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
128 return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
129 }
130
// MI is always erased. Optional0 may be null; when present it is erased only
// if it is trivially dead at this point (checked after MI has been removed).
132 MI.eraseFromParent();
133 if (Optional0 && isTriviallyDead(*Optional0, MRI))
134 Optional0->eraseFromParent();
135 }
136
137 std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
138 MachineInstr *MatchMI = MRI.getVRegDef(Src);
139 if (MatchMI->getOpcode() != Opcode)
140 return {nullptr, Register()};
141 return {MatchMI, MatchMI->getOperand(1).getReg()};
142 }
143
// Combines a COPY between two virtual registers in two situations: an sgpr S1
// copied into a lane mask, and an sgpr G_AMDGPU_READANYLANE result copied into
// a vgpr. Any other copy is left untouched.
145 Register Dst = MI.getOperand(0).getReg();
146 Register Src = MI.getOperand(1).getReg();
147 // Skip copies of physical registers.
148 if (!Dst.isVirtual() || !Src.isVirtual())
149 return;
150
151 // This is a cross bank copy, sgpr S1 to lane mask.
152 //
153 // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32)
154 // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1)
155 // ->
156 // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32)
157 if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) {
158 auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC);
// The match is asserted rather than tested: at this point an sgpr S1 is
// expected to always come from a G_TRUNC of an sgpr S32.
159 assert(Trunc && MRI.getType(TruncS32Src) == S32 &&
160 "sgpr S1 must be result of G_TRUNC of sgpr S32");
161
// New instructions are inserted at the position of the copy being replaced.
162 B.setInstr(MI);
163 // Ensure that truncated bits in BoolSrc are 0.
164 auto One = B.buildConstant({SgprRB, S32}, 1);
165 auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
166 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
// Removes the copy, and the G_TRUNC as well if nothing else uses it.
167 cleanUpAfterCombine(MI, Trunc);
168 return;
169 }
170
171 // Src = G_AMDGPU_READANYLANE RALSrc
172 // Dst = COPY Src
173 // ->
174 // Dst = RALSrc
175 if (MRI.getRegBankOrNull(Dst) == VgprRB &&
176 MRI.getRegBankOrNull(Src) == SgprRB) {
177 auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
// Unlike the lane-mask case above, a failed match is not an error here; the
// copy is simply kept.
178 if (!RAL)
179 return;
180
// The read-any-lane source is expected to be in the vgpr bank already, so
// every use of Dst can refer to it directly.
181 assert(MRI.getRegBank(RALSrc) == VgprRB);
182 MRI.replaceRegWith(Dst, RALSrc);
// NOTE(review): listing line 183 is not visible here; presumably it removes
// the now-redundant instructions - confirm against the full source.
184 return;
185 }
186 }
187
// Folds a G_ANYEXT whose S1 source is produced by a G_TRUNC, so that the
// extended value is derived from the G_TRUNC source instead:
189 // %Src:sgpr(S1) = G_TRUNC %TruncSrc
190 // %Dst = G_ANYEXT %Src:sgpr(S1)
191 // ->
192 // %Dst = G_... %TruncSrc
193 Register Dst = MI.getOperand(0).getReg();
194 Register Src = MI.getOperand(1).getReg();
// Only extensions from S1 are handled.
195 if (MRI.getType(Src) != S1)
196 return;
197
// Nothing to combine unless the S1 value is defined by a G_TRUNC.
198 auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC);
199 if (!Trunc)
200 return;
201
202 LLT DstTy = MRI.getType(Dst);
203 LLT TruncSrcTy = MRI.getType(TruncSrc);
204
// Same type on both ends: uses of Dst can read TruncSrc directly.
205 if (DstTy == TruncSrcTy) {
206 MRI.replaceRegWith(Dst, TruncSrc);
207 cleanUpAfterCombine(MI, Trunc);
208 return;
209 }
210
// The remaining cases build a replacement instruction at MI's position.
211 B.setInstr(MI);
212
// S64 source, S32 result: split the source into S32 pieces on the sgpr bank
// and use the first unmerge result in place of Dst.
213 if (DstTy == S32 && TruncSrcTy == S64) {
214 auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc);
215 MRI.replaceRegWith(Dst, Unmerge.getReg(0));
216 cleanUpAfterCombine(MI, Trunc);
217 return;
218 }
219
// S16 source, S32 result: extend the G_TRUNC source directly into Dst.
220 if (DstTy == S32 && TruncSrcTy == S16) {
221 B.buildAnyExt(Dst, TruncSrc);
222 cleanUpAfterCombine(MI, Trunc);
223 return;
224 }
225
// S32 source, S16 result: truncate the G_TRUNC source directly into Dst.
226 if (DstTy == S16 && TruncSrcTy == S32) {
227 B.buildTrunc(Dst, TruncSrc);
228 cleanUpAfterCombine(MI, Trunc);
229 return;
230 }
231
// Any other type pair has no combine implemented and is treated as a bug.
232 llvm_unreachable("missing anyext + trunc combine");
233 }
234};
235
236// Search through MRI for virtual registers with sgpr register bank and S1 LLT.
// Returns the first such register that has at least one definition, or a
// default-constructed Register when none is found. Marked [[maybe_unused]];
// presumably because its only use sits inside an assert - confirm.
237[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) {
238 const LLT S1 = LLT::scalar(1);
239 for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
// NOTE(review): the declaration of Reg (listing line 240) is not visible here;
// presumably it is the virtual register with index i - confirm.
// Registers without any definition, or with a type other than S1, are skipped.
241 if (MRI.def_empty(Reg) || MRI.getType(Reg) != S1)
242 continue;
243
244 const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
245 if (RB && RB->getID() == AMDGPU::SGPRRegBankID) {
// With debug output enabled, print the defining instruction of the offender.
246 LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: ";
247 MRI.getVRegDef(Reg)->dump(););
248 return Reg;
249 }
250 }
251
252 return {};
253}
254
// Runs register bank legalization on MF in two phases: first every generic
// instruction is legalized through RegBankLegalizeHelper, then leftover sgpr S1
// copies and any-extends are cleaned up by AMDGPURegBankLegalizeCombiner.
// Returns false without touching MF if instruction selection already failed,
// and true otherwise.
255bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
256 if (MF.getProperties().hasProperty(
257 MachineFunctionProperties::Property::FailedISel))
258 return false;
259
260 // Setup the instruction builder with CSE.
261 const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
// NOTE(review): the declaration head of Wrapper (listing line 262) is not
// visible here.
263 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
264 GISelCSEInfo &CSEInfo = Wrapper.get(TPC.getCSEConfig());
// The CSE info is registered as an observer, and the observer wrapper is
// installed on both the builder and the function below.
265 GISelObserverWrapper Observer;
266 Observer.addObserver(&CSEInfo);
267
268 CSEMIRBuilder B(MF);
269 B.setCSEInfo(&CSEInfo);
270 B.setChangeObserver(Observer);
271
272 RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
273 RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);
274
// NOTE(review): the declarations of ST and MRI (listing lines 275-276) are not
// visible here.
277 const RegisterBankInfo &RBI = *ST.getRegBankInfo();
278 const MachineUniformityInfo &MUI =
279 getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
280
281 // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes.
282 const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);
283
284 // Logic that does legalization based on IDs assigned to Opcode.
285 RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules);
286
// NOTE(review): the declaration of AllInst (listing line 287) is not visible
// here. All instructions are collected up front and legalized from that list;
// presumably so that instructions created during legalization are not visited
// - confirm.
288
289 for (MachineBasicBlock &MBB : MF) {
290 for (MachineInstr &MI : MBB) {
291 AllInst.push_back(&MI);
292 }
293 }
294
295 for (MachineInstr *MI : AllInst) {
// Only generic (pre-isel) opcodes are legalized; everything else is skipped.
296 if (!MI->isPreISelOpcode())
297 continue;
298
299 unsigned Opc = MI->getOpcode();
300 // Insert point for use operands needs some calculation.
301 if (Opc == AMDGPU::G_PHI) {
302 RBLHelper.applyMappingPHI(*MI);
303 continue;
304 }
305
306 // Opcodes that support pretty much all combinations of reg banks and LLTs
307 // (except S1). There is no point in writing rules for them.
308 if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
309 Opc == AMDGPU::G_MERGE_VALUES) {
310 RBLHelper.applyMappingTrivial(*MI);
311 continue;
312 }
313
314 // Opcodes that also support S1.
315 if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
316 Opc == AMDGPU::G_IMPLICIT_DEF)) {
317 Register Dst = MI->getOperand(0).getReg();
318 // Non S1 types are trivially accepted.
319 if (MRI.getType(Dst) != LLT::scalar(1)) {
// Such results are expected to have been assigned to the sgpr bank already.
320 assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID);
321 continue;
322 }
323
324 // S1 rules are in RegBankLegalizeRules.
325 }
326
// Everything that reaches this point is legalized through the rule tables.
327 RBLHelper.findRuleAndApplyMapping(*MI);
328 }
329
330 // Sgpr S1 clean up combines:
331 // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine.
332 // In RegBankLegalize 'S1 Dst' are legalized into S32 as
333 // 'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'.
334 // S1 Truncs and Anyexts that come from legalizer, that can have non-S32
335 // types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up.
336 // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine.
337 // Divergent instruction uses sgpr S1 as input that should be lane mask(vcc)
338 // Legalizing this use creates sgpr S1(S32) to vcc Copy.
339
340 // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1:
341 // - Vcc to vcc Copy: nothing to do here, just a regular copy.
342 // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*).
343 // Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used
344 // instead. When only available instruction creates vcc result, use of
345 // UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC.
346
347 // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)':
348 // Copy from divergent to uniform register indicates an error in either:
349 // - Uniformity analysis: Uniform instruction has divergent input. If one of
350 // the inputs is divergent, instruction should be divergent!
351 // - RegBankLegalizer not executing in waterfall loop (missing implementation)
352
353 AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI);
354
355 for (MachineBasicBlock &MBB : MF) {
// NOTE(review): the inner loop header (listing line 356) is not visible here.
// The combines erase instructions, so it presumably iterates in a way that
// tolerates removal of the current instruction - confirm.
357 if (MI.getOpcode() == AMDGPU::COPY) {
358 Combiner.tryCombineCopy(MI);
359 continue;
360 }
361 if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
362 Combiner.tryCombineS1AnyExt(MI);
363 continue;
364 }
365 }
366 }
367
// NOTE(review): the statement these strings belong to (listing line 368) is
// not visible here; presumably an assert that no sgpr S1 register is left -
// confirm.
369 "Registers with sgpr reg bank and S1 LLT are not legal after "
370 "AMDGPURegBankLegalize. Should lower to sgpr S32");
371
// The function is always reported as modified once this point is reached.
372 return true;
373}
unsigned const MachineRegisterInfo * MRI
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static const LLT S1
AMDGPU Register Bank Legalize
static Register getAnySgprS1(const MachineRegisterInfo &MRI)
const RegBankLegalizeRules & getRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
#define DEBUG_TYPE
Lower G_ instructions that can't be inst-selected with register bank assignment from AMDGPURegBankSel...
MachineBasicBlock & MBB
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Provides analysis for continuously CSEing during GISel passes.
This file implements a version of MachineIRBuilder which CSEs insts within a MachineBasicBlock.
#define LLVM_DEBUG(...)
Definition: Debug.h:106
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
unsigned const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Target-Independent Code Generator Pass Configuration Options pass.
AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
void tryCombineS1AnyExt(MachineInstr &MI)
void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0)
std::pair< MachineInstr *, Register > tryMatch(Register Src, unsigned Opcode)
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
Defines a builder that does CSE of MachineInstructions using GISelCSEInfo.
Definition: CSEMIRBuilder.h:38
Combiner implementation.
Definition: Combiner.h:34
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
The actual analysis pass wrapper.
Definition: CSEInfo.h:225
Simple wrapper that does the following.
Definition: CSEInfo.h:207
The CSE Analysis object.
Definition: CSEInfo.h:70
Simple wrapper observer that takes several observers, and calls each one for each event.
void addObserver(GISelChangeObserver *O)
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
virtual MachineFunctionProperties getClearedProperties() const
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & set(Property P)
bool hasProperty(Property P) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Helper class to build MachineInstr.
Representation of each machine instruction.
Definition: MachineInstr.h:71
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Legacy analysis pass which computes a MachineUniformityInfo.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
A simple RAII based Delegate installer.
A simple RAII based Observer installer.
Holds all the information related to register banks.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Target-Independent Code Generator Pass Configuration Options.
virtual std::unique_ptr< CSEConfigBase > getCSEConfig() const
Returns the CSEConfig object to use for the current optimization level.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
FunctionPass * createAMDGPURegBankLegalizePass()
char & AMDGPURegBankLegalizeID
void initializeAMDGPURegBankLegalizePass(PassRegistry &)
bool isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI)
Check whether an instruction MI is dead: it only defines dead virtual registers, and doesn't have oth...
Definition: Utils.cpp:222