LLVM 12.0.0git
GCNNSAReassign.cpp
//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The later SIShrinkInstructions pass will replace
/// NSA encodings with sequential versions where possible.
///
//===----------------------------------------------------------------------===//
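//
// As an illustrative sketch (the register numbers below are made up, not taken
// from a real compilation): an NSA image instruction whose address operands
// were allocated to scattered VGPRs, e.g.
//
//   IMAGE_SAMPLE ... , v7, v3, v9, ...
//
// may, when liveness permits, have those virtual registers reassigned by this
// pass to a contiguous range such as v3, v4, v5, allowing SIShrinkInstructions
// to later pick the shorter non-NSA encoding that takes a single VGPR tuple.
//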

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  typedef enum {
    NOT_NSA,        // Not an NSA instruction
    FIXED,          // NSA which we cannot modify
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;

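// Attempt to place every interval in Intervals into the contiguous VGPR range
// starting at StartReg, in order. Existing assignments are dropped first; if
// any interval interferes with its target register the function returns false
// and leaves the intervals unassigned, relying on the caller to restore or
// retry the allocation.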
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], StartReg + N))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], StartReg + N);

  return true;
}

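// Return true if the NumRegs consecutive registers starting at StartReg are
// all allocatable and none of them overlaps a callee-saved register that is
// not already in use (a previously untouched CSR would have to be spilled).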
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

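// Search the VGPR file for a contiguous block large enough to hold all of the
// address intervals, trying each candidate start register in turn.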
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

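// Classify MI: not an NSA instruction, an NSA instruction we must leave alone,
// or an NSA instruction whose address registers are (or are not) already
// contiguous. When Fast is set, only the current physical assignment is
// inspected and the more expensive legality checks are skipped.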
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated with
      // much lower chances of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains the
      // different parts of an address, and it is either already consecutive
      // or cannot be reassigned if not. If needed, it is better to rely on
      // the register coalescer to process such address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

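// Collect all NSA instructions in the function, then try to move the address
// registers of the non-contiguous ones into consecutive VGPRs, rolling back
// any reallocation that fails or that would break an already contiguous
// candidate.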
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happens to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // Same register used, unable to make sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                      << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                      << " : "
                      << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                      << "]\n");
    Changed = true;
  }

  return Changed;
}