AMDGPUResourceUsageAnalysis.cpp
//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

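// Both knobs are ordinary cl::opt flags, so they can be overridden on the
// command line; an illustrative (not prescriptive) invocation:
//   llc -mtriple=amdgcn-amd-amdhsa -amdgpu-assume-external-call-stack-size=8192 foo.ll
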
61  "Function register usage analysis", true, true)
62 
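// Resolve the callee operand of a call pseudo to a Function. An immediate
// operand (always 0 here) means the target is unknown; a GlobalAlias is
// looked through to the aliased function.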
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

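// Return true if Reg has any use other than as an implicit operand of a FLAT
// instruction, i.e. a use that genuinely requires flat_scratch to be set up.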
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

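// The total SGPR count is the explicitly used SGPRs plus the extra SGPRs the
// subtarget implicitly reserves for VCC, flat scratch and XNACK.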
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
    if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!AssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

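  // Walk the call graph in post order so that callees are analyzed before
  // their callers; a caller can then fold its callees' already-computed
  // resource usage into its own below.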
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
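
// A typical consumer of this pass (e.g. the AMDGPU asm printer) requires it
// and then queries per-function results. A sketch of such a query (see
// AMDGPUResourceUsageAnalysis.h for the declared accessor):
//
//   const auto &RUA = getAnalysis<AMDGPUResourceUsageAnalysis>();
//   const auto &FRI = RUA.getResourceInfo(&MF.getFunction());
//   int32_t TotalSGPRs = FRI.getTotalNumSGPRs(ST);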

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

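  // If the stack had to be realigned, also reserve worst-case space for the
  // realignment padding.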
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

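    // AGPRs only exist on subtargets with MAI instructions, so only scan for
    // them there.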
    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

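  // With calls present, walk every instruction operand and track the highest
  // register index touched in each register file (SGPR, VGPR, AGPR).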
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

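        // Classify any remaining register by its register class: which file
        // it lives in (SGPR, VGPR or AGPR) and how many 32-bit registers it
        // spans.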
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later.
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
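          // Callee counts are totals (highest index + 1), so convert back to
          // highest-index form before merging into the running maxima.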
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}