LLVM 23.0.0git
GCNSchedStrategy.cpp
Go to the documentation of this file.
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNHazardRecognizer.h"
29#include "GCNRegPressure.h"
32#include "llvm/ADT/BitVector.h"
33#include "llvm/ADT/STLExtras.h"
41#include "llvm/MC/LaneBitmask.h"
42#include "llvm/MC/MCSchedule.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(false));
55
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(false));
61
63 "amdgpu-schedule-metric-bias", cl::Hidden,
65 "Sets the bias which adds weight to occupancy vs latency. Set it to "
66 "100 to chase the occupancy only."),
67 cl::init(10));
68
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(false));
75
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(false));
80
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(256));
86
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
103 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
104 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true));
105
106namespace {
107
108struct VGPRThresholdParser : public cl::parser<unsigned> {
109 VGPRThresholdParser(cl::Option &O) : cl::parser<unsigned>(O) {}
110
111 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
112 if (Arg.getAsInteger(0, Value))
113 return O.error("'" + Arg + "' value invalid for uint argument!");
114
115 if (Value > 100)
116 return O.error("'" + Arg + "' value must be in the range [0, 100]!");
117
118 return false;
119 }
120};
121
122} // end anonymous namespace
123
125 "amdgpu-vgpr-threshold-percent", cl::Hidden,
126 cl::desc("Percent of VGPR limits that we should use as RP threshold "
127 "during scheduling. We have two limits relevant to scheduling: "
128 "Critical (avoid decreasing occupancy), Excess (avoid spilling). "
129 "This flag scales both limits back by an equal percent: (0 = use "
130 " default calculation, 1-100 = use percentage), default: 0"),
131 cl::init(0));
132
133const unsigned ScheduleMetrics::ScaleFactor = 100;
134
141
144
145 MF = &DAG->MF;
146
147 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
148
150 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
152 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
153
155 // Set the initial TargetOccupnacy to the maximum occupancy that we can
156 // achieve for this function. This effectively sets a lower bound on the
157 // 'Critical' register limits in the scheduler.
158 // Allow for lower occupancy targets if kernel is wave limited or memory
159 // bound, and using the relaxed occupancy feature.
163 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
164
165 if (!KnownExcessRP) {
166 VGPRCriticalLimit = std::min(
167 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
169 } else {
170 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
171 // returns a reasonably small number for targets with lots of VGPRs, such
172 // as GFX10 and GFX11.
173 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
174 "VGPRCriticalLimit calculation method.\n");
175 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
176 unsigned Granule =
177 AMDGPU::IsaInfo::getVGPRAllocGranule(ST, DynamicVGPRBlockSize);
178 unsigned Addressable =
179 AMDGPU::IsaInfo::getAddressableNumVGPRs(ST, DynamicVGPRBlockSize);
180 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
181 VGPRBudget = std::max(VGPRBudget, Granule);
182 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
183 }
184 // Apply VGPR excess threshold percentage if specified.
185 if (VGPRThresholdPercentOpt > 0) {
186 [[maybe_unused]] unsigned OriginalVGPRExcessLimit = VGPRExcessLimit;
187 [[maybe_unused]] unsigned OriginalVGPRCriticalLimit = VGPRCriticalLimit;
191 LLVM_DEBUG(dbgs() << "Applied VGPR excess threshold "
192 << VGPRThresholdPercentOpt << "%, VGPRExcessLimit: "
193 << OriginalVGPRExcessLimit << " -> " << VGPRExcessLimit
194 << ". VGPRCriticalLimit: " << OriginalVGPRCriticalLimit
195 << " -> " << VGPRCriticalLimit << '\n');
196 } else {
200 }
201
202 // Subtract error margin and bias from register limits and avoid overflow.
205 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
206 << ", VGPRExcessLimit = " << VGPRExcessLimit
207 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
208 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
209}
210
211/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
212/// current register pressure.
213///
214/// This works for the common case, but it has a few exceptions that have been
215/// observed through trial and error:
216/// - Explicit physical register operands
217/// - Subregister definitions
218///
219/// In both of those cases, PressureDiff doesn't represent the actual pressure,
220/// and querying LiveIntervals through the RegPressureTracker is needed to get
221/// an accurate value.
222///
223/// We should eventually only use PressureDiff for maximum performance, but this
224/// already allows 80% of SUs to take the fast path without changing scheduling
225/// at all. Further changes would either change scheduling, or require a lot
226/// more logic to recover an accurate pressure estimate from the PressureDiffs.
227static bool canUsePressureDiffs(const SUnit &SU) {
228 if (!SU.isInstr())
229 return false;
230
231 // Cannot use pressure diffs for subregister defs or with physregs, it's
232 // imprecise in both cases.
233 for (const auto &Op : SU.getInstr()->operands()) {
234 if (!Op.isReg() || Op.isImplicit())
235 continue;
236 if (Op.getReg().isPhysical() ||
237 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
238 return false;
239 }
240 return true;
241}
242
244 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
245 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
247 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
248 // getDownwardPressure() and getUpwardPressure() make temporary changes to
249 // the tracker, so we need to pass those function a non-const copy.
250 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
251 if (!useGCNTrackers()) {
252 AtTop
253 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
254 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
255
256 return;
257 }
258
259 // GCNTrackers
260 Pressure.resize(4, 0);
261 MachineInstr *MI = SU->getInstr();
262 GCNRegPressure NewPressure;
263 if (AtTop) {
264 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
265 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
266 } else {
267 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
268 TempUpwardTracker.recede(*MI);
269 NewPressure = TempUpwardTracker.getPressure();
270 }
271 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
272 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
273 NewPressure.getArchVGPRNum();
274 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
275}
276
278 SUnit *SU) const {
279 // Only implemented for top-down scheduling currently.
280 if (!Zone.isTop() || !SU)
281 return 0;
282
283 MachineInstr *MI = SU->getInstr();
284 unsigned CurrCycle = Zone.getCurrCycle();
285 unsigned Stall = 0;
286
287 // Query SchedModel for resource stalls (unbuffered resources).
288 if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
289 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
290 for (const MCWriteProcResEntry &PE :
291 make_range(SchedModel->getWriteProcResBegin(SC),
292 SchedModel->getWriteProcResEnd(SC))) {
293 unsigned NextAvail =
294 Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
295 PE.AcquireAtCycle)
296 .first;
297 if (NextAvail > CurrCycle)
298 Stall = std::max(Stall, NextAvail - CurrCycle);
299 }
300 }
301
302 // Query HazardRecognizer for sequence-dependent hazard penalties.
303 // AMDGPU currently installs GCNHazardRecognizer for MI scheduling only in
304 // the post-RA configuration without vreg liveness.
305 if (!DAG->hasVRegLiveness() && Zone.HazardRec &&
306 Zone.HazardRec->isEnabled()) {
307 auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
308 Stall = std::max(Stall, HR->getHazardWaitStates(MI));
309 }
310
311 return Stall;
312}
313
315 bool AtTop,
316 const RegPressureTracker &RPTracker,
317 const SIRegisterInfo *SRI,
318 unsigned SGPRPressure,
319 unsigned VGPRPressure, bool IsBottomUp) {
320 Cand.SU = SU;
321 Cand.AtTop = AtTop;
322
323 if (!DAG->isTrackingPressure())
324 return;
325
326 Pressure.clear();
327 MaxPressure.clear();
328
329 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
330 // possible over querying the RegPressureTracker.
331 //
332 // RegPressureTracker will make a lot of LIS queries which are very
333 // expensive, it is considered a slow function in this context.
334 //
335 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
336 // trivial lookup into an array. It is pretty much free.
337 //
338 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
339 // PressureDiffs.
340 if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
341 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
343 } else {
344 // Reserve 4 slots.
345 Pressure.resize(4, 0);
346 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
347 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
348
349 for (const auto &Diff : DAG->getPressureDiff(SU)) {
350 if (!Diff.isValid())
351 continue;
352 // PressureDiffs is always bottom-up so if we're working top-down we need
353 // to invert its sign.
354 Pressure[Diff.getPSet()] +=
355 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
356 }
357
358#ifdef EXPENSIVE_CHECKS
359 std::vector<unsigned> CheckPressure, CheckMaxPressure;
360 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
362 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
363 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
364 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
365 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
366 errs() << "Register Pressure is inaccurate when calculated through "
367 "PressureDiff\n"
368 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
369 << ", expected "
370 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
371 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
372 << ", expected "
373 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
374 report_fatal_error("inaccurate register pressure calculation");
375 }
376#endif
377 }
378
379 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
380 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
381
382 // If two instructions increase the pressure of different register sets
383 // by the same amount, the generic scheduler will prefer to schedule the
384 // instruction that increases the set with the least amount of registers,
385 // which in our case would be SGPRs. This is rarely what we want, so
386 // when we report excess/critical register pressure, we do it either
387 // only for VGPRs or only for SGPRs.
388
389 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
390 const unsigned MaxVGPRPressureInc = 16;
391 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
392 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
393
394 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
395 // to increase the likelihood we don't go over the limits. We should improve
396 // the analysis to look through dependencies to find the path with the least
397 // register pressure.
398
399 // We only need to update the RPDelta for instructions that increase register
400 // pressure. Instructions that decrease or keep reg pressure the same will be
401 // marked as RegExcess in tryCandidate() when they are compared with
402 // instructions that increase the register pressure.
403 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
404 HasHighPressure = true;
405 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
406 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
407 }
408
409 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
410 HasHighPressure = true;
411 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
412 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
413 }
414
415 // Register pressure is considered 'CRITICAL' if it is approaching a value
416 // that would reduce the wave occupancy for the execution unit. When
417 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
418 // has the same cost, so we don't need to prefer one over the other.
419
420 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
421 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
422
423 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
424 HasHighPressure = true;
425 if (SGPRDelta > VGPRDelta) {
426 Cand.RPDelta.CriticalMax =
427 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
428 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
429 } else {
430 Cand.RPDelta.CriticalMax =
431 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
432 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
433 }
434 }
435}
436
438 const TargetSchedModel *SchedModel) {
439 bool HasBufferedModel =
440 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
441 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
442 return Combined <= PendingQueueLimit && HasBufferedModel;
443}
444
446 const TargetSchedModel *SchedModel) {
447 // pickOnlyChoice() releases pending instructions and checks for new hazards.
448 SUnit *OnlyChoice = Zone.pickOnlyChoice();
449 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
450 return OnlyChoice;
451
452 return nullptr;
453}
454
456 const SchedCandidate &Preferred) {
457 LLVM_DEBUG({
458 dbgs() << "Prefer:\t\t";
459 DAG->dumpNode(*Preferred.SU);
460
461 if (Current.SU) {
462 dbgs() << "Not:\t";
463 DAG->dumpNode(*Current.SU);
464 }
465
466 dbgs() << "Reason:\t\t";
467 traceCandidate(Preferred);
468 });
469}
470
471// This function is mostly cut and pasted from
472// GenericScheduler::pickNodeFromQueue()
474 const CandPolicy &ZonePolicy,
475 const RegPressureTracker &RPTracker,
476 SchedCandidate &Cand, bool &IsPending,
477 bool IsBottomUp) {
478 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
480 unsigned SGPRPressure = 0;
481 unsigned VGPRPressure = 0;
482 IsPending = false;
483 if (DAG->isTrackingPressure()) {
484 if (!useGCNTrackers()) {
485 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
486 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
487 } else {
488 GCNRPTracker *T = IsBottomUp
489 ? static_cast<GCNRPTracker *>(&UpwardTracker)
490 : static_cast<GCNRPTracker *>(&DownwardTracker);
491 SGPRPressure = T->getPressure().getSGPRNum();
492 VGPRPressure = T->getPressure().getArchVGPRNum();
493 }
494 }
495 LLVM_DEBUG(dbgs() << "Available Q:\n");
496 ReadyQueue &AQ = Zone.Available;
497 for (SUnit *SU : AQ) {
498
499 SchedCandidate TryCand(ZonePolicy);
500 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
501 VGPRPressure, IsBottomUp);
502 // Pass SchedBoundary only when comparing nodes from the same boundary.
503 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
504 tryCandidate(Cand, TryCand, ZoneArg);
505 if (TryCand.Reason != NoCand) {
506 // Initialize resource delta if needed in case future heuristics query it.
507 if (TryCand.ResDelta == SchedResourceDelta())
508 TryCand.initResourceDelta(Zone.DAG, SchedModel);
509 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
510 Cand.setBest(TryCand);
511 } else {
512 printCandidateDecision(TryCand, Cand);
513 }
514 }
515
516 if (!shouldCheckPending(Zone, SchedModel))
517 return;
518
519 LLVM_DEBUG(dbgs() << "Pending Q:\n");
520 ReadyQueue &PQ = Zone.Pending;
521 for (SUnit *SU : PQ) {
522
523 SchedCandidate TryCand(ZonePolicy);
524 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
525 VGPRPressure, IsBottomUp);
526 // Pass SchedBoundary only when comparing nodes from the same boundary.
527 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
528 tryPendingCandidate(Cand, TryCand, ZoneArg);
529 if (TryCand.Reason != NoCand) {
530 // Initialize resource delta if needed in case future heuristics query it.
531 if (TryCand.ResDelta == SchedResourceDelta())
532 TryCand.initResourceDelta(Zone.DAG, SchedModel);
533 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
534 IsPending = true;
535 Cand.setBest(TryCand);
536 } else {
537 printCandidateDecision(TryCand, Cand);
538 }
539 }
540}
541
542// This function is mostly cut and pasted from
543// GenericScheduler::pickNodeBidirectional()
545 bool &PickedPending) {
546 // Schedule as far as possible in the direction of no choice. This is most
547 // efficient, but also provides the best heuristics for CriticalPSets.
548 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
549 IsTopNode = false;
550 return SU;
551 }
552 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
553 IsTopNode = true;
554 return SU;
555 }
556 // Set the bottom-up policy based on the state of the current bottom zone
557 // and the instructions outside the zone, including the top zone.
558 CandPolicy BotPolicy;
559 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
560 // Set the top-down policy based on the state of the current top zone and
561 // the instructions outside the zone, including the bottom zone.
562 CandPolicy TopPolicy;
563 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
564
565 bool BotPending = false;
566 // See if BotCand is still valid (because we previously scheduled from Top).
567 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
568 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
569 BotCand.Policy != BotPolicy) {
570 BotCand.reset(CandPolicy());
571 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
572 BotPending,
573 /*IsBottomUp=*/true);
574 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
575 } else {
577#ifndef NDEBUG
578 if (VerifyScheduling) {
579 SchedCandidate TCand;
580 TCand.reset(CandPolicy());
581 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
582 BotPending,
583 /*IsBottomUp=*/true);
584 assert(TCand.SU == BotCand.SU &&
585 "Last pick result should correspond to re-picking right now");
586 }
587#endif
588 }
589
590 bool TopPending = false;
591 // Check if the top Q has a better candidate.
592 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
593 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
594 TopCand.Policy != TopPolicy) {
595 TopCand.reset(CandPolicy());
596 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
597 TopPending,
598 /*IsBottomUp=*/false);
599 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
600 } else {
602#ifndef NDEBUG
603 if (VerifyScheduling) {
604 SchedCandidate TCand;
605 TCand.reset(CandPolicy());
606 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
607 TopPending,
608 /*IsBottomUp=*/false);
609 assert(TCand.SU == TopCand.SU &&
610 "Last pick result should correspond to re-picking right now");
611 }
612#endif
613 }
614
615 // Pick best from BotCand and TopCand.
616 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
617 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
618 SchedCandidate Cand = BotPending ? TopCand : BotCand;
619 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
620 PickedPending = BotPending && TopPending;
621
622 TryCand.Reason = NoCand;
623 if (BotPending || TopPending) {
624 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
625 } else {
626 tryCandidate(Cand, TryCand, nullptr);
627 }
628
629 if (TryCand.Reason != NoCand) {
630 Cand.setBest(TryCand);
631 }
632
633 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
634
635 IsTopNode = Cand.AtTop;
636 return Cand.SU;
637}
638
639// This function is mostly cut and pasted from
640// GenericScheduler::pickNode()
642 if (DAG->top() == DAG->bottom()) {
643 assert(Top.Available.empty() && Top.Pending.empty() &&
644 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
645 return nullptr;
646 }
647 bool PickedPending;
648 SUnit *SU;
649 do {
650 PickedPending = false;
651 if (RegionPolicy.OnlyTopDown) {
653 if (!SU) {
654 CandPolicy NoPolicy;
655 TopCand.reset(NoPolicy);
656 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
657 PickedPending,
658 /*IsBottomUp=*/false);
659 assert(TopCand.Reason != NoCand && "failed to find a candidate");
660 SU = TopCand.SU;
661 }
662 IsTopNode = true;
663 } else if (RegionPolicy.OnlyBottomUp) {
665 if (!SU) {
666 CandPolicy NoPolicy;
667 BotCand.reset(NoPolicy);
668 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
669 PickedPending,
670 /*IsBottomUp=*/true);
671 assert(BotCand.Reason != NoCand && "failed to find a candidate");
672 SU = BotCand.SU;
673 }
674 IsTopNode = false;
675 } else {
676 SU = pickNodeBidirectional(IsTopNode, PickedPending);
677 }
678 } while (SU->isScheduled);
679
680 if (PickedPending) {
681 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
682 SchedBoundary &Zone = IsTopNode ? Top : Bot;
683 unsigned CurrentCycle = Zone.getCurrCycle();
684 if (ReadyCycle > CurrentCycle)
685 Zone.bumpCycle(ReadyCycle);
686
687 // FIXME: checkHazard() doesn't give information about which cycle the
688 // hazard will resolve so just keep bumping the cycle by 1. This could be
689 // made more efficient if checkHazard() returned more details.
690 while (Zone.checkHazard(SU))
691 Zone.bumpCycle(Zone.getCurrCycle() + 1);
692
693 Zone.releasePending();
694 }
695
696 if (SU->isTopReady())
697 Top.removeReady(SU);
698 if (SU->isBottomReady())
699 Bot.removeReady(SU);
700
701 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
702 << *SU->getInstr());
703 return SU;
704}
705
706void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
707 if (useGCNTrackers()) {
708 MachineInstr *MI = SU->getInstr();
709 IsTopNode ? (void)DownwardTracker.advance(MI, false)
710 : UpwardTracker.recede(*MI);
711 }
712
713 return GenericScheduler::schedNode(SU, IsTopNode);
714}
715
720
723 if (!CurrentStage)
724 CurrentStage = SchedStages.begin();
725 else
726 CurrentStage++;
727
728 return CurrentStage != SchedStages.end();
729}
730
733 return std::next(CurrentStage) != SchedStages.end();
734}
735
737 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
738 return *std::next(CurrentStage);
739}
740
742 SchedCandidate &TryCand,
743 SchedBoundary *Zone) const {
744 // Initialize the candidate if needed.
745 if (!Cand.isValid()) {
746 TryCand.Reason = NodeOrder;
747 return true;
748 }
749
750 // Bias PhysReg Defs and copies to their uses and defined respectively.
751 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
752 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
753 return TryCand.Reason != NoCand;
754
755 // Avoid exceeding the target's limit.
756 if (DAG->isTrackingPressure() &&
757 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
758 RegExcess, TRI, DAG->MF))
759 return TryCand.Reason != NoCand;
760
761 // Avoid increasing the max critical pressure in the scheduled region.
762 if (DAG->isTrackingPressure() &&
764 TryCand, Cand, RegCritical, TRI, DAG->MF))
765 return TryCand.Reason != NoCand;
766
767 bool SameBoundary = Zone != nullptr;
768 if (SameBoundary) {
771 TryCand, Cand, ResourceReduce))
772 return TryCand.Reason != NoCand;
774 Cand.ResDelta.DemandedResources, TryCand, Cand,
776 return TryCand.Reason != NoCand;
777 }
778
779 return false;
780}
781
794
799
801 SchedCandidate &TryCand,
802 SchedBoundary *Zone) const {
803 // Initialize the candidate if needed.
804 if (!Cand.isValid()) {
805 TryCand.Reason = NodeOrder;
806 return true;
807 }
808
809 // Avoid spilling by exceeding the register limit.
810 if (DAG->isTrackingPressure() &&
811 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
812 RegExcess, TRI, DAG->MF))
813 return TryCand.Reason != NoCand;
814
815 // Bias PhysReg Defs and copies to their uses and defined respectively.
816 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
817 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
818 return TryCand.Reason != NoCand;
819
820 bool SameBoundary = Zone != nullptr;
821 if (SameBoundary) {
822 // Prioritize instructions that read unbuffered resources by stall cycles.
823 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
824 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
825 return TryCand.Reason != NoCand;
826
827 // Avoid critical resource consumption and balance the schedule.
830 TryCand, Cand, ResourceReduce))
831 return TryCand.Reason != NoCand;
833 Cand.ResDelta.DemandedResources, TryCand, Cand,
835 return TryCand.Reason != NoCand;
836
837 // Unconditionally try to reduce latency.
838 if (tryLatency(TryCand, Cand, *Zone))
839 return TryCand.Reason != NoCand;
840
841 // Weak edges are for clustering and other constraints.
842 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
843 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
844 return TryCand.Reason != NoCand;
845 }
846
847 // Keep clustered nodes together to encourage downstream peephole
848 // optimizations which may reduce resource requirements.
849 //
850 // This is a best effort to set things up for a post-RA pass. Optimizations
851 // like generating loads of multiple registers should ideally be done within
852 // the scheduler pass by combining the loads during DAG postprocessing.
853 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
854 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
855 bool CandIsClusterSucc =
856 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
857 bool TryCandIsClusterSucc =
858 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
859 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
860 Cluster))
861 return TryCand.Reason != NoCand;
862
863 // Avoid increasing the max critical pressure in the scheduled region.
864 if (DAG->isTrackingPressure() &&
866 TryCand, Cand, RegCritical, TRI, DAG->MF))
867 return TryCand.Reason != NoCand;
868
869 // Avoid increasing the max pressure of the entire region.
870 if (DAG->isTrackingPressure() &&
871 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
872 Cand, RegMax, TRI, DAG->MF))
873 return TryCand.Reason != NoCand;
874
875 if (SameBoundary) {
876 // Fall through to original instruction order.
877 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
878 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
879 TryCand.Reason = NodeOrder;
880 return true;
881 }
882 }
883 return false;
884}
885
891
892/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
893/// much as possible. This is achieved by:
894// 1. Prioritize clustered operations before stall latency heuristic.
895// 2. Prioritize long-latency-load before stall latency heuristic.
896///
897/// \param Cand provides the policy and current best candidate.
898/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
899/// \param Zone describes the scheduled zone that we are extending, or nullptr
900/// if Cand is from a different zone than TryCand.
901/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
903 SchedCandidate &TryCand,
904 SchedBoundary *Zone) const {
905 // Initialize the candidate if needed.
906 if (!Cand.isValid()) {
907 TryCand.Reason = NodeOrder;
908 return true;
909 }
910
911 // Bias PhysReg Defs and copies to their uses and defined respectively.
912 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
913 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
914 return TryCand.Reason != NoCand;
915
916 if (DAG->isTrackingPressure()) {
917 // Avoid exceeding the target's limit.
918 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
919 RegExcess, TRI, DAG->MF))
920 return TryCand.Reason != NoCand;
921
922 // Avoid increasing the max critical pressure in the scheduled region.
924 TryCand, Cand, RegCritical, TRI, DAG->MF))
925 return TryCand.Reason != NoCand;
926 }
927
928 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
929 // get more benefit from clausing these memory instructions.
930 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
931 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
932 bool CandIsClusterSucc =
933 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
934 bool TryCandIsClusterSucc =
935 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
936 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
937 Cluster))
938 return TryCand.Reason != NoCand;
939
940 // We only compare a subset of features when comparing nodes between
941 // Top and Bottom boundary. Some properties are simply incomparable, in many
942 // other instances we should only override the other boundary if something
943 // is a clear good pick on one boundary. Skip heuristics that are more
944 // "tie-breaking" in nature.
945 bool SameBoundary = Zone != nullptr;
946 if (SameBoundary) {
947 // For loops that are acyclic path limited, aggressively schedule for
948 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
949 // heuristics to take precedence.
950 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
951 tryLatency(TryCand, Cand, *Zone))
952 return TryCand.Reason != NoCand;
953
954 // MaxMemoryClause-specific: Prioritize long latency memory load
955 // instructions in top-bottom order to hide more latency. The mayLoad check
956 // is used to exclude store-like instructions, which we do not want to
957 // scheduler them too early.
958 bool TryMayLoad =
959 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
960 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
961
962 if (TryMayLoad || CandMayLoad) {
963 bool TryLongLatency =
964 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
965 bool CandLongLatency =
966 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
967
968 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
969 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
970 Cand, Stall))
971 return TryCand.Reason != NoCand;
972 }
973 // Prioritize instructions that read unbuffered resources by stall cycles.
974 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
975 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
976 return TryCand.Reason != NoCand;
977 }
978
979 if (SameBoundary) {
980 // Weak edges are for clustering and other constraints.
981 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
982 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
983 return TryCand.Reason != NoCand;
984 }
985
986 // Avoid increasing the max pressure of the entire region.
987 if (DAG->isTrackingPressure() &&
988 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
989 Cand, RegMax, TRI, DAG->MF))
990 return TryCand.Reason != NoCand;
991
992 if (SameBoundary) {
993 // Avoid critical resource consumption and balance the schedule.
996 TryCand, Cand, ResourceReduce))
997 return TryCand.Reason != NoCand;
999 Cand.ResDelta.DemandedResources, TryCand, Cand,
1001 return TryCand.Reason != NoCand;
1002
1003 // Avoid serializing long latency dependence chains.
1004 // For acyclic path limited loops, latency was already checked above.
1005 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
1006 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
1007 return TryCand.Reason != NoCand;
1008
1009 // Fall through to original instruction order.
1010 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
1011 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
1012 TryCand.Reason = NodeOrder;
1013 return true;
1014 }
1015 }
1016
1017 return false;
1018}
1019
1021 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
1022 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
1023 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
1024 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
1025 RegionLiveOuts(this, /*IsLiveOut=*/true) {
1026
1027 // We want regions with a single MI to be scheduled so that we can reason
1028 // about them correctly during scheduling stages that move MIs between regions
1029 // (e.g., rematerialization).
1031 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
1032 if (RelaxedOcc) {
1033 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
1034 if (MinOccupancy != StartingOccupancy)
1035 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
1036 << ".\n");
1037 }
1038}
1039
1040std::unique_ptr<GCNSchedStage>
1041GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
1042 switch (SchedStageID) {
1044 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
1046 return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
1048 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
1050 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
1052 return std::make_unique<PreRARematStage>(SchedStageID, *this);
1054 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
1056 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
1057 *this);
1058 }
1059
1060 llvm_unreachable("Unknown SchedStageID.");
1061}
1062
1064 // Collect all scheduling regions. The actual scheduling is performed in
1065 // GCNScheduleDAGMILive::finalizeSchedule.
1066 Regions.push_back(std::pair(RegionBegin, RegionEnd));
1067}
1068
1070GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
1071 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
1072 return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
1074 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
1075 &LiveIns[RegionIdx]);
1076 return RPTracker.moveMaxPressure();
1077}
1078
1080 MachineBasicBlock::iterator RegionEnd) {
1081 assert(RegionBegin != RegionEnd && "Region must not be empty");
1082 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
1083}
1084
1085void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1086 const MachineBasicBlock *MBB) {
1087 GCNDownwardRPTracker RPTracker(*LIS);
1088
1089 // If the block has the only successor then live-ins of that successor are
1090 // live-outs of the current block. We can reuse calculated live set if the
1091 // successor will be sent to scheduling past current block.
1092
1093 // However, due to the bug in LiveInterval analysis it may happen that two
1094 // predecessors of the same successor block have different lane bitmasks for
1095 // a live-out register. Workaround that by sticking to one-to-one relationship
1096 // i.e. one predecessor with one successor block.
1097 const MachineBasicBlock *OnlySucc = nullptr;
1098 if (MBB->succ_size() == 1) {
1099 auto *Candidate = *MBB->succ_begin();
1100 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1101 SlotIndexes *Ind = LIS->getSlotIndexes();
1102 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1103 OnlySucc = Candidate;
1104 }
1105 }
1106
1107 // Scheduler sends regions from the end of the block upwards.
1108 size_t CurRegion = RegionIdx;
1109 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1110 if (Regions[CurRegion].first->getParent() != MBB)
1111 break;
1112 --CurRegion;
1113
1114 auto I = MBB->begin();
1115 auto LiveInIt = MBBLiveIns.find(MBB);
1116 auto &Rgn = Regions[CurRegion];
1117 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1118 if (LiveInIt != MBBLiveIns.end()) {
1119 auto LiveIn = std::move(LiveInIt->second);
1120 RPTracker.reset(*MBB->begin(), MBB->end(), &LiveIn);
1121 MBBLiveIns.erase(LiveInIt);
1122 } else {
1123 I = Rgn.first;
1124 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1125#ifdef EXPENSIVE_CHECKS
1126 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1127#endif
1128 RPTracker.reset(*I, I->getParent()->end(), &LRS);
1129 }
1130
1131 for (;;) {
1132 I = RPTracker.getNext();
1133
1134 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1135 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1136 RPTracker.clearMaxPressure();
1137 }
1138
1139 if (Regions[CurRegion].second == I) {
1140 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1141 if (CurRegion-- == RegionIdx)
1142 break;
1143 auto &Rgn = Regions[CurRegion];
1144 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1145 }
1146 RPTracker.advanceBeforeNext();
1147 RPTracker.advanceToNext();
1148 }
1149
1150 if (OnlySucc) {
1151 if (I != MBB->end()) {
1152 RPTracker.advanceBeforeNext();
1153 RPTracker.advanceToNext();
1154 RPTracker.advance(MBB->end());
1155 }
1156 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1157 }
1158}
1159
1161GCNScheduleDAGMILive::getRegionLiveInMap() const {
1162 assert(!Regions.empty());
1163 std::vector<MachineInstr *> RegionFirstMIs;
1164 RegionFirstMIs.reserve(Regions.size());
1165 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1166 RegionFirstMIs.push_back(
1168
1169 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1170}
1171
1173GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1174 assert(!Regions.empty());
1175 std::vector<MachineInstr *> RegionLastMIs;
1176 RegionLastMIs.reserve(Regions.size());
1177 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1178 // Skip empty regions.
1179 if (RegionBegin == RegionEnd)
1180 continue;
1181 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1182 }
1183 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1184}
1185
1187 IdxToInstruction.clear();
1188
1189 RegionLiveRegMap =
1190 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1191 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1192 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1193 // Skip empty regions.
1194 if (RegionBegin == RegionEnd)
1195 continue;
1196 MachineInstr *RegionKey =
1197 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1198 IdxToInstruction[I] = RegionKey;
1199 }
1200}
1201
1203 // Start actual scheduling here. This function is called by the base
1204 // MachineScheduler after all regions have been recorded by
1205 // GCNScheduleDAGMILive::schedule().
1206 LiveIns.resize(Regions.size());
1207 Pressure.resize(Regions.size());
1208 RegionsWithHighRP.resize(Regions.size());
1209 RegionsWithExcessRP.resize(Regions.size());
1210 RegionsWithIGLPInstrs.resize(Regions.size());
1211 RegionsWithHighRP.reset();
1212 RegionsWithExcessRP.reset();
1213 RegionsWithIGLPInstrs.reset();
1214
1215 runSchedStages();
1216}
1217
1218void GCNScheduleDAGMILive::runSchedStages() {
1219 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1220
1221 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1222 if (!Regions.empty()) {
1223 BBLiveInMap = getRegionLiveInMap();
1224 if (S.useGCNTrackers())
1225 RegionLiveOuts.buildLiveRegMap();
1226 }
1227
1228#ifdef DUMP_MAX_REG_PRESSURE
1232 LIS->dump();
1233 }
1234#endif
1235
1236 while (S.advanceStage()) {
1237 auto Stage = createSchedStage(S.getCurrentStage());
1238 if (!Stage->initGCNSchedStage())
1239 continue;
1240
1241 for (auto Region : Regions) {
1242 RegionBegin = Region.first;
1243 RegionEnd = Region.second;
1244 // Setup for scheduling the region and check whether it should be skipped.
1245 if (!Stage->initGCNRegion()) {
1246 Stage->advanceRegion();
1247 exitRegion();
1248 continue;
1249 }
1250
1251 if (S.useGCNTrackers()) {
1252 const unsigned RegionIdx = Stage->getRegionIdx();
1253 S.getDownwardTracker()->reset(MRI, LiveIns[RegionIdx]);
1255 MRI, RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx));
1256 }
1257
1259 Stage->finalizeGCNRegion();
1260 Stage->advanceRegion();
1261 exitRegion();
1262 }
1263
1264 Stage->finalizeGCNSchedStage();
1265 }
1266
1267#ifdef DUMP_MAX_REG_PRESSURE
1271 LIS->dump();
1272 }
1273#endif
1274}
1275
1276#ifndef NDEBUG
1278 switch (StageID) {
1280 OS << "Max Occupancy Initial Schedule";
1281 break;
1283 OS << "Instruction Rewriting Reschedule";
1284 break;
1286 OS << "Unclustered High Register Pressure Reschedule";
1287 break;
1289 OS << "Clustered Low Occupancy Reschedule";
1290 break;
1292 OS << "Pre-RA Rematerialize";
1293 break;
1295 OS << "Max ILP Initial Schedule";
1296 break;
1298 OS << "Max memory clause Initial Schedule";
1299 break;
1300 }
1301
1302 return OS;
1303}
1304#endif
1305
1309
1311 if (!DAG.LIS)
1312 return false;
1313
1314 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1315 return true;
1316}
1317
1318void RewriteMFMAFormStage::findReachingDefs(
1319 MachineOperand &UseMO, LiveIntervals *LIS,
1320 SmallVectorImpl<SlotIndex> &DefIdxs) {
1321 MachineInstr *UseMI = UseMO.getParent();
1322 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1323 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1324
1325 // If the def is not a PHI, then it must be the only reaching def.
1326 if (!VNI->isPHIDef()) {
1327 DefIdxs.push_back(VNI->def);
1328 return;
1329 }
1330
1331 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1333
1334 // Mark the predecessor blocks for traversal
1335 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1336 Worklist.push_back(PredMBB);
1337 Visited.insert(PredMBB);
1338 }
1339
1340 while (!Worklist.empty()) {
1341 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1342
1343 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1344 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1345
1346 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1347
1348 // If there is a def in this block, then add it to the list. This is the
1349 // reaching def of this path.
1350 if (!VNI->isPHIDef()) {
1351 DefIdxs.push_back(VNI->def);
1352 continue;
1353 }
1354
1355 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1356 if (Visited.insert(PredMBB).second)
1357 Worklist.push_back(PredMBB);
1358 }
1359 }
1360}
1361
1362void RewriteMFMAFormStage::findReachingUses(
1363 const MachineInstr *DefMI, LiveIntervals *LIS,
1364 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1365 SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
1366 for (MachineOperand &UseMO :
1367 DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
1368 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1369 findReachingDefs(UseMO, LIS, ReachingDefIndexes);
1370
1371 // If we find a use that contains this DefMI in its reachingDefs, then it is
1372 // a reaching use.
1373 if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
1374 return SlotIndex::isSameInstr(RDIdx, DefIdx);
1375 }))
1376 ReachingUses.push_back(&UseMO);
1377 }
1378}
1379
1381 // We only need to run this pass if the architecture supports AGPRs.
1382 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1383 // is no need for this pass in that case, either.
1384 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1385 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1386 return false;
1387
1388 RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
1389 RegionsWithExcessArchVGPR.reset();
1390 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1392 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1393 RegionsWithExcessArchVGPR[Region] = true;
1394 }
1395
1396 if (RegionsWithExcessArchVGPR.none())
1397 return false;
1398
1399 TII = ST.getInstrInfo();
1400 SRI = ST.getRegisterInfo();
1401
1402 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1405
1406 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1407 return false;
1408
1409 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1410
1411 // If we haven't found the beneficial conditions, prefer the VGPR form which
1412 // may result in less cross RC copies.
1413 if (Cost > 0)
1414 return false;
1415
1416 return rewrite(RewriteCands);
1417}
1418
1421 return false;
1422
1424 return false;
1425
1426 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1427 return false;
1428
1429 SavedMutations.swap(DAG.Mutations);
1430 DAG.addMutation(
1432
1433 InitialOccupancy = DAG.MinOccupancy;
1434 // Aggressively try to reduce register pressure in the unclustered high RP
1435 // stage. Temporarily increase occupancy target in the region.
1436 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1437 ? InitialOccupancy + 1
1438 : InitialOccupancy;
1439 IsAnyRegionScheduled = false;
1440 S.SGPRLimitBias = S.HighRPSGPRBias;
1441 S.VGPRLimitBias = S.HighRPVGPRBias;
1442
1443 LLVM_DEBUG(
1444 dbgs()
1445 << "Retrying function scheduling without clustering. "
1446 "Aggressively try to reduce register pressure to achieve occupancy "
1447 << TempTargetOccupancy << ".\n");
1448
1449 return true;
1450}
1451
1454 return false;
1455
1457 return false;
1458
1459 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1460 // been dropped. All regions will have already been scheduled with the ideal
1461 // occupancy targets.
1462 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1463 return false;
1464
1465 LLVM_DEBUG(
1466 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1467 << DAG.MinOccupancy << ".\n");
1468 return true;
1469}
1470
1471/// Allows to easily filter for this stage's debug output.
1472#define REMAT_PREFIX "[PreRARemat] "
1473#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1474
1475#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1476Printable PreRARematStage::ScoredRemat::print() const {
1477 return Printable([&](raw_ostream &OS) {
1478 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1479 });
1480}
1481#endif
1482
1484 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1485 // regions inbetween the defs and region we sinked the def to. Will need to be
1486 // fixed if there is another pass after this pass.
1487 assert(!S.hasNextStage());
1488
1489 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1490 return false;
1491
1492#ifndef NDEBUG
1493 auto PrintTargetRegions = [&]() -> void {
1494 if (TargetRegions.none()) {
1495 dbgs() << REMAT_PREFIX << "No target regions\n";
1496 return;
1497 }
1498 dbgs() << REMAT_PREFIX << "Target regions:\n";
1499 for (unsigned I : TargetRegions.set_bits())
1500 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1501 };
1502#endif
1503
1504 // Set an objective for the stage based on current RP in each region.
1505 REMAT_DEBUG({
1506 dbgs() << "Analyzing ";
1507 MF.getFunction().printAsOperand(dbgs(), false);
1508 dbgs() << ": ";
1509 });
1510 if (!setObjective()) {
1511 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1512 << MFI.getMaxWavesPerEU() << '\n');
1513 return false;
1514 }
1515 LLVM_DEBUG({
1516 if (TargetOcc) {
1517 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1518 } else {
1519 dbgs() << "reduce spilling (minimum target occupancy is "
1520 << MFI.getMinWavesPerEU() << ")\n";
1521 }
1522 PrintTargetRegions();
1523 });
1524
1525 // We need up-to-date live-out info. to query live-out register masks in
1526 // regions containing rematerializable instructions.
1527 DAG.RegionLiveOuts.buildLiveRegMap();
1528
1529 if (!Remater.analyze()) {
1530 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1531 return false;
1532 }
1533 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1534
1535 // Set of registers already marked for potential remterialization; used to
1536 // avoid rematerialization chains.
1537 SmallSet<Register, 4> MarkedRegs;
1538
1539 // Collect candidates. We have more restrictions on what we can track here
1540 // compared to the rematerializer.
1541 SmallVector<ScoredRemat, 8> Candidates;
1542 SmallVector<unsigned> CandidateOrder;
1543 for (unsigned RegIdx = 0, E = Remater.getNumRegs(); RegIdx < E; ++RegIdx) {
1544 const Rematerializer::Reg &CandReg = Remater.getReg(RegIdx);
1545
1546 // Single user only.
1547 unsigned NumUsers = 0;
1548 for (const auto &[_, RegionUses] : CandReg.Uses)
1549 NumUsers += RegionUses.size();
1550 if (NumUsers != 1)
1551 continue;
1552
1553 // We further filter the registers that we can rematerialize based on our
1554 // current tracking capabilities in the stage. The user cannot itself be
1555 // marked rematerializable, and no register operand of the defining MI can
1556 // be marked rematerializable.
1557 MachineInstr *UseMI = *CandReg.Uses.begin()->getSecond().begin();
1558 const MachineOperand &UseMO = UseMI->getOperand(0);
1559 if (UseMO.isReg() && MarkedRegs.contains(UseMO.getReg()))
1560 continue;
1561 if (llvm::any_of(CandReg.DefMI->all_uses(),
1562 [&MarkedRegs](const MachineOperand &MO) {
1563 return MarkedRegs.contains(MO.getReg());
1564 }))
1565 continue;
1566
1567 // Do not rematerialize an instruction if it uses registers that aren't
1568 // available at its use. This ensures that we are not extending any live
1569 // range while rematerializing.
1570 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1571 if (!VirtRegAuxInfo::allUsesAvailableAt(CandReg.DefMI, UseIdx, *DAG.LIS,
1572 DAG.MRI, *DAG.TII))
1573 continue;
1574
1575 MarkedRegs.insert(CandReg.getDefReg());
1576 ScoredRemat &Cand = Candidates.emplace_back();
1577 Cand.init(RegIdx, FreqInfo, Remater, DAG);
1578 Cand.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
1579 if (!Cand.hasNullScore())
1580 CandidateOrder.push_back(Candidates.size() - 1);
1581 }
1582
1583 if (TargetOcc) {
1584 // Every rematerialization we do here is likely to move the instruction
1585 // into a higher frequency region, increasing the total sum latency of the
1586 // instruction itself. This is acceptable if we are eliminating a spill in
1587 // the process, but when the goal is increasing occupancy we get nothing
1588 // out of rematerialization if occupancy is not increased in the end; in
1589 // such cases we want to roll back the rematerialization.
1590 Rollback = std::make_unique<RollbackSupport>(Remater);
1591 }
1592
1593 // Rematerialize registers in successive rounds until all RP targets are
1594 // satisifed or until we run out of rematerialization candidates.
1595 BitVector RecomputeRP(DAG.Regions.size());
1596 for (;;) {
1597 RecomputeRP.reset();
1598
1599 // Sort candidates in increasing score order.
1600 sort(CandidateOrder, [&](unsigned LHSIndex, unsigned RHSIndex) {
1601 return Candidates[LHSIndex] < Candidates[RHSIndex];
1602 });
1603
1604 REMAT_DEBUG({
1605 dbgs() << "==== NEW REMAT ROUND ====\n"
1606 << REMAT_PREFIX
1607 << "Candidates with non-null score, in rematerialization order:\n";
1608 for (const ScoredRemat &Cand : reverse(Candidates)) {
1609 dbgs() << REMAT_PREFIX << " " << Cand.print() << " | "
1610 << Remater.printRematReg(Cand.RegIdx) << '\n';
1611 }
1612 PrintTargetRegions();
1613 });
1614
1615 // Rematerialize registers in decreasing score order until we estimate
1616 // that all RP targets are satisfied or until rematerialization candidates
1617 // are no longer useful to decrease RP.
1618 while (!CandidateOrder.empty()) {
1619 const ScoredRemat &Cand = Candidates[CandidateOrder.back()];
1620 const Rematerializer::Reg &Reg = Remater.getReg(Cand.RegIdx);
1621
1622 // When previous rematerializations in this round have already satisfied
1623 // RP targets in all regions this rematerialization can impact, we have a
1624 // good indication that our scores have diverged significantly from
1625 // reality, in which case we interrupt this round and re-score. This also
1626 // ensures that every rematerialization we perform is possibly impactful
1627 // in at least one target region.
1628 if (!Cand.maybeBeneficial(TargetRegions, RPTargets)) {
1629 REMAT_DEBUG(dbgs() << "Interrupt round on stale score for "
1630 << Cand.print() << " | "
1631 << Remater.printRematReg(Cand.RegIdx));
1632 break;
1633 }
1634 CandidateOrder.pop_back();
1635
1636#ifdef EXPENSIVE_CHECKS
1637 // All uses are known to be available / live at the remat point. Thus,
1638 // the uses should already be live in to the using region.
1639 for (MachineOperand &MO : Reg.DefMI->operands()) {
1640 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1641 continue;
1642
1643 Register UseReg = MO.getReg();
1644 if (!UseReg.isVirtual())
1645 continue;
1646
1647 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1648 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1649 if (LI.hasSubRanges() && MO.getSubReg())
1650 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1651
1652 const unsigned UseRegion = Reg.Uses.begin()->first;
1653 LaneBitmask LiveInMask = DAG.LiveIns[UseRegion].at(UseReg);
1654 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1655 // If this register has lanes not covered by the LiveIns, be sure they
1656 // do not map to any subrange. ref:
1657 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1658 if (UncoveredLanes.any()) {
1659 assert(LI.hasSubRanges());
1660 for (LiveInterval::SubRange &SR : LI.subranges())
1661 assert((SR.LaneMask & UncoveredLanes).none());
1662 }
1663 }
1664#endif
1665
1666 // Remove the register from all regions where it is a live-in or live-out,
1667 // then rematerialize the register.
1668 REMAT_DEBUG(dbgs() << "** REMAT " << Remater.printRematReg(Cand.RegIdx)
1669 << '\n');
1670 removeFromLiveMaps(Reg.getDefReg(), Cand.LiveIn, Cand.LiveOut);
1671 if (Rollback) {
1672 Rollback->LiveMapUpdates.emplace_back(Cand.RegIdx, Cand.LiveIn,
1673 Cand.LiveOut);
1674 }
1675 Cand.rematerialize(Remater);
1676
1677 // Adjust RP targets. The save is guaranteed in regions in which the
1678 // register is live-through and unused but optimistic in all other regions
1679 // where the register is live.
1680 updateRPTargets(Cand.Live, Cand.RPSave);
1681 RecomputeRP |= Cand.UnpredictableRPSave;
1682 RescheduleRegions |= Cand.Live;
1683 if (!TargetRegions.any()) {
1684 REMAT_DEBUG(dbgs() << "All targets cleared, verifying...\n");
1685 break;
1686 }
1687 }
1688
1689 if (!updateAndVerifyRPTargets(RecomputeRP) && !TargetRegions.any()) {
1690 REMAT_DEBUG(dbgs() << "Objectives achieved!\n");
1691 break;
1692 }
1693
1694 // Update the score of remaining candidates and filter out those that have
1695 // become useless from the vector. Candidates never become useful after
1696 // having been useless for a round, so we can freely drop them without
1697 // losing any future rematerialization opportunity.
1698 unsigned NumUsefulCandidates = 0;
1699 for (unsigned CandIdx : CandidateOrder) {
1700 ScoredRemat &Candidate = Candidates[CandIdx];
1701 Candidate.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
1702 if (!Candidate.hasNullScore())
1703 CandidateOrder[NumUsefulCandidates++] = CandIdx;
1704 }
1705 if (NumUsefulCandidates == 0) {
1706 REMAT_DEBUG(dbgs() << "Stop on exhausted rematerialization candidates\n");
1707 break;
1708 }
1709 CandidateOrder.truncate(NumUsefulCandidates);
1710 }
1711
1712 if (RescheduleRegions.none())
1713 return false;
1714
1715 // Commit all pressure changes to the DAG and compute minimum achieved
1716 // occupancy in impacted regions.
1717 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1718 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1719 for (unsigned I : RescheduleRegions.set_bits()) {
1720 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1721 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1722 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1723 << " (" << RPTargets[I] << ")\n");
1724 }
1725 AchievedOcc = MFI.getMaxWavesPerEU();
1726 for (const GCNRegPressure &RP : DAG.Pressure) {
1727 AchievedOcc =
1728 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
1729 }
1730
1731 REMAT_DEBUG({
1732 dbgs() << "Retrying function scheduling with new min. occupancy of "
1733 << AchievedOcc << " from rematerializing (original was "
1734 << DAG.MinOccupancy;
1735 if (TargetOcc)
1736 dbgs() << ", target was " << *TargetOcc;
1737 dbgs() << ")\n";
1738 });
1739
1740 DAG.setTargetOccupancy(getStageTargetOccupancy());
1741 return true;
1742}
1743
1745 DAG.finishBlock();
1746 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1747}
1748
1750 SavedMutations.swap(DAG.Mutations);
1751 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1752 if (DAG.MinOccupancy > InitialOccupancy) {
1753 assert(IsAnyRegionScheduled);
1755 << " stage successfully increased occupancy to "
1756 << DAG.MinOccupancy << '\n');
1757 } else if (!IsAnyRegionScheduled) {
1758 assert(DAG.MinOccupancy == InitialOccupancy);
1760 << ": No regions scheduled, min occupancy stays at "
1761 << DAG.MinOccupancy << ", MFI occupancy stays at "
1762 << MFI.getOccupancy() << ".\n");
1763 }
1764
1766}
1767
1769 // Skip empty scheduling region.
1770 if (DAG.begin() == DAG.end())
1771 return false;
1772
1773 // Check whether this new region is also a new block.
1774 if (DAG.RegionBegin->getParent() != CurrentMBB)
1775 setupNewBlock();
1776
1777 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1778 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1779
1780 // Skip regions with 1 schedulable instruction.
1781 if (DAG.begin() == std::prev(DAG.end()))
1782 return false;
1783
1784 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1785 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1786 << " " << CurrentMBB->getName()
1787 << "\n From: " << *DAG.begin() << " To: ";
1788 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1789 else dbgs() << "End";
1790 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1791
1792 // Save original instruction order before scheduling for possible revert.
1793 Unsched.clear();
1794 Unsched.reserve(DAG.NumRegionInstrs);
1797 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1798 for (auto &I : DAG) {
1799 Unsched.push_back(&I);
1800 if (SII->isIGLPMutationOnly(I.getOpcode()))
1801 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1802 }
1803 } else {
1804 for (auto &I : DAG)
1805 Unsched.push_back(&I);
1806 }
1807
1808 PressureBefore = DAG.Pressure[RegionIdx];
1809
1810 LLVM_DEBUG(
1811 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1812 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1813 << "Region live-in pressure: "
1814 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1815 << "Region register pressure: " << print(PressureBefore));
1816
1817 S.HasHighPressure = false;
1818 S.KnownExcessRP = isRegionWithExcessRP();
1819
1820 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1822 SavedMutations.clear();
1823 SavedMutations.swap(DAG.Mutations);
1824 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1826 DAG.addMutation(createIGroupLPDAGMutation(
1827 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1829 }
1830
1831 return true;
1832}
1833
1835 // Only reschedule regions that have excess register pressure (i.e. spilling)
1836 // or had minimum occupancy at the beginning of the stage (as long as
1837 // rescheduling of previous regions did not make occupancy drop back down to
1838 // the initial minimum).
1839 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1840 // If no region has been scheduled yet, the DAG has not yet been updated with
1841 // the occupancy target. So retrieve it from the temporary.
1842 unsigned CurrentTargetOccupancy =
1843 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1844 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1845 (CurrentTargetOccupancy <= InitialOccupancy ||
1846 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1847 InitialOccupancy))
1848 return false;
1849
1850 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1851 // If this is the first region scheduled during this stage, make the target
1852 // occupancy changes in the DAG and MFI.
1853 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1854 IsAnyRegionScheduled = true;
1855 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1856 DAG.setTargetOccupancy(TempTargetOccupancy);
1857 }
1858 return IsSchedulingThisRegion;
1859}
1860
1862 // We may need to reschedule this region if it wasn't rescheduled in the last
1863 // stage, or if we found it was testing critical register pressure limits in
1864 // the unclustered reschedule stage. The later is because we may not have been
1865 // able to raise the min occupancy in the previous stage so the region may be
1866 // overly constrained even if it was already rescheduled.
1867 if (!DAG.RegionsWithHighRP[RegionIdx])
1868 return false;
1869
1871}
1872
1874 return !RevertAllRegions && RescheduleRegions[RegionIdx] &&
1876}
1877
1879 if (CurrentMBB)
1880 DAG.finishBlock();
1881
1882 CurrentMBB = DAG.RegionBegin->getParent();
1883 DAG.startBlock(CurrentMBB);
1884 // Get real RP for the region if it hasn't be calculated before. After the
1885 // initial schedule stage real RP will be collected after scheduling.
1889 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1890}
1891
1893 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1894 if (S.HasHighPressure)
1895 DAG.RegionsWithHighRP[RegionIdx] = true;
1896
1897 // Revert scheduling if we have dropped occupancy or there is some other
1898 // reason that the original schedule is better.
1900
1901 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1903 SavedMutations.swap(DAG.Mutations);
1904}
1905
1908 // When the goal is to increase occupancy, all regions must reach the target
1909 // occupancy for rematerializations to be possibly useful, otherwise we will
1910 // just hurt latency for no benefit. If minimum occupancy drops below the
1911 // target there is no point in trying to re-schedule further regions.
1912 if (!TargetOcc)
1913 return;
1914 RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
1915 if (DAG.MinOccupancy < *TargetOcc) {
1916 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1917 << " cannot meet occupancy target, interrupting "
1918 "re-scheduling in all regions\n");
1919 RevertAllRegions = true;
1920 }
1921}
1922
1924 // Check the results of scheduling.
1925 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1926
1927 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1928 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1929
1930 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1931
1932 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1933 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1934 DAG.Pressure[RegionIdx] = PressureAfter;
1935
1936 // Early out if we have achieved the occupancy target.
1937 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1938 return;
1939 }
1940
1941 unsigned TargetOccupancy = std::min(
1942 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1943 unsigned WavesAfter = std::min(
1944 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1945 unsigned WavesBefore = std::min(
1946 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1947 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1948 << ", after " << WavesAfter << ".\n");
1949
1950 // We may not be able to keep the current target occupancy because of the just
1951 // scheduled region. We might still be able to revert scheduling if the
1952 // occupancy before was higher, or if the current schedule has register
1953 // pressure higher than the excess limits which could lead to more spilling.
1954 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1955
1956 // Allow memory bound functions to drop to 4 waves if not limited by an
1957 // attribute.
1958 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1959 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1960 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1961 << MFI.getMinAllowedOccupancy() << " waves\n");
1962 NewOccupancy = WavesAfter;
1963 }
1964
1965 if (NewOccupancy < DAG.MinOccupancy) {
1966 DAG.MinOccupancy = NewOccupancy;
1967 MFI.limitOccupancy(DAG.MinOccupancy);
1968 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1969 << DAG.MinOccupancy << ".\n");
1970 }
1971 // The maximum number of arch VGPR on non-unified register file, or the
1972 // maximum VGPR + AGPR in the unified register file case.
1973 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1974 // The maximum number of arch VGPR for both unified and non-unified register
1975 // file.
1976 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1977 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1978
1979 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1980 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1981 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1982 PressureAfter.getSGPRNum() > MaxSGPRs) {
1983 DAG.RegionsWithHighRP[RegionIdx] = true;
1984 DAG.RegionsWithExcessRP[RegionIdx] = true;
1985 }
1986
1987 // Revert if this region's schedule would cause a drop in occupancy or
1988 // spilling.
1989 if (shouldRevertScheduling(WavesAfter)) {
1991 std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
1992 } else {
1993 DAG.Pressure[RegionIdx] = PressureAfter;
1994 }
1995}
1996
1997unsigned
1998GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1999 DenseMap<unsigned, unsigned> &ReadyCycles,
2000 const TargetSchedModel &SM) {
2001 unsigned ReadyCycle = CurrCycle;
2002 for (auto &D : SU.Preds) {
2003 if (D.isAssignedRegDep()) {
2004 MachineInstr *DefMI = D.getSUnit()->getInstr();
2005 unsigned Latency = SM.computeInstrLatency(DefMI);
2006 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
2007 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
2008 }
2009 }
2010 ReadyCycles[SU.NodeNum] = ReadyCycle;
2011 return ReadyCycle;
2012}
2013
2014#ifndef NDEBUG
2016 bool operator()(std::pair<MachineInstr *, unsigned> A,
2017 std::pair<MachineInstr *, unsigned> B) const {
2018 return A.second < B.second;
2019 }
2020};
2021
2022static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
2023 EarlierIssuingCycle> &ReadyCycles) {
2024 if (ReadyCycles.empty())
2025 return;
2026 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
2027 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
2028 << " ##################\n# Cycle #\t\t\tInstruction "
2029 " "
2030 " \n";
2031 unsigned IPrev = 1;
2032 for (auto &I : ReadyCycles) {
2033 if (I.second > IPrev + 1)
2034 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
2035 << " CYCLES DETECTED ******************************\n\n";
2036 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
2037 IPrev = I.second;
2038 }
2039}
2040#endif
2041
2042ScheduleMetrics
2043GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
2044#ifndef NDEBUG
2045 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2046 ReadyCyclesSorted;
2047#endif
2048 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2049 unsigned SumBubbles = 0;
2050 DenseMap<unsigned, unsigned> ReadyCycles;
2051 unsigned CurrCycle = 0;
2052 for (auto &SU : InputSchedule) {
2053 unsigned ReadyCycle =
2054 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
2055 SumBubbles += ReadyCycle - CurrCycle;
2056#ifndef NDEBUG
2057 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
2058#endif
2059 CurrCycle = ++ReadyCycle;
2060 }
2061#ifndef NDEBUG
2062 LLVM_DEBUG(
2063 printScheduleModel(ReadyCyclesSorted);
2064 dbgs() << "\n\t"
2065 << "Metric: "
2066 << (SumBubbles
2067 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2068 : 1)
2069 << "\n\n");
2070#endif
2071
2072 return ScheduleMetrics(CurrCycle, SumBubbles);
2073}
2074
2077#ifndef NDEBUG
2078 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2079 ReadyCyclesSorted;
2080#endif
2081 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2082 unsigned SumBubbles = 0;
2083 DenseMap<unsigned, unsigned> ReadyCycles;
2084 unsigned CurrCycle = 0;
2085 for (auto &MI : DAG) {
2086 SUnit *SU = DAG.getSUnit(&MI);
2087 if (!SU)
2088 continue;
2089 unsigned ReadyCycle =
2090 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
2091 SumBubbles += ReadyCycle - CurrCycle;
2092#ifndef NDEBUG
2093 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
2094#endif
2095 CurrCycle = ++ReadyCycle;
2096 }
2097#ifndef NDEBUG
2098 LLVM_DEBUG(
2099 printScheduleModel(ReadyCyclesSorted);
2100 dbgs() << "\n\t"
2101 << "Metric: "
2102 << (SumBubbles
2103 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2104 : 1)
2105 << "\n\n");
2106#endif
2107
2108 return ScheduleMetrics(CurrCycle, SumBubbles);
2109}
2110
2111bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2112 if (WavesAfter < DAG.MinOccupancy)
2113 return true;
2114
2115 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2116 if (DAG.MFI.isDynamicVGPREnabled()) {
2117 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2118 ST, DAG.MFI.getDynamicVGPRBlockSize(),
2119 PressureBefore.getVGPRNum(false));
2120 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2121 ST, DAG.MFI.getDynamicVGPRBlockSize(), PressureAfter.getVGPRNum(false));
2122 if (BlocksAfter > BlocksBefore)
2123 return true;
2124 }
2125
2126 return false;
2127}
2128
2131 return false;
2132
2134 return true;
2135
2136 if (mayCauseSpilling(WavesAfter))
2137 return true;
2138
2139 return false;
2140}
2141
2143 // If RP is not reduced in the unclustered reschedule stage, revert to the
2144 // old schedule.
2145 if ((WavesAfter <=
2146 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
2147 mayCauseSpilling(WavesAfter)) ||
2149 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2150 return true;
2151 }
2152
2153 // Do not attempt to relax schedule even more if we are already spilling.
2155 return false;
2156
2157 LLVM_DEBUG(
2158 dbgs()
2159 << "\n\t *** In shouldRevertScheduling ***\n"
2160 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2161 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
2162 LLVM_DEBUG(
2163 dbgs()
2164 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2166 unsigned OldMetric = MBefore.getMetric();
2167 unsigned NewMetric = MAfter.getMetric();
2168 unsigned WavesBefore = std::min(
2169 S.getTargetOccupancy(),
2170 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
2171 unsigned Profit =
2172 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2174 NewMetric) /
2176 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2177 << MAfter << "Profit: " << Profit << "\n");
2178 return Profit < ScheduleMetrics::ScaleFactor;
2179}
2180
2183 return false;
2184
2186 return true;
2187
2188 if (mayCauseSpilling(WavesAfter))
2189 return true;
2190
2191 return false;
2192}
2193
2195 // When trying to increase occupancy (TargetOcc == true) the stage manages
2196 // region reverts globally (all or none), so we always return false here.
2197 return !TargetOcc && mayCauseSpilling(WavesAfter);
2198}
2199
2201 if (mayCauseSpilling(WavesAfter))
2202 return true;
2203
2204 return false;
2205}
2206
2208 unsigned WavesAfter) {
2209 return mayCauseSpilling(WavesAfter);
2210}
2211
2212bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2213 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2215 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2216 return true;
2217 }
2218
2219 return false;
2220}
2221
2223 ArrayRef<MachineInstr *> MIOrder) {
2224 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2225 DAG.Regions[RegionIdx].second)) ==
2226 MIOrder.size() &&
2227 "instruction number mismatch");
2228 if (MIOrder.empty())
2229 return;
2230
2231 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2232
2233 // Reconstruct MI sequence by moving instructions in desired order before
2234 // the current region's start.
2235 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2236 MachineBasicBlock *MBB = MIOrder.front()->getParent();
2237 for (MachineInstr *MI : MIOrder) {
2238 // Either move the next MI in order before the end of the region or move the
2239 // region end past the MI if it is at the correct position.
2240 MachineBasicBlock::iterator MII = MI->getIterator();
2241 if (MII != RegionEnd) {
2242 // Will subsequent splice move MI up past a non-debug instruction?
2243 bool NonDebugReordered =
2244 !MI->isDebugInstr() &&
2245 skipDebugInstructionsForward(RegionEnd, MII) != MII;
2246 MBB->splice(RegionEnd, MBB, MI);
2247 // Only update LiveIntervals information if non-debug instructions are
2248 // reordered. Otherwise debug instructions could cause code generation to
2249 // change.
2250 if (NonDebugReordered)
2251 DAG.LIS->handleMove(*MI, true);
2252 } else {
2253 // MI is already at the expected position. However, earlier splices in
2254 // this loop may have changed neighboring slot indices, so this MI's
2255 // slot index can become non-monotonic w.r.t. the physical MBB order.
2256 // Only re-seat when monotonicity is actually violated to avoid
2257 // unnecessary LiveInterval changes that could perturb scheduling.
2258 if (!MI->isDebugInstr()) {
2259 SlotIndex MIIdx = DAG.LIS->getInstructionIndex(*MI);
2260 SlotIndex PrevIdx = DAG.LIS->getSlotIndexes()->getIndexBefore(*MI);
2261 if (PrevIdx >= MIIdx)
2262 DAG.LIS->handleMove(*MI, true);
2263 }
2264 ++RegionEnd;
2265 }
2266 if (MI->isDebugInstr()) {
2267 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2268 continue;
2269 }
2270
2271 // Reset read-undef flags and update them later.
2272 for (MachineOperand &Op : MI->all_defs())
2273 Op.setIsUndef(false);
2274 RegisterOperands RegOpers;
2275 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
2276 if (DAG.ShouldTrackLaneMasks) {
2277 // Adjust liveness and add missing dead+read-undef flags.
2278 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
2279 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
2280 } else {
2281 // Adjust for missing dead-def flags.
2282 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
2283 }
2284 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2285 }
2286
2287 // The region end doesn't change throughout scheduling since it itself is
2288 // outside the region (whether that is a MBB end or a terminator MI).
2289 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2290 DAG.Regions[RegionIdx].first = MIOrder.front();
2291}
2292
2293/// Returns true if reaching def \p RD will be in AGPR form after the rewrite
2294/// and so needs no bridge copy: a candidate MFMA in \p RewriteSet, an
2295/// AV_MOV_*_IMM_PSEUDO, or a copy from a candidate src2 reg in \p CandSrc2Regs.
2296/// A non-candidate MFMA stays in VGPR form and still needs a bridge.
2298 MachineInstr *RD, const SmallPtrSetImpl<MachineInstr *> &RewriteSet,
2299 const DenseSet<Register> &CandSrc2Regs, const SIInstrInfo &TII) {
2300 if (TII.isMAI(*RD))
2301 return RewriteSet.contains(RD);
2302 if (RD->getOpcode() == AMDGPU::AV_MOV_B32_IMM_PSEUDO ||
2303 RD->getOpcode() == AMDGPU::AV_MOV_B64_IMM_PSEUDO)
2304 return true;
2305 if (RD->isCopy() && CandSrc2Regs.contains(RD->getOperand(1).getReg()))
2306 return true;
2307 return false;
2308}
2309
2310bool RewriteMFMAFormStage::hasUseRequiringVGPR(
2311 ArrayRef<SlotIndex> Src2ReachingDefs,
2312 const SmallPtrSetImpl<MachineInstr *> &RewriteSet) {
2313 for (SlotIndex RDIdx : Src2ReachingDefs) {
2314 const MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
2316 findReachingUses(RD, DAG.LIS, ReachingUses);
2317 for (const MachineOperand *UseMO : ReachingUses) {
2318 const MachineInstr *UseMI = UseMO->getParent();
2319 if (UseMI->isCopy())
2320 continue;
2321 if (TII->isMAI(*UseMI) && RewriteSet.contains(UseMI))
2322 continue;
2323 return true;
2324 }
2325 }
2326 return false;
2327}
2328
2329void RewriteMFMAFormStage::resetRewriteCandsToVGPR(
2330 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands) {
2331 for (auto [MI, OriginalOpcode] : RewriteCands) {
2332 assert(TII->isMAI(*MI));
2333 const TargetRegisterClass *ADefRC =
2334 DAG.MRI.getRegClass(MI->getOperand(0).getReg());
2335 const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
2336 DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
2337 MI->setDesc(TII->get(OriginalOpcode));
2338
2339 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2340 if (!Src2->isReg())
2341 continue;
2342
2343 // Have to get src types separately since subregs may cause C and D
2344 // registers to be different types even though the actual operand is
2345 // the same size.
2346 const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
2347 const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
2348 DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
2349 }
2350}
2351
2352bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2353 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
2354 return false;
2355 if (AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) == -1)
2356 return false;
2357 // Reject candidates whose users force an unavoidable bridge copy.
2358 Register DstReg = MI->getOperand(0).getReg();
2359 for (const MachineOperand &Use : DAG.MRI.use_nodbg_operands(DstReg)) {
2360 if (!TII->isMAI(*Use.getParent()) && !Use.getParent()->isCopy())
2361 return false;
2362 }
2363 return true;
2364}
2365
2366bool RewriteMFMAFormStage::initHeuristics(
2367 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2368 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2369 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2370 bool Changed = false;
2371
2372 // Collect the candidate group, its members share AGPR-form operands
2373 // post-rewrite, so reaching defs feeding any member don't need bridge copy.
2374 SmallPtrSet<MachineInstr *, 16> RewriteSet;
2375 DenseSet<Register> CandSrc2Regs;
2376 for (MachineBasicBlock &MBB : MF) {
2377 for (MachineInstr &MI : MBB) {
2378 if (!isRewriteCandidate(&MI))
2379 continue;
2380 RewriteSet.insert(&MI);
2381 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
2382 if (Src2 && Src2->isReg())
2383 CandSrc2Regs.insert(Src2->getReg());
2384 }
2385 }
2386
2387 // Prepare for the heuristics
2388 for (MachineBasicBlock &MBB : MF) {
2389 for (MachineInstr &MI : MBB) {
2390 if (!isRewriteCandidate(&MI))
2391 continue;
2392
2393 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
2394 assert(ReplacementOp != -1);
2395
2396 RewriteCands.push_back({&MI, MI.getOpcode()});
2397 MI.setDesc(TII->get(ReplacementOp));
2398
2399 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
2400 if (Src2->isReg()) {
2401 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2402 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2403
2404 // If src2 has a use that must remain VGPR, it cannot be reclassified to
2405 // AGPR.
2406 bool Src2NeedsVGPR = hasUseRequiringVGPR(Src2ReachingDefs, RewriteSet);
2407 Src2NeedsVGPRCache[&MI] = Src2NeedsVGPR;
2408
2409 for (SlotIndex RDIdx : Src2ReachingDefs) {
2410 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
2411 if (!Src2NeedsVGPR &&
2412 isReachingDefAGPRForm(RD, RewriteSet, CandSrc2Regs, *TII))
2413 continue;
2414 CopyForDef.insert(RD);
2415 }
2416 }
2417
2418 MachineOperand &Dst = MI.getOperand(0);
2419 SmallVector<MachineOperand *, 8> DstReachingUses;
2420
2421 findReachingUses(&MI, DAG.LIS, DstReachingUses);
2422
2423 for (MachineOperand *RUOp : DstReachingUses) {
2424 MachineInstr *UserMI = RUOp->getParent();
2425 // Group members read the AGPR result directly.
2426 if (TII->isMAI(*UserMI) && RewriteSet.contains(UserMI))
2427 continue;
2428
2429 // For any user of the result of the MFMA which is not an MFMA, we
2430 // insert a copy. For a given register, we will only insert one copy
2431 // per user block.
2432 CopyForUse[UserMI->getParent()].insert(RUOp->getReg());
2433
2434 if (TII->isMAI(*UserMI))
2435 continue;
2436
2437 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2438 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2439
2440 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2441 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2442 if (TII->isMAI(*RD))
2443 continue;
2444
2445 // For any definition of the user of the MFMA which is not an MFMA,
2446 // we insert a copy. We do this to transform all the reaching defs
2447 // of this use to AGPR. By doing this, we can insert a copy from
2448 // AGPR to VGPR at the user rather than after the MFMA.
2449 CopyForDef.insert(RD);
2450 }
2451 }
2452
2453 // Do the rewrite to allow for updated RP calculation.
2454 const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
2455 const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
2456 DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
2457 if (Src2->isReg()) {
2458 // Have to get src types separately since subregs may cause C and D
2459 // registers to be different types even though the actual operand is
2460 // the same size.
2461 const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
2462 const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
2463 DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
2464 }
2465 Changed = true;
2466 }
2467 }
2468
2469 return Changed;
2470}
2471
2472int64_t RewriteMFMAFormStage::getRewriteCost(
2473 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands,
2474 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2475 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2476 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2477
2478 int64_t BestSpillCost = 0;
2479 int64_t Cost = 0;
2480 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2481
2482 std::pair<unsigned, unsigned> MaxVectorRegs =
2483 ST.getMaxNumVectorRegs(MF.getFunction());
2484 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2485 unsigned AGPRThreshold = MaxVectorRegs.second;
2486 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2487
2488 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2489 if (!RegionsWithExcessArchVGPR[Region])
2490 continue;
2491
2492 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2493 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2494 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2495
2496 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2497 // addressable limit), rewriting alone should bring pressure to manageable
2498 // level. If we find any such region, then the rewrite is potentially
2499 // beneficial.
2500 GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
2501 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2502 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2503
2504 uint64_t BlockFreq =
2505 MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
2506 .getFrequency();
2507
2508 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2509 uint64_t RelativeFreq = EntryFreq && BlockFreq
2510 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2511 : BlockFreq / EntryFreq)
2512 : 1;
2513
2514 // This assumes perfect spilling / splitting -- using one spill / copy
2515 // instruction and one restoreFrom / copy for each excess register,
2516 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2517
2518 // Also account for the block frequency.
2519 if (RelativeFreqIsDenom)
2520 SpillCost /= (int64_t)RelativeFreq;
2521 else
2522 SpillCost *= (int64_t)RelativeFreq;
2523
2524 // If we have increased spilling in any block, just bail.
2525 if (SpillCost > 0) {
2526 resetRewriteCandsToVGPR(RewriteCands);
2527 return SpillCost;
2528 }
2529
2530 if (SpillCost < BestSpillCost)
2531 BestSpillCost = SpillCost;
2532 }
2533
2534 // Set the cost to the largest decrease in spill cost in order to not double
2535 // count spill reductions.
2536 Cost = BestSpillCost;
2537 assert(Cost <= 0);
2538
2539 unsigned CopyCost = 0;
2540
2541 // For each CopyForDef, increase the cost by the register size while
2542 // accounting for block frequency.
2543 for (MachineInstr *DefMI : CopyForDef) {
2544 Register DefReg = DefMI->getOperand(0).getReg();
2545 uint64_t DefFreq =
2546 EntryFreq
2547 ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
2548 : 1;
2549
2550 const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
2551 CopyCost += RC->getCopyCost() * DefFreq;
2552 }
2553
2554 // Account for CopyForUse copies in each block that the register is used.
2555 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2556 uint64_t UseFreq =
2557 EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
2558
2559 for (Register UseReg : UseRegs) {
2560 const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
2561 CopyCost += RC->getCopyCost() * UseFreq;
2562 }
2563 }
2564
2565 // Reset the classes that were changed to AGPR for better register bank
2566 // analysis. We must do rewriting after copy-insertion, as some defs of the
2567 // register may require VGPR. Additionally, if we bail out and don't perform
2568 // the rewrite then these need to be restored anyway.
2569 resetRewriteCandsToVGPR(RewriteCands);
2570
2571 return Cost + CopyCost;
2572}
2573
2574bool RewriteMFMAFormStage::rewrite(
2575 ArrayRef<std::pair<MachineInstr *, unsigned>> RewriteCands) {
2576 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2577 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2578
2579 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2580 RegionBoundaries Entry = DAG.Regions[Region];
2581 if (Entry.first == Entry.second)
2582 continue;
2583
2584 FirstMIToRegion[&*Entry.first] = Region;
2585 if (Entry.second != Entry.first->getParent()->end())
2586 LastMIToRegion[&*Entry.second] = Region;
2587 }
2588
2589 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2590 // The general assumption of the algorithm (and the previous cost calculation)
2591 // is that it is better to insert the copies in the MBB of the def of the src2
2592 // operands, and in the MBB of the user of the dest operands. This is based on
2593 // the assumption that the MFMAs are likely to appear in loop bodies, while
2594 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2595 // design, the algorithm for finding copy insertion points is more
2596 // complicated.
2597 //
2598 // There are three main cases to handle: 1. the reaching defs of the src2
2599 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2600 // defs of the reaching uses of the dst operand.
2601 //
2602 // In the first case, we simply insert copies after each of the reaching
2603 // definitions. In the second case, we collect all the uses of a given dest
2604 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2605 // earliest use. Since the use may have multiple reaching defs, and since we
2606 // want to replace the register it is using with the result of the copy, we
2607 // must handle case 3. In the third case, we simply insert a copy after each
2608 // of the reaching defs to connect to the copy of the reaching uses of the dst
2609 // reg. This allows us to avoid inserting copies next to the MFMAs.
2610 //
2611 // While inserting the copies, we maintain a map of operands which will use
2612 // different regs (i.e. the result of the copies). For example, a case 1 src2
2613 // operand will use the register result of the copies after the reaching defs,
2614 // as opposed to the original register. Now that we have completed our copy
2615 // analysis and placement, we can bulk update the registers. We do this
2616 // separately as to avoid complicating the reachingDef and reachingUse
2617 // queries.
2618 //
2619 // While inserting the copies, we also maintain a list or registers which we
2620 // will want to reclassify as AGPR. After doing the copy insertion and the
2621 // register replacement, we can finally do the reclassification. This uses the
2622 // redef map, as the registers we are interested in reclassifying may be
2623 // replaced by the result of a copy. We must do this after the copy analysis
2624 // and placement as we must have an accurate redef map -- otherwise we may end
2625 // up creating illegal instructions.
2626
2627 // The original registers of the MFMA that need to be reclassified as AGPR.
2628 DenseSet<Register> RewriteRegs;
2629 // The map of an original register in the MFMA to a new register (result of a
2630 // copy) that it should be replaced with.
2631 DenseMap<Register, Register> RedefMap;
2632 // The map of the original MFMA registers to the relevant MFMA operands.
2633 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2634 // The map of reaching defs for a given register -- to avoid duplicate copies.
2635 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2636 // The map of reaching uses for a given register by basic block -- to avoid
2637 // duplicate copies and to calculate per MBB insert pts.
2638 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2639 ReachingUseTracker;
2640
2641 // Collect the candidate group; its members share AGPR-form operands
2642 // post-rewrite, so reaching defs feeding any member need no bridge copy.
2643 SmallPtrSet<MachineInstr *, 16> RewriteCandsSet;
2644 DenseSet<Register> RewriteSrc2Regs;
2645 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2646 RewriteCandsSet.insert(MI);
2647 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2648 if (Src2 && Src2->isReg())
2649 RewriteSrc2Regs.insert(Src2->getReg());
2650 }
2651
2652 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2653 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2654 if (ReplacementOp == -1)
2655 continue;
2656 MI->setDesc(TII->get(ReplacementOp));
2657
2658 // Case 1: insert copies for the reaching defs of the Src2Reg.
2659 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2660 if (Src2->isReg()) {
2661 Register Src2Reg = Src2->getReg();
2662 if (!Src2Reg.isVirtual())
2663 return false;
2664
2665 Register MappedReg = Src2->getReg();
2666 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2667 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2668 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2669
2670 // If src2 has a use that must remain VGPR, it cannot be reclassified to
2671 // AGPR.
2672 bool Src2NeedsVGPR = Src2NeedsVGPRCache.lookup(MI);
2673
2674 for (SlotIndex RDIndex : Src2ReachingDefs) {
2675 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2676 if (!Src2NeedsVGPR &&
2677 isReachingDefAGPRForm(RD, RewriteCandsSet, RewriteSrc2Regs, *TII))
2678 continue;
2679
2680 Src2DefsReplace.insert(RD);
2681 }
2682
2683 if (!Src2DefsReplace.empty()) {
2684 auto RI = RedefMap.find(Src2Reg);
2685 if (RI != RedefMap.end()) {
2686 MappedReg = RI->second;
2687 } else {
2688 assert(!ReachingDefCopyMap.contains(Src2Reg));
2689 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2690 const TargetRegisterClass *VGPRRC =
2691 SRI->getEquivalentVGPRClass(Src2RC);
2692
2693 // Track the mapping of the original register to the new register.
2694 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2695 RedefMap[Src2Reg] = MappedReg;
2696 }
2697
2698 // If none exists, create a copy from this reaching def.
2699 // We may have inserted a copy already in an earlier iteration.
2700 for (MachineInstr *RD : Src2DefsReplace) {
2701 // Do not create redundant copies.
2702 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2703 MachineInstrBuilder VGPRCopy =
2704 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2705 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2706 .addDef(MappedReg, {}, 0)
2707 .addUse(Src2Reg, {}, 0);
2708 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2709
2710 // If this reaching def was the last MI in the region, update the
2711 // region boundaries.
2712 if (LastMIToRegion.contains(RD)) {
2713 unsigned UpdateRegion = LastMIToRegion[RD];
2714 DAG.Regions[UpdateRegion].second = VGPRCopy;
2715 LastMIToRegion.erase(RD);
2716 }
2717 }
2718 }
2719 }
2720
2721 // Track the register for reclassification
2722 RewriteRegs.insert(Src2Reg);
2723
2724 // Always insert the operand for replacement. If this corresponds with a
2725 // chain of tied-def we may not see the VGPR requirement until later.
2726 ReplaceMap[Src2Reg].insert(Src2);
2727 }
2728
2729 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2730 // and after the reaching defs of the reaching uses of the dsts.
2731
2732 MachineOperand *Dst = &MI->getOperand(0);
2733 Register DstReg = Dst->getReg();
2734 if (!DstReg.isVirtual())
2735 return false;
2736
2737 Register MappedReg = DstReg;
2738 SmallVector<MachineOperand *, 8> DstReachingUses;
2739
2740 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2741 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2742
2743 findReachingUses(MI, DAG.LIS, DstReachingUses);
2744
2745 for (MachineOperand *RUOp : DstReachingUses) {
2746 MachineInstr *UserMI = RUOp->getParent();
2747 // Group members read the AGPR result directly.
2748 if (TII->isMAI(*UserMI) && RewriteCandsSet.contains(UserMI))
2749 continue;
2750
2751 // If there is a non mai reaching use, then we need a copy.
2752 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2753 DstReachingUseCopies.push_back(RUOp);
2754
2755 // Non-rewritten MAI: its defs aren't being reclassified.
2756 if (TII->isMAI(*UserMI))
2757 continue;
2758
2759 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2760 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2761
2762 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2763 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2764 if (TII->isMAI(*RD))
2765 continue;
2766
2767 // If there is a non mai reaching def of this reaching use, then we will
2768 // need a copy.
2769 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2770 DstUseDefsReplace.push_back(RD);
2771 }
2772 }
2773
2774 if (!DstUseDefsReplace.empty()) {
2775 auto RI = RedefMap.find(DstReg);
2776 if (RI != RedefMap.end()) {
2777 MappedReg = RI->second;
2778 } else {
2779 assert(!ReachingDefCopyMap.contains(DstReg));
2780 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2781 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2782
2783 // Track the mapping of the original register to the new register.
2784 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2785 RedefMap[DstReg] = MappedReg;
2786 }
2787
2788 // If none exists, create a copy from this reaching def.
2789 // We may have inserted a copy already in an earlier iteration.
2790 for (MachineInstr *RD : DstUseDefsReplace) {
2791 // Do not create reundant copies.
2792 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2793 MachineInstrBuilder VGPRCopy =
2794 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2795 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2796 .addDef(MappedReg, {}, 0)
2797 .addUse(DstReg, {}, 0);
2798 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2799
2800 // If this reaching def was the last MI in the region, update the
2801 // region boundaries.
2802 auto LMI = LastMIToRegion.find(RD);
2803 if (LMI != LastMIToRegion.end()) {
2804 unsigned UpdateRegion = LMI->second;
2805 DAG.Regions[UpdateRegion].second = VGPRCopy;
2806 LastMIToRegion.erase(RD);
2807 }
2808 }
2809 }
2810 }
2811
2812 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2813 for (MachineOperand *RU : DstReachingUseCopies) {
2814 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2815 // Just keep track of the reaching use of this register by block. After we
2816 // have scanned all the MFMAs we can find optimal insert pts.
2817 if (RUBlock != MI->getParent()) {
2818 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2819 continue;
2820 }
2821
2822 // Special case, the use is in the same block as the MFMA. Insert the copy
2823 // just before the use.
2824 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2825 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2826 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2827 MachineInstr *UseInst = RU->getParent();
2828 MachineInstrBuilder VGPRCopy =
2829 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2830 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2831 .addDef(NewUseReg, {}, 0)
2832 .addUse(DstReg, {}, 0);
2833 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2834 // Since we know this use has only one reaching def, we can replace the
2835 // use reg.
2836 RU->setReg(NewUseReg);
2837 // Track the copy source operand for r eplacement.
2838 DstRegSet.insert(&VGPRCopy->getOperand(1));
2839 }
2840
2841 // Track the register for reclassification
2842 RewriteRegs.insert(DstReg);
2843
2844 // Insert the dst operand for replacement. If this dst is in a chain of
2845 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2846 // all the correspond operands need to be replaced.
2847 DstRegSet.insert(Dst);
2848 }
2849
2850 // Handle the copies for dst uses.
2851 using RUBType =
2852 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2853 for (RUBType RUBlockEntry : ReachingUseTracker) {
2854 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2855 for (RUDType RUDst : RUBlockEntry.second) {
2856 MachineOperand *OpBegin = *RUDst.second.begin();
2857 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2858
2859 // Find the earliest use in this block.
2860 for (MachineOperand *User : RUDst.second) {
2861 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2862 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2863 InstPt = NewInstPt;
2864 }
2865
2866 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2867 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2868 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2869 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2870
2871 MachineInstrBuilder VGPRCopy =
2872 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2873 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2874 .addDef(NewUseReg, {}, 0)
2875 .addUse(RUDst.first, {}, 0);
2876 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2877
2878 // If this UseInst was the first MI in the region, update the region
2879 // boundaries.
2880 auto FI = FirstMIToRegion.find(UseInst);
2881 if (FI != FirstMIToRegion.end()) {
2882 unsigned UpdateRegion = FI->second;
2883 DAG.Regions[UpdateRegion].first = VGPRCopy;
2884 FirstMIToRegion.erase(UseInst);
2885 }
2886
2887 // Replace the operand for all users.
2888 for (MachineOperand *User : RUDst.second) {
2889 User->setReg(NewUseReg);
2890 }
2891
2892 // Track the copy source operand for replacement.
2893 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2894 }
2895 }
2896
2897 // We may have needed to insert copies after the reaching defs of the MFMAs.
2898 // Replace the original register with the result of the copy for all relevant
2899 // operands.
2900 for (std::pair<Register, Register> NewDef : RedefMap) {
2901 Register OldReg = NewDef.first;
2902 Register NewReg = NewDef.second;
2903
2904 // Replace the register for any associated operand in the MFMA chain.
2905 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2906 ReplaceOp->setReg(NewReg);
2907 }
2908
2909 // Finally, do the reclassification of the MFMA registers.
2910 for (Register RewriteReg : RewriteRegs) {
2911 Register RegToRewrite = RewriteReg;
2912
2913 // Be sure to update the replacement register and not the original.
2914 auto RI = RedefMap.find(RewriteReg);
2915 if (RI != RedefMap.end())
2916 RegToRewrite = RI->second;
2917
2918 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2919 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2920
2921 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2922 }
2923
2924 // Bulk update the LIS.
2925 DAG.LIS->reanalyze(DAG.MF);
2926 // Liveins may have been modified for cross RC copies
2927 RegionPressureMap LiveInUpdater(&DAG, false);
2928 LiveInUpdater.buildLiveRegMap();
2929
2930 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2931 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2932
2933 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2934
2935 return true;
2936}
2937
2938unsigned PreRARematStage::getStageTargetOccupancy() const {
2939 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2940}
2941
2942bool PreRARematStage::setObjective() {
2943 const Function &F = MF.getFunction();
2944
2945 // Set up "spilling targets" for all regions.
2946 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2947 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2948 bool HasVectorRegisterExcess = false;
2949 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2950 const GCNRegPressure &RP = DAG.Pressure[I];
2951 GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
2952 if (!Target.satisfied())
2953 TargetRegions.set(I);
2954 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2955 }
2956
2957 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2958 // In addition to register usage being above addressable limits, occupancy
2959 // below the minimum is considered like "spilling" as well.
2960 TargetOcc = std::nullopt;
2961 } else {
2962 // There is no spilling and room to improve occupancy; set up "increased
2963 // occupancy targets" for all regions.
2964 TargetOcc = DAG.MinOccupancy + 1;
2965 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2966 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2967 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2968 for (auto [I, Target] : enumerate(RPTargets)) {
2969 Target.setTarget(MaxSGPRs, MaxVGPRs);
2970 if (!Target.satisfied())
2971 TargetRegions.set(I);
2972 }
2973 }
2974
2975 return TargetRegions.any();
2976}
2977
2978bool PreRARematStage::ScoredRemat::maybeBeneficial(
2979 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2980 for (unsigned I : TargetRegions.set_bits()) {
2981 if (Live[I] && RPTargets[I].isSaveBeneficial(RPSave))
2982 return true;
2983 }
2984 return false;
2985}
2986
2989 assert(DAG.MLI && "MLI not defined in DAG");
2991 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2992
2993 const unsigned NumRegions = DAG.Regions.size();
2995 MaxFreq = 0;
2996 Regions.reserve(NumRegions);
2997 for (unsigned I = 0; I < NumRegions; ++I) {
2998 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2999 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
3000 Regions.push_back(BlockFreq);
3001 if (BlockFreq && BlockFreq < MinFreq)
3002 MinFreq = BlockFreq;
3003 else if (BlockFreq > MaxFreq)
3004 MaxFreq = BlockFreq;
3005 }
3006 if (!MinFreq)
3007 return;
3008
3009 // Scale everything down if frequencies are high.
3010 if (MinFreq >= ScaleFactor * ScaleFactor) {
3011 for (uint64_t &Freq : Regions)
3012 Freq /= ScaleFactor;
3013 MinFreq /= ScaleFactor;
3014 MaxFreq /= ScaleFactor;
3015 }
3016}
3017
3018void PreRARematStage::ScoredRemat::init(RegisterIdx RegIdx,
3019 const FreqInfo &Freq,
3020 const Rematerializer &Remater,
3022 this->RegIdx = RegIdx;
3023 const unsigned NumRegions = DAG.Regions.size();
3024 LiveIn.resize(NumRegions);
3025 LiveOut.resize(NumRegions);
3026 Live.resize(NumRegions);
3027 UnpredictableRPSave.resize(NumRegions);
3028
3029 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3030 Register DefReg = Reg.getDefReg();
3031 assert(Reg.Uses.size() == 1 && "expected users in single region");
3032 const unsigned UseRegion = Reg.Uses.begin()->first;
3033
3034 // Mark regions in which the rematerializable register is live.
3035 for (unsigned I = 0, E = NumRegions; I != E; ++I) {
3036 if (DAG.LiveIns[I].contains(DefReg))
3037 LiveIn.set(I);
3038 if (DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).contains(DefReg))
3039 LiveOut.set(I);
3040
3041 // If the register is both unused and live-through in the region, the
3042 // latter's RP is guaranteed to decrease.
3043 if (!LiveIn[I] || !LiveOut[I] || I == UseRegion)
3044 UnpredictableRPSave.set(I);
3045 }
3046 Live |= LiveIn;
3047 Live |= LiveOut;
3048 RPSave.inc(DefReg, LaneBitmask::getNone(), Reg.Mask, DAG.MRI);
3049
3050 // Get frequencies of defining and using regions. A rematerialization from the
3051 // least frequent region to the most frequent region will yield the greatest
3052 // in order to penalize rematerializations from or into regions whose
3053 int64_t DefOrMin = std::max(Freq.Regions[Reg.DefRegion], Freq.MinFreq);
3054 int64_t UseOrMax = Freq.Regions[UseRegion];
3055 if (!UseOrMax)
3056 UseOrMax = Freq.MaxFreq;
3057 FreqDiff = DefOrMin - UseOrMax;
3058}
3059
3060void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
3061 ArrayRef<GCNRPTarget> RPTargets,
3062 const FreqInfo &FreqInfo,
3063 bool ReduceSpill) {
3064 MaxFreq = 0;
3065 RegionImpact = 0;
3066 for (unsigned I : TargetRegions.set_bits()) {
3067 if (!Live[I])
3068 continue;
3069
3070 // The rematerialization must contribute positively in at least one
3071 // register class with usage above the RP target for this region to
3072 // contribute to the score.
3073 const GCNRPTarget &RegionTarget = RPTargets[I];
3074 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
3075 if (!NumRegsBenefit)
3076 continue;
3077
3078 // Regions in which RP is guaranteed to decrease have more weight.
3079 RegionImpact += (UnpredictableRPSave[I] ? 1 : 2) * NumRegsBenefit;
3080
3081 if (ReduceSpill) {
3082 uint64_t Freq = FreqInfo.Regions[I];
3083 if (UnpredictableRPSave[I]) {
3084 // Apply a frequency penalty in regions in which we are not sure that RP
3085 // will decrease.
3086 Freq /= 2;
3087 }
3088 MaxFreq = std::max(MaxFreq, Freq);
3089 }
3090 }
3091}
3092
3093void PreRARematStage::ScoredRemat::rematerialize(
3094 Rematerializer &Remater) const {
3095 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3096 Rematerializer::DependencyReuseInfo DRI;
3097 for (const Rematerializer::Reg::Dependency &Dep : Reg.Dependencies)
3098 DRI.reuse(Dep.RegIdx);
3099 unsigned UseRegion = Reg.Uses.begin()->first;
3100 Remater.rematerializeToRegion(RegIdx, UseRegion, DRI);
3101}
3102
3103void PreRARematStage::updateRPTargets(const BitVector &Regions,
3104 const GCNRegPressure &RPSave) {
3105 for (unsigned I : Regions.set_bits()) {
3106 RPTargets[I].saveRP(RPSave);
3107 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3108 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3109 TargetRegions.reset(I);
3110 }
3111 }
3112}
3113
3114bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3115 bool TooOptimistic = false;
3116 for (unsigned I : Regions.set_bits()) {
3117 GCNRPTarget &Target = RPTargets[I];
3118 Target.setRP(DAG.getRealRegPressure(I));
3119
3120 // Since we were optimistic in assessing RP decreases in these regions, we
3121 // may need to remark the target as a target region if RP didn't decrease
3122 // as expected.
3123 if (!TargetRegions[I] && !Target.satisfied()) {
3124 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3125 TooOptimistic = true;
3126 TargetRegions.set(I);
3127 }
3128 }
3129 return TooOptimistic;
3130}
3131
3132void PreRARematStage::removeFromLiveMaps(Register Reg, const BitVector &LiveIn,
3133 const BitVector &LiveOut) {
3134 assert(LiveIn.size() == DAG.Regions.size() &&
3135 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3136 for (unsigned I : LiveIn.set_bits())
3137 DAG.LiveIns[I].erase(Reg);
3138 for (unsigned I : LiveOut.set_bits())
3139 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
3140}
3141
3142void PreRARematStage::addToLiveMaps(Register Reg, LaneBitmask Mask,
3143 const BitVector &LiveIn,
3144 const BitVector &LiveOut) {
3145 assert(LiveIn.size() == DAG.Regions.size() &&
3146 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3147 std::pair<Register, LaneBitmask> LiveReg(Reg, Mask);
3148 for (unsigned I : LiveIn.set_bits())
3149 DAG.LiveIns[I].insert(LiveReg);
3150 for (unsigned I : LiveOut.set_bits())
3151 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
3152}
3153
3155 // We consider that reducing spilling is always beneficial so we never
3156 // rollback rematerializations or revert scheduling in such cases.
3157 if (!TargetOcc)
3158 return;
3159
3160 // When increasing occupancy, it is possible that re-scheduling is not able to
3161 // achieve the target occupancy in all regions, in which case re-scheduling in
3162 // all regions should be reverted.
3163 if (DAG.MinOccupancy >= *TargetOcc)
3164 return;
3165
3166 // Revert re-scheduling in all affected regions.
3167 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3168 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3169 << '\n');
3170 DAG.Pressure[RegionIdx] = MaxPressure;
3171 modifyRegionSchedule(RegionIdx, OrigMIOrder);
3172 }
3173
3174 // It is possible that re-scheduling lowers occupancy over the one achieved
3175 // just through rematerializations, in which case we revert re-scheduling in
3176 // all regions but do not roll back rematerializations.
3177 if (AchievedOcc >= *TargetOcc) {
3178 DAG.setTargetOccupancy(AchievedOcc);
3179 return;
3180 }
3181
3182 // Reset the target occupancy to what it was pre-rematerialization.
3183 DAG.setTargetOccupancy(*TargetOcc - 1);
3184
3185 // Roll back changes made by the stage, then recompute pressure in all
3186 // affected regions.
3187 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3188 assert(Rollback && "rollbacker should be defined");
3189 Rollback->Listener.rollback(Remater);
3190 for (const auto &[RegIdx, LiveIn, LiveOut] : Rollback->LiveMapUpdates) {
3191 const Rematerializer::Reg &Reg = Remater.getReg(RegIdx);
3192 addToLiveMaps(Reg.getDefReg(), Reg.Mask, LiveIn, LiveOut);
3193 }
3194
3195#ifdef EXPENSIVE_CHECKS
3196 // In particular, we want to check for coherent MI/slot order in regions in
3197 // which reverts and/or rollbacks may have happened.
3198 MF.verify();
3199#endif
3200 for (unsigned I : RescheduleRegions.set_bits())
3201 DAG.Pressure[I] = DAG.getRealRegPressure(I);
3202
3204}
3205
3206void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3207 MinOccupancy = TargetOccupancy;
3208 if (MFI.getOccupancy() < TargetOccupancy)
3209 MFI.increaseOccupancy(MF, MinOccupancy);
3210 else
3211 MFI.limitOccupancy(MinOccupancy);
3212}
3213
3215 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3216 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
3217 return SII->isIGLPMutationOnly(MI->getOpcode());
3218 });
3219}
3220
3225
3227 HasIGLPInstrs = hasIGLPInstrs(this);
3228 if (HasIGLPInstrs) {
3229 SavedMutations.clear();
3230 SavedMutations.swap(Mutations);
3232 }
3233
3235}
3236
3238 if (HasIGLPInstrs)
3239 SavedMutations.swap(Mutations);
3240
3242}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
MachineBasicBlock & MBB
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static cl::opt< unsigned, false, VGPRThresholdParser > VGPRThresholdPercentOpt("amdgpu-vgpr-threshold-percent", cl::Hidden, cl::desc("Percent of VGPR limits that we should use as RP threshold " "during scheduling. We have two limits relevant to scheduling: " "Critical (avoid decreasing occupancy), Excess (avoid spilling). " "This flag scales both limits back by an equal percent: (0 = use " " default calculation, 1-100 = use percentage), default: 0"), cl::init(0))
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static bool isReachingDefAGPRForm(MachineInstr *RD, const SmallPtrSetImpl< MachineInstr * > &RewriteSet, const DenseSet< Register > &CandSrc2Regs, const SIInstrInfo &TII)
Returns true if reaching def RD will be in AGPR form after the rewrite and so needs no bridge copy: a...
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true))
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
#define _
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
Definition LineTable.cpp:54
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static constexpr unsigned SM(unsigned Version)
if(PassOpts->AAPipeline)
MIR-level target-independent rematerialization helpers.
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
BitVector & reset()
Reset all bits in the bitvector.
Definition BitVector.h:409
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
size_type size() const
Returns the number of bits in this bitvector.
Definition BitVector.h:178
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
bool erase(const KeyT &Val)
Definition DenseMap.h:379
iterator end()
Definition DenseMap.h:143
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:216
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
bool reset(const MachineInstr &MI, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegs=nullptr)
Reset tracker to the point before the MI filling LiveRegs upon this point using LIS.
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
Models a register pressure target, allowing to evaluate and track register savings against that targe...
unsigned getNumRegsBenefit(const GCNRegPressure &SaveRP) const
Returns the benefit towards achieving the RP target that saving SaveRP represents,...
GCNRegPressure getPressure() const
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
void modifyRegionSchedule(unsigned RegionIdx, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx to MIOrder.
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::optional< bool > GCNTrackersOverride
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void reset(const MachineInstr &MI)
Resets tracker to the point just after MI (in program order), which can be a debug instruction.
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
A live range for subregisters.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
void finalizeGCNRegion() override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition Printable.h:38
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
MIR-level target-independent rematerializer.
bool isIGLPMutationOnly(unsigned Opcode) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool hasReservedResource
Uses a reserved resource.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
ScheduleHazardRecognizer * HazardRec
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
LLVM_ABI std::pair< unsigned, unsigned > getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle)
Compute the next cycle at which the given processor resource can be scheduled.
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static LLVM_ABI bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getAddressableNumVGPRs(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo &STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
LLVM_READONLY int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode)
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This namespace contains all of the command line option processing machinery.
Definition MCSchedule.h:35
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop, bool BiasPRegsExtra=false)
Minimize physical register live ranges.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries i.e.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1916
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
creates a map MachineInstr -> LiveRegSet R - range of iterators on instructions After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:860
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:74
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Execution frequency information required by scoring heuristics.
SmallVector< uint64_t > Regions
Per-region execution frequencies. 0 when unknown.
uint64_t MinFreq
Minimum and maximum observed frequencies.
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG)
DependencyReuseInfo & reuse(RegisterIdx DepIdx)
RegisterIdx RegIdx
The corresponding register's index in the rematerializer.
A rematerializable register defined by a single machine instruction.
MachineInstr * DefMI
Single MI defining the rematerializable register.
SmallDenseMap< unsigned, RegionUsers, 2 > Uses
Uses of the register, mapped by region.
Register getDefReg() const
Returns the rematerializable register from its defining instruction.