1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17///
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/STLExtras.h"
34#include "llvm/MC/LaneBitmask.h"
36
37#define DEBUG_TYPE "machine-scheduler"
38
39using namespace llvm;
40
42 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
43 cl::desc("Disable unclustered high register pressure "
44 "reduction scheduling stage."),
45 cl::init(false));
46
48 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
49 cl::desc("Disable clustered low occupancy "
50 "rescheduling for ILP scheduling stage."),
51 cl::init(false));
52
54 "amdgpu-schedule-metric-bias", cl::Hidden,
56 "Sets the bias which adds weight to occupancy vs latency. Set it to "
57 "100 to chase the occupancy only."),
58 cl::init(10));
59
60static cl::opt<bool>
61 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
62 cl::desc("Relax occupancy targets for kernels which are memory "
63 "bound (amdgpu-membound-threshold), or "
64 "Wave Limited (amdgpu-limit-wave-threshold)."),
65 cl::init(false));
66
68 "amdgpu-use-amdgpu-trackers", cl::Hidden,
69 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
70 cl::init(false));
71
73 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
75 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
76 cl::init(256));
77
78#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
79#define DUMP_MAX_REG_PRESSURE
81 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
82 cl::desc("Print a list of live registers along with their def/uses at the "
83 "point of maximum register pressure before scheduling."),
84 cl::init(false));
85
87 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
88 cl::desc("Print a list of live registers along with their def/uses at the "
89 "point of maximum register pressure after scheduling."),
90 cl::init(false));
91#endif
92
93const unsigned ScheduleMetrics::ScaleFactor = 100;
94
99
102
103 MF = &DAG->MF;
104
105 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
106
108 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
110 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
111
113 // Set the initial TargetOccupancy to the maximum occupancy that we can
114 // achieve for this function. This effectively sets a lower bound on the
115 // 'Critical' register limits in the scheduler.
116 // Allow lower occupancy targets if the kernel is wave limited or memory
117 // bound and the relaxed occupancy feature is enabled.
121 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
122
123 if (!KnownExcessRP) {
124 VGPRCriticalLimit = std::min(
125 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
127 } else {
128 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
129 // returns a reasonably small number for targets with lots of VGPRs, such
130 // as GFX10 and GFX11.
131 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
132 "VGPRCriticalLimit calculation method.\n");
133 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
134 unsigned Granule =
135 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
136 unsigned Addressable =
137 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
138 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
139 VGPRBudget = std::max(VGPRBudget, Granule);
140 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
141 }
142
143 // Subtract error margin and bias from register limits and avoid overflow.
148
149 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
150 << ", VGPRExcessLimit = " << VGPRExcessLimit
151 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
152 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
153}
154
155/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
156/// current register pressure.
157///
158/// This works for the common case, but it has a few exceptions that have been
159/// observed through trial and error:
160/// - Explicit physical register operands
161/// - Subregister definitions
162///
163/// In both of those cases, PressureDiff doesn't represent the actual pressure,
164/// and querying LiveIntervals through the RegPressureTracker is needed to get
165/// an accurate value.
166///
167/// We should eventually only use PressureDiff for maximum performance, but this
168/// already allows 80% of SUs to take the fast path without changing scheduling
169/// at all. Further changes would either change scheduling, or require a lot
170/// more logic to recover an accurate pressure estimate from the PressureDiffs.
171static bool canUsePressureDiffs(const SUnit &SU) {
172 if (!SU.isInstr())
173 return false;
174
175 // Cannot use pressure diffs for subregister defs or with physregs; the
176 // diffs are imprecise in both cases.
177 for (const auto &Op : SU.getInstr()->operands()) {
178 if (!Op.isReg() || Op.isImplicit())
179 continue;
180 if (Op.getReg().isPhysical() ||
181 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
182 return false;
183 }
184 return true;
185}
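// Illustrative examples of operands that force the slow path above (these are
// not taken from any particular kernel): an explicit physical register operand
// such as $vcc, or a subregister definition such as %0.sub1 = ... . In both
// cases the cached PressureDiff can misreport the pressure change, so the
// RegPressureTracker is queried instead.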
186
188 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
189 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
190 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
191 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
192 // getDownwardPressure() and getUpwardPressure() make temporary changes to
193 // the tracker, so we need to pass those functions a non-const copy.
194 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
195 if (!GCNTrackers) {
196 AtTop
197 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
198 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
199
200 return;
201 }
202
203 // GCNTrackers
204 Pressure.resize(4, 0);
205 MachineInstr *MI = SU->getInstr();
206 GCNRegPressure NewPressure;
207 if (AtTop) {
208 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
209 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
210 } else {
211 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
212 TempUpwardTracker.recede(*MI);
213 NewPressure = TempUpwardTracker.getPressure();
214 }
215 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
216 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
217 NewPressure.getArchVGPRNum();
218 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
219}
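// Note: when the AMDGPU trackers are in use (-amdgpu-use-amdgpu-trackers), the
// Pressure vector filled above only populates the SReg_32, VGPR_32 and AGPR_32
// pressure sets; the remaining entries keep their zero initialization from the
// resize.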
220
222 bool AtTop,
223 const RegPressureTracker &RPTracker,
224 const SIRegisterInfo *SRI,
225 unsigned SGPRPressure,
226 unsigned VGPRPressure, bool IsBottomUp) {
227 Cand.SU = SU;
228 Cand.AtTop = AtTop;
229
230 if (!DAG->isTrackingPressure())
231 return;
232
233 Pressure.clear();
234 MaxPressure.clear();
235
236 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
237 // possible over querying the RegPressureTracker.
238 //
239 // RegPressureTracker will make a lot of LIS queries, which are very
240 // expensive; it is considered the slow path in this context.
241 //
242 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
243 // trivial lookup into an array. It is pretty much free.
244 //
245 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
246 // PressureDiffs.
247 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
248 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
250 } else {
251 // Reserve 4 slots.
252 Pressure.resize(4, 0);
253 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
254 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
255
256 for (const auto &Diff : DAG->getPressureDiff(SU)) {
257 if (!Diff.isValid())
258 continue;
259 // PressureDiffs are always bottom-up, so if we're working top-down we need
260 // to invert their sign.
261 Pressure[Diff.getPSet()] +=
262 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
263 }
264
265#ifdef EXPENSIVE_CHECKS
266 std::vector<unsigned> CheckPressure, CheckMaxPressure;
267 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
269 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
270 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
271 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
272 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
273 errs() << "Register Pressure is inaccurate when calculated through "
274 "PressureDiff\n"
275 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
276 << ", expected "
277 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
278 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
279 << ", expected "
280 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
281 report_fatal_error("inaccurate register pressure calculation");
282 }
283#endif
284 }
285
286 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
287 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
288
289 // If two instructions increase the pressure of different register sets
290 // by the same amount, the generic scheduler will prefer to schedule the
291 // instruction that increases the set with the least amount of registers,
292 // which in our case would be SGPRs. This is rarely what we want, so
293 // when we report excess/critical register pressure, we do it either
294 // only for VGPRs or only for SGPRs.
295
296 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
297 const unsigned MaxVGPRPressureInc = 16;
298 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
299 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
300
301 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
302 // to increase the likelihood we don't go over the limits. We should improve
303 // the analysis to look through dependencies to find the path with the least
304 // register pressure.
305
306 // We only need to update the RPDelta for instructions that increase register
307 // pressure. Instructions that decrease or keep reg pressure the same will be
308 // marked as RegExcess in tryCandidate() when they are compared with
309 // instructions that increase the register pressure.
310 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
311 HasHighPressure = true;
312 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
313 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
314 }
315
316 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
317 HasHighPressure = true;
318 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
319 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
320 }
321
322 // Register pressure is considered 'CRITICAL' if it is approaching a value
323 // that would reduce the wave occupancy for the execution unit. When
324 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure has
325 // the same cost, so we don't need to prefer one over the other.
326
327 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
328 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
329
330 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
331 HasHighPressure = true;
332 if (SGPRDelta > VGPRDelta) {
333 Cand.RPDelta.CriticalMax =
334 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
335 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
336 } else {
337 Cand.RPDelta.CriticalMax =
338 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
339 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
340 }
341 }
342}
343
345 const TargetSchedModel *SchedModel) {
346 bool HasBufferedModel =
347 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
348 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
349 return Combined <= PendingQueueLimit && HasBufferedModel;
350}
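// Example (illustrative numbers): with the default
// -amdgpu-scheduler-pending-queue-limit of 256, a zone holding 200 available
// and 80 pending SUnits (280 combined) is not scanned for pending candidates,
// while 100 + 100 would be. A limit of 0 effectively disables pending-queue
// inspection, and targets without a buffered machine model never use it.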
351
353 const TargetSchedModel *SchedModel) {
354 // pickOnlyChoice() releases pending instructions and checks for new hazards.
355 SUnit *OnlyChoice = Zone.pickOnlyChoice();
356 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
357 return OnlyChoice;
358
359 return nullptr;
360}
361
363 const SchedCandidate &Preferred) {
364 LLVM_DEBUG({
365 dbgs() << "Prefer:\t\t";
366 DAG->dumpNode(*Preferred.SU);
367
368 if (Current.SU) {
369 dbgs() << "Not:\t";
370 DAG->dumpNode(*Current.SU);
371 }
372
373 dbgs() << "Reason:\t\t";
374 traceCandidate(Preferred);
375 });
376}
377
378// This function is mostly cut and pasted from
379// GenericScheduler::pickNodeFromQueue()
381 const CandPolicy &ZonePolicy,
382 const RegPressureTracker &RPTracker,
383 SchedCandidate &Cand, bool &IsPending,
384 bool IsBottomUp) {
385 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
387 unsigned SGPRPressure = 0;
388 unsigned VGPRPressure = 0;
389 IsPending = false;
390 if (DAG->isTrackingPressure()) {
391 if (!GCNTrackers) {
392 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
393 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
394 } else {
395 GCNRPTracker *T = IsBottomUp
396 ? static_cast<GCNRPTracker *>(&UpwardTracker)
397 : static_cast<GCNRPTracker *>(&DownwardTracker);
398 SGPRPressure = T->getPressure().getSGPRNum();
399 VGPRPressure = T->getPressure().getArchVGPRNum();
400 }
401 }
402 LLVM_DEBUG(dbgs() << "Available Q:\n");
403 ReadyQueue &AQ = Zone.Available;
404 for (SUnit *SU : AQ) {
405
406 SchedCandidate TryCand(ZonePolicy);
407 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
408 VGPRPressure, IsBottomUp);
409 // Pass SchedBoundary only when comparing nodes from the same boundary.
410 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
411 tryCandidate(Cand, TryCand, ZoneArg);
412 if (TryCand.Reason != NoCand) {
413 // Initialize resource delta if needed in case future heuristics query it.
414 if (TryCand.ResDelta == SchedResourceDelta())
415 TryCand.initResourceDelta(Zone.DAG, SchedModel);
416 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
417 Cand.setBest(TryCand);
418 } else {
419 printCandidateDecision(TryCand, Cand);
420 }
421 }
422
423 if (!shouldCheckPending(Zone, SchedModel))
424 return;
425
426 LLVM_DEBUG(dbgs() << "Pending Q:\n");
427 ReadyQueue &PQ = Zone.Pending;
428 for (SUnit *SU : PQ) {
429
430 SchedCandidate TryCand(ZonePolicy);
431 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
432 VGPRPressure, IsBottomUp);
433 // Pass SchedBoundary only when comparing nodes from the same boundary.
434 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
435 tryPendingCandidate(Cand, TryCand, ZoneArg);
436 if (TryCand.Reason != NoCand) {
437 // Initialize resource delta if needed in case future heuristics query it.
438 if (TryCand.ResDelta == SchedResourceDelta())
439 TryCand.initResourceDelta(Zone.DAG, SchedModel);
440 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
441 IsPending = true;
442 Cand.setBest(TryCand);
443 } else {
444 printCandidateDecision(TryCand, Cand);
445 }
446 }
447}
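// A candidate taken from the pending queue (IsPending == true) has not been
// released into the available queue yet; pickNode() below accounts for this by
// bumping the current cycle up to the SU's ready cycle before scheduling it.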
448
449// This function is mostly cut and pasted from
450// GenericScheduler::pickNodeBidirectional()
452 bool &PickedPending) {
453 // Schedule as far as possible in the direction of no choice. This is most
454 // efficient, but also provides the best heuristics for CriticalPSets.
455 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
456 IsTopNode = false;
457 return SU;
458 }
459 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
460 IsTopNode = true;
461 return SU;
462 }
463 // Set the bottom-up policy based on the state of the current bottom zone
464 // and the instructions outside the zone, including the top zone.
465 CandPolicy BotPolicy;
466 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
467 // Set the top-down policy based on the state of the current top zone and
468 // the instructions outside the zone, including the bottom zone.
469 CandPolicy TopPolicy;
470 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
471
472 bool BotPending = false;
473 // See if BotCand is still valid (because we previously scheduled from Top).
474 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
475 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
476 BotCand.Policy != BotPolicy) {
477 BotCand.reset(CandPolicy());
478 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
479 BotPending,
480 /*IsBottomUp=*/true);
481 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
482 } else {
484#ifndef NDEBUG
485 if (VerifyScheduling) {
486 SchedCandidate TCand;
487 TCand.reset(CandPolicy());
488 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
489 BotPending,
490 /*IsBottomUp=*/true);
491 assert(TCand.SU == BotCand.SU &&
492 "Last pick result should correspond to re-picking right now");
493 }
494#endif
495 }
496
497 bool TopPending = false;
498 // Check if the top Q has a better candidate.
499 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
500 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
501 TopCand.Policy != TopPolicy) {
502 TopCand.reset(CandPolicy());
503 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
504 TopPending,
505 /*IsBottomUp=*/false);
506 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
507 } else {
509#ifndef NDEBUG
510 if (VerifyScheduling) {
511 SchedCandidate TCand;
512 TCand.reset(CandPolicy());
513 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
514 TopPending,
515 /*IsBottomUp=*/false);
516 assert(TCand.SU == TopCand.SU &&
517 "Last pick result should correspond to re-picking right now");
518 }
519#endif
520 }
521
522 // Pick best from BotCand and TopCand.
523 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
524 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
525 SchedCandidate Cand = BotPending ? TopCand : BotCand;
526 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
527 PickedPending = BotPending && TopPending;
528
529 TryCand.Reason = NoCand;
530 if (BotPending || TopPending) {
531 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
532 } else {
533 tryCandidate(Cand, TryCand, nullptr);
534 }
535
536 if (TryCand.Reason != NoCand) {
537 Cand.setBest(TryCand);
538 }
539
540 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
541
542 IsTopNode = Cand.AtTop;
543 return Cand.SU;
544}
545
546// This function is mostly cut and pasted from
547// GenericScheduler::pickNode()
549 if (DAG->top() == DAG->bottom()) {
550 assert(Top.Available.empty() && Top.Pending.empty() &&
551 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
552 return nullptr;
553 }
554 bool PickedPending;
555 SUnit *SU;
556 do {
557 PickedPending = false;
558 if (RegionPolicy.OnlyTopDown) {
560 if (!SU) {
561 CandPolicy NoPolicy;
562 TopCand.reset(NoPolicy);
563 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
564 PickedPending,
565 /*IsBottomUp=*/false);
566 assert(TopCand.Reason != NoCand && "failed to find a candidate");
567 SU = TopCand.SU;
568 }
569 IsTopNode = true;
570 } else if (RegionPolicy.OnlyBottomUp) {
572 if (!SU) {
573 CandPolicy NoPolicy;
574 BotCand.reset(NoPolicy);
575 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
576 PickedPending,
577 /*IsBottomUp=*/true);
578 assert(BotCand.Reason != NoCand && "failed to find a candidate");
579 SU = BotCand.SU;
580 }
581 IsTopNode = false;
582 } else {
583 SU = pickNodeBidirectional(IsTopNode, PickedPending);
584 }
585 } while (SU->isScheduled);
586
587 if (PickedPending) {
588 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
589 SchedBoundary &Zone = IsTopNode ? Top : Bot;
590 unsigned CurrentCycle = Zone.getCurrCycle();
591 if (ReadyCycle > CurrentCycle)
592 Zone.bumpCycle(ReadyCycle);
593
594 // FIXME: checkHazard() doesn't give information about which cycle the
595 // hazard will resolve so just keep bumping the cycle by 1. This could be
596 // made more efficient if checkHazard() returned more details.
597 while (Zone.checkHazard(SU))
598 Zone.bumpCycle(Zone.getCurrCycle() + 1);
599
600 Zone.releasePending();
601 }
602
603 if (SU->isTopReady())
604 Top.removeReady(SU);
605 if (SU->isBottomReady())
606 Bot.removeReady(SU);
607
608 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
609 << *SU->getInstr());
610 return SU;
611}
612
613void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
614 if (GCNTrackers) {
615 MachineInstr *MI = SU->getInstr();
616 IsTopNode ? (void)DownwardTracker.advance(MI, false)
617 : UpwardTracker.recede(*MI);
618 }
619
620 return GenericScheduler::schedNode(SU, IsTopNode);
621}
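// The GCN-specific trackers advance in lockstep with each scheduled node, so
// pressure queries made for later candidates reflect the partially built
// schedule rather than the original instruction order.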
622
627
630 if (!CurrentStage)
631 CurrentStage = SchedStages.begin();
632 else
633 CurrentStage++;
634
635 return CurrentStage != SchedStages.end();
636}
637
640 return std::next(CurrentStage) != SchedStages.end();
641}
642
644 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
645 return *std::next(CurrentStage);
646}
647
649 SchedCandidate &TryCand,
650 SchedBoundary *Zone) const {
651 // Initialize the candidate if needed.
652 if (!Cand.isValid()) {
653 TryCand.Reason = NodeOrder;
654 return true;
655 }
656
657 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
658 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
659 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
660 return TryCand.Reason != NoCand;
661
662 // Avoid exceeding the target's limit.
663 if (DAG->isTrackingPressure() &&
664 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
665 RegExcess, TRI, DAG->MF))
666 return TryCand.Reason != NoCand;
667
668 // Avoid increasing the max critical pressure in the scheduled region.
669 if (DAG->isTrackingPressure() &&
671 TryCand, Cand, RegCritical, TRI, DAG->MF))
672 return TryCand.Reason != NoCand;
673
674 bool SameBoundary = Zone != nullptr;
675 if (SameBoundary) {
678 TryCand, Cand, ResourceReduce))
679 return TryCand.Reason != NoCand;
681 Cand.ResDelta.DemandedResources, TryCand, Cand,
683 return TryCand.Reason != NoCand;
684 }
685
686 return false;
687}
688
698
703
705 SchedCandidate &TryCand,
706 SchedBoundary *Zone) const {
707 // Initialize the candidate if needed.
708 if (!Cand.isValid()) {
709 TryCand.Reason = NodeOrder;
710 return true;
711 }
712
713 // Avoid spilling by exceeding the register limit.
714 if (DAG->isTrackingPressure() &&
715 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
716 RegExcess, TRI, DAG->MF))
717 return TryCand.Reason != NoCand;
718
719 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
720 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
721 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
722 return TryCand.Reason != NoCand;
723
724 bool SameBoundary = Zone != nullptr;
725 if (SameBoundary) {
726 // Prioritize instructions that read unbuffered resources by stall cycles.
727 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
728 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
729 return TryCand.Reason != NoCand;
730
731 // Avoid critical resource consumption and balance the schedule.
734 TryCand, Cand, ResourceReduce))
735 return TryCand.Reason != NoCand;
737 Cand.ResDelta.DemandedResources, TryCand, Cand,
739 return TryCand.Reason != NoCand;
740
741 // Unconditionally try to reduce latency.
742 if (tryLatency(TryCand, Cand, *Zone))
743 return TryCand.Reason != NoCand;
744
745 // Weak edges are for clustering and other constraints.
746 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
747 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
748 return TryCand.Reason != NoCand;
749 }
750
751 // Keep clustered nodes together to encourage downstream peephole
752 // optimizations which may reduce resource requirements.
753 //
754 // This is a best effort to set things up for a post-RA pass. Optimizations
755 // like generating loads of multiple registers should ideally be done within
756 // the scheduler pass by combining the loads during DAG postprocessing.
757 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
758 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
759 bool CandIsClusterSucc =
760 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
761 bool TryCandIsClusterSucc =
762 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
763 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
764 Cluster))
765 return TryCand.Reason != NoCand;
766
767 // Avoid increasing the max critical pressure in the scheduled region.
768 if (DAG->isTrackingPressure() &&
770 TryCand, Cand, RegCritical, TRI, DAG->MF))
771 return TryCand.Reason != NoCand;
772
773 // Avoid increasing the max pressure of the entire region.
774 if (DAG->isTrackingPressure() &&
775 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
776 Cand, RegMax, TRI, DAG->MF))
777 return TryCand.Reason != NoCand;
778
779 if (SameBoundary) {
780 // Fall through to original instruction order.
781 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
782 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
783 TryCand.Reason = NodeOrder;
784 return true;
785 }
786 }
787 return false;
788}
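// The ordering of the checks above defines the heuristic priority for this
// strategy: avoiding spills (RegExcess) first, then physreg biasing, stalls,
// resource balance, latency, weak edges, clustering, critical and maximum
// region pressure, and finally the original instruction order as a
// tie-breaker.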
789
795
796/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
797/// instructions as much as possible. This is achieved by:
798/// 1. Prioritizing clustered operations before the stall latency heuristic.
799/// 2. Prioritizing long-latency loads before the stall latency heuristic.
800///
801/// \param Cand provides the policy and current best candidate.
802/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
803/// \param Zone describes the scheduled zone that we are extending, or nullptr
804/// if Cand is from a different zone than TryCand.
805/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
807 SchedCandidate &TryCand,
808 SchedBoundary *Zone) const {
809 // Initialize the candidate if needed.
810 if (!Cand.isValid()) {
811 TryCand.Reason = NodeOrder;
812 return true;
813 }
814
815 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
816 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
817 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
818 return TryCand.Reason != NoCand;
819
820 if (DAG->isTrackingPressure()) {
821 // Avoid exceeding the target's limit.
822 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
823 RegExcess, TRI, DAG->MF))
824 return TryCand.Reason != NoCand;
825
826 // Avoid increasing the max critical pressure in the scheduled region.
828 TryCand, Cand, RegCritical, TRI, DAG->MF))
829 return TryCand.Reason != NoCand;
830 }
831
832 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
833 // get more benefit from clausing these memory instructions.
834 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
835 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
836 bool CandIsClusterSucc =
837 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
838 bool TryCandIsClusterSucc =
839 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
840 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
841 Cluster))
842 return TryCand.Reason != NoCand;
843
844 // We only compare a subset of features when comparing nodes between
845 // Top and Bottom boundary. Some properties are simply incomparable; in many
846 // other instances we should only override the other boundary if something
847 // is a clearly good pick on one boundary. Skip heuristics that are more
848 // "tie-breaking" in nature.
849 bool SameBoundary = Zone != nullptr;
850 if (SameBoundary) {
851 // For loops that are acyclic path limited, aggressively schedule for
852 // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
853 // heuristics to take precedence.
854 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
855 tryLatency(TryCand, Cand, *Zone))
856 return TryCand.Reason != NoCand;
857
858 // MaxMemoryClause-specific: Prioritize long latency memory load
859 // instructions in top-bottom order to hide more latency. The mayLoad check
860 // is used to exclude store-like instructions, which we do not want to
861 // schedule too early.
862 bool TryMayLoad =
863 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
864 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
865
866 if (TryMayLoad || CandMayLoad) {
867 bool TryLongLatency =
868 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
869 bool CandLongLatency =
870 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
871
872 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
873 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
874 Cand, Stall))
875 return TryCand.Reason != NoCand;
876 }
877 // Prioritize instructions that read unbuffered resources by stall cycles.
878 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
879 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
880 return TryCand.Reason != NoCand;
881 }
882
883 if (SameBoundary) {
884 // Weak edges are for clustering and other constraints.
885 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
886 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
887 return TryCand.Reason != NoCand;
888 }
889
890 // Avoid increasing the max pressure of the entire region.
891 if (DAG->isTrackingPressure() &&
892 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
893 Cand, RegMax, TRI, DAG->MF))
894 return TryCand.Reason != NoCand;
895
896 if (SameBoundary) {
897 // Avoid critical resource consumption and balance the schedule.
900 TryCand, Cand, ResourceReduce))
901 return TryCand.Reason != NoCand;
903 Cand.ResDelta.DemandedResources, TryCand, Cand,
905 return TryCand.Reason != NoCand;
906
907 // Avoid serializing long latency dependence chains.
908 // For acyclic path limited loops, latency was already checked above.
909 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
910 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
911 return TryCand.Reason != NoCand;
912
913 // Fall through to original instruction order.
914 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
915 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
916 TryCand.Reason = NodeOrder;
917 return true;
918 }
919 }
920
921 return false;
922}
923
925 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
926 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
927 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
928 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
929 RegionLiveOuts(this, /*IsLiveOut=*/true) {
930
931 // We want regions with a single MI to be scheduled so that we can reason
932 // about them correctly during scheduling stages that move MIs between regions
933 // (e.g., rematerialization).
935 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
936 if (RelaxedOcc) {
937 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
938 if (MinOccupancy != StartingOccupancy)
939 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
940 << ".\n");
941 }
942}
943
944std::unique_ptr<GCNSchedStage>
945GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
946 switch (SchedStageID) {
948 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
950 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
952 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
954 return std::make_unique<PreRARematStage>(SchedStageID, *this);
956 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
958 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
959 *this);
960 }
961
962 llvm_unreachable("Unknown SchedStageID.");
963}
964
966 // Collect all scheduling regions. The actual scheduling is performed in
967 // GCNScheduleDAGMILive::finalizeSchedule.
968 Regions.push_back(std::pair(RegionBegin, RegionEnd));
969}
970
972GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
974 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
975 &LiveIns[RegionIdx]);
976 return RPTracker.moveMaxPressure();
977}
978
980 MachineBasicBlock::iterator RegionEnd) {
981 assert(RegionBegin != RegionEnd && "Region must not be empty");
982 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
983}
984
985void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
986 const MachineBasicBlock *MBB) {
987 GCNDownwardRPTracker RPTracker(*LIS);
988
989 // If the block has only one successor then the live-ins of that successor
990 // are live-outs of the current block. We can reuse the calculated live set
991 // if the successor will be sent to scheduling past the current block.
992
993 // However, due to a bug in LiveInterval analysis it may happen that two
994 // predecessors of the same successor block have different lane bitmasks for
995 // a live-out register. Work around that by sticking to a one-to-one
996 // relationship, i.e. one predecessor with one successor block.
997 const MachineBasicBlock *OnlySucc = nullptr;
998 if (MBB->succ_size() == 1) {
999 auto *Candidate = *MBB->succ_begin();
1000 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1001 SlotIndexes *Ind = LIS->getSlotIndexes();
1002 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1003 OnlySucc = Candidate;
1004 }
1005 }
1006
1007 // Scheduler sends regions from the end of the block upwards.
1008 size_t CurRegion = RegionIdx;
1009 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1010 if (Regions[CurRegion].first->getParent() != MBB)
1011 break;
1012 --CurRegion;
1013
1014 auto I = MBB->begin();
1015 auto LiveInIt = MBBLiveIns.find(MBB);
1016 auto &Rgn = Regions[CurRegion];
1017 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1018 if (LiveInIt != MBBLiveIns.end()) {
1019 auto LiveIn = std::move(LiveInIt->second);
1020 RPTracker.reset(*MBB->begin(), &LiveIn);
1021 MBBLiveIns.erase(LiveInIt);
1022 } else {
1023 I = Rgn.first;
1024 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1025#ifdef EXPENSIVE_CHECKS
1026 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1027#endif
1028 RPTracker.reset(*I, &LRS);
1029 }
1030
1031 for (;;) {
1032 I = RPTracker.getNext();
1033
1034 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1035 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1036 RPTracker.clearMaxPressure();
1037 }
1038
1039 if (Regions[CurRegion].second == I) {
1040 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1041 if (CurRegion-- == RegionIdx)
1042 break;
1043 auto &Rgn = Regions[CurRegion];
1044 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1045 }
1046 RPTracker.advanceToNext();
1047 RPTracker.advanceBeforeNext();
1048 }
1049
1050 if (OnlySucc) {
1051 if (I != MBB->end()) {
1052 RPTracker.advanceToNext();
1053 RPTracker.advance(MBB->end());
1054 }
1055 RPTracker.advanceBeforeNext();
1056 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1057 }
1058}
1059
1061GCNScheduleDAGMILive::getRegionLiveInMap() const {
1062 assert(!Regions.empty());
1063 std::vector<MachineInstr *> RegionFirstMIs;
1064 RegionFirstMIs.reserve(Regions.size());
1065 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1066 RegionFirstMIs.push_back(
1068
1069 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1070}
1071
1073GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1074 assert(!Regions.empty());
1075 std::vector<MachineInstr *> RegionLastMIs;
1076 RegionLastMIs.reserve(Regions.size());
1077 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1078 // Skip empty regions.
1079 if (RegionBegin == RegionEnd)
1080 continue;
1081 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1082 }
1083 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1084}
1085
1087 IdxToInstruction.clear();
1088
1089 RegionLiveRegMap =
1090 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1091 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1092 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1093 // Skip empty regions.
1094 if (RegionBegin == RegionEnd)
1095 continue;
1096 MachineInstr *RegionKey =
1097 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1098 IdxToInstruction[I] = RegionKey;
1099 }
1100}
1101
1103 // Start actual scheduling here. This function is called by the base
1104 // MachineScheduler after all regions have been recorded by
1105 // GCNScheduleDAGMILive::schedule().
1106 LiveIns.resize(Regions.size());
1107 Pressure.resize(Regions.size());
1108 RegionsWithHighRP.resize(Regions.size());
1109 RegionsWithExcessRP.resize(Regions.size());
1110 RegionsWithIGLPInstrs.resize(Regions.size());
1111 RegionsWithHighRP.reset();
1112 RegionsWithExcessRP.reset();
1113 RegionsWithIGLPInstrs.reset();
1114
1115 runSchedStages();
1116}
1117
1118void GCNScheduleDAGMILive::runSchedStages() {
1119 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1120
1121 if (!Regions.empty()) {
1122 BBLiveInMap = getRegionLiveInMap();
1123 if (GCNTrackers)
1124 RegionLiveOuts.buildLiveRegMap();
1125 }
1126
1127#ifdef DUMP_MAX_REG_PRESSURE
1131 LIS->dump();
1132 }
1133#endif
1134
1135 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1136 while (S.advanceStage()) {
1137 auto Stage = createSchedStage(S.getCurrentStage());
1138 if (!Stage->initGCNSchedStage())
1139 continue;
1140
1141 for (auto Region : Regions) {
1142 RegionBegin = Region.first;
1143 RegionEnd = Region.second;
1144 // Set up the region for scheduling and check whether it should be skipped.
1145 if (!Stage->initGCNRegion()) {
1146 Stage->advanceRegion();
1147 exitRegion();
1148 continue;
1149 }
1150
1151 if (GCNTrackers) {
1152 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1153 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1154 GCNRPTracker::LiveRegSet *RegionLiveIns =
1155 &LiveIns[Stage->getRegionIdx()];
1156
1157 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1158 ->reset(MRI, *RegionLiveIns);
1159 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1160 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
1161 Stage->getRegionIdx()));
1162 }
1163
1165 Stage->finalizeGCNRegion();
1166 }
1167
1168 Stage->finalizeGCNSchedStage();
1169 }
1170
1171#ifdef DUMP_MAX_REG_PRESSURE
1175 LIS->dump();
1176 }
1177#endif
1178}
1179
1180#ifndef NDEBUG
1182 switch (StageID) {
1184 OS << "Max Occupancy Initial Schedule";
1185 break;
1187 OS << "Unclustered High Register Pressure Reschedule";
1188 break;
1190 OS << "Clustered Low Occupancy Reschedule";
1191 break;
1193 OS << "Pre-RA Rematerialize";
1194 break;
1196 OS << "Max ILP Initial Schedule";
1197 break;
1199 OS << "Max memory clause Initial Schedule";
1200 break;
1201 }
1202
1203 return OS;
1204}
1205#endif
1206
1210
1212 if (!DAG.LIS)
1213 return false;
1214
1215 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1216 return true;
1217}
1218
1221 return false;
1222
1224 return false;
1225
1226 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1227 return false;
1228
1229 SavedMutations.swap(DAG.Mutations);
1230 DAG.addMutation(
1232
1233 InitialOccupancy = DAG.MinOccupancy;
1234 // Aggressively try to reduce register pressure in the unclustered high RP
1235 // stage. Temporarily increase occupancy target in the region.
1236 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1237 ? InitialOccupancy + 1
1238 : InitialOccupancy;
1239 IsAnyRegionScheduled = false;
1240 S.SGPRLimitBias = S.HighRPSGPRBias;
1241 S.VGPRLimitBias = S.HighRPVGPRBias;
1242
1243 LLVM_DEBUG(
1244 dbgs()
1245 << "Retrying function scheduling without clustering. "
1246 "Aggressively try to reduce register pressure to achieve occupancy "
1247 << TempTargetOccupancy << ".\n");
1248
1249 return true;
1250}
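// Note that this stage can be disabled on the command line with
// -amdgpu-disable-unclustered-high-rp-reschedule (see the cl::opt declared
// near the top of this file).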
1251
1254 return false;
1255
1257 return false;
1258
1259 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1260 // been dropped. All regions will have already been scheduled with the ideal
1261 // occupancy targets.
1262 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1263 return false;
1264
1265 LLVM_DEBUG(
1266 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1267 << DAG.MinOccupancy << ".\n");
1268 return true;
1269}
1270
1271/// Allows to easily filter for this stage's debug output.
1272#define REMAT_PREFIX "[PreRARemat] "
1273#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1274
1276 // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
1277 // regions in between the defs and the region we sank the def to. This will
1278 // need to be fixed if there is another pass after this pass.
1279 assert(!S.hasNextStage());
1280
1281 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
1282 return false;
1283
1284 // Before performing any IR modification record the parent region of each MI
1285 // and the parent MBB of each region.
1286 const unsigned NumRegions = DAG.Regions.size();
1287 RegionBB.reserve(NumRegions);
1288 for (unsigned I = 0; I < NumRegions; ++I) {
1289 RegionBoundaries Region = DAG.Regions[I];
1290 for (auto MI = Region.first; MI != Region.second; ++MI)
1291 MIRegion.insert({&*MI, I});
1292 RegionBB.push_back(Region.first->getParent());
1293 }
1294
1295 if (!canIncreaseOccupancyOrReduceSpill())
1296 return false;
1297
1298 // Rematerialize identified instructions and update scheduler's state.
1299 rematerialize();
1300 if (GCNTrackers)
1301 DAG.RegionLiveOuts.buildLiveRegMap();
1302 REMAT_DEBUG({
1303 dbgs() << "Retrying function scheduling with new min. occupancy of "
1304 << AchievedOcc << " from rematerializing (original was "
1305 << DAG.MinOccupancy;
1306 if (TargetOcc)
1307 dbgs() << ", target was " << *TargetOcc;
1308 dbgs() << ")\n";
1309 });
1310
1311 if (AchievedOcc > DAG.MinOccupancy) {
1312 DAG.MinOccupancy = AchievedOcc;
1314 MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1315 }
1316 return true;
1317}
1318
1320 DAG.finishBlock();
1321 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1322}
1323
1325 SavedMutations.swap(DAG.Mutations);
1326 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1327 if (DAG.MinOccupancy > InitialOccupancy) {
1328 assert(IsAnyRegionScheduled);
1330 << " stage successfully increased occupancy to "
1331 << DAG.MinOccupancy << '\n');
1332 } else if (!IsAnyRegionScheduled) {
1333 assert(DAG.MinOccupancy == InitialOccupancy);
1335 << ": No regions scheduled, min occupancy stays at "
1336 << DAG.MinOccupancy << ", MFI occupancy stays at "
1337 << MFI.getOccupancy() << ".\n");
1338 }
1339
1341}
1342
1344 // Check whether this new region is also a new block.
1345 if (DAG.RegionBegin->getParent() != CurrentMBB)
1346 setupNewBlock();
1347
1348 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1349 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1350
1351 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1352 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1353 return false;
1354
1355 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1356 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1357 << " " << CurrentMBB->getName()
1358 << "\n From: " << *DAG.begin() << " To: ";
1359 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1360 else dbgs() << "End";
1361 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1362
1363 // Save original instruction order before scheduling for possible revert.
1364 Unsched.clear();
1365 Unsched.reserve(DAG.NumRegionInstrs);
1368 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1369 for (auto &I : DAG) {
1370 Unsched.push_back(&I);
1371 if (SII->isIGLPMutationOnly(I.getOpcode()))
1372 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1373 }
1374 } else {
1375 for (auto &I : DAG)
1376 Unsched.push_back(&I);
1377 }
1378
1379 PressureBefore = DAG.Pressure[RegionIdx];
1380
1381 LLVM_DEBUG(
1382 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1383 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1384 << "Region live-in pressure: "
1385 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1386 << "Region register pressure: " << print(PressureBefore));
1387
1388 S.HasHighPressure = false;
1389 S.KnownExcessRP = isRegionWithExcessRP();
1390
1391 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1393 SavedMutations.clear();
1394 SavedMutations.swap(DAG.Mutations);
1395 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1397 DAG.addMutation(createIGroupLPDAGMutation(
1398 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1400 }
1401
1402 return true;
1403}
1404
1406 // Only reschedule regions that have excess register pressure (i.e. spilling)
1407 // or had minimum occupancy at the beginning of the stage (as long as
1408 // rescheduling of previous regions did not make occupancy drop back down to
1409 // the initial minimum).
1410 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1411 // If no region has been scheduled yet, the DAG has not yet been updated with
1412 // the occupancy target. So retrieve it from the temporary.
1413 unsigned CurrentTargetOccupancy =
1414 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1415 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1416 (CurrentTargetOccupancy <= InitialOccupancy ||
1417 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1418 InitialOccupancy))
1419 return false;
1420
1421 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1422 // If this is the first region scheduled during this stage, make the target
1423 // occupancy changes in the DAG and MFI.
1424 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1425 IsAnyRegionScheduled = true;
1426 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
1427 DAG.MinOccupancy = TempTargetOccupancy;
1428 MFI.increaseOccupancy(MF, TempTargetOccupancy);
1429 }
1430 }
1431 return IsSchedulingThisRegion;
1432}
1433
1435 // We may need to reschedule this region if it wasn't rescheduled in the last
1436 // stage, or if we found it was testing critical register pressure limits in
1437 // the unclustered reschedule stage. The latter is because we may not have
1438 // been able to raise the min occupancy in the previous stage, so the region
1439 // may be overly constrained even if it was already rescheduled.
1440 if (!DAG.RegionsWithHighRP[RegionIdx])
1441 return false;
1442
1444}
1445
1447 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1448}
1449
1451 if (CurrentMBB)
1452 DAG.finishBlock();
1453
1454 CurrentMBB = DAG.RegionBegin->getParent();
1455 DAG.startBlock(CurrentMBB);
1456 // Get the real RP for the region if it hasn't been calculated before. After
1457 // the initial schedule stage, real RP will be collected after scheduling.
1461 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1462}
1463
1465 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1466 if (S.HasHighPressure)
1467 DAG.RegionsWithHighRP[RegionIdx] = true;
1468
1469 // Revert scheduling if we have dropped occupancy or there is some other
1470 // reason that the original schedule is better.
1472
1473 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1475 SavedMutations.swap(DAG.Mutations);
1476
1477 DAG.exitRegion();
1478 advanceRegion();
1479}
1480
1482 // Check the results of scheduling.
1483 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1484
1485 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1486 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1487
1488 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1489
1490 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1491 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1492 DAG.Pressure[RegionIdx] = PressureAfter;
1493
1494 // Early out if we have achieved the occupancy target.
1495 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1496 return;
1497 }
1498
1499 unsigned TargetOccupancy = std::min(
1500 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1501 unsigned WavesAfter = std::min(
1502 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1503 unsigned WavesBefore = std::min(
1504 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1505 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1506 << ", after " << WavesAfter << ".\n");
1507
1508 // We may not be able to keep the current target occupancy because of the just
1509 // scheduled region. We might still be able to revert scheduling if the
1510 // occupancy before was higher, or if the current schedule has register
1511 // pressure higher than the excess limits which could lead to more spilling.
1512 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1513
1514 // Allow memory bound functions to drop to 4 waves if not limited by an
1515 // attribute.
1516 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1517 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1518 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1519 << MFI.getMinAllowedOccupancy() << " waves\n");
1520 NewOccupancy = WavesAfter;
1521 }
1522
1523 if (NewOccupancy < DAG.MinOccupancy) {
1524 DAG.MinOccupancy = NewOccupancy;
1525 MFI.limitOccupancy(DAG.MinOccupancy);
1526 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1527 << DAG.MinOccupancy << ".\n");
1528 }
1529 // The maximum number of arch VGPRs on a non-unified register file, or the
1530 // maximum number of VGPRs + AGPRs in the unified register file case.
1531 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1532 // The maximum number of arch VGPRs for both unified and non-unified register
1533 // files.
1534 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1535 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1536
1537 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1538 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1539 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1540 PressureAfter.getSGPRNum() > MaxSGPRs) {
1541 DAG.RegionsWithHighRP[RegionIdx] = true;
1542 DAG.RegionsWithExcessRP[RegionIdx] = true;
1543 }
1544
1545 // Revert if this region's schedule would cause a drop in occupancy or
1546 // spilling.
1547 if (shouldRevertScheduling(WavesAfter))
1549 else
1550 DAG.Pressure[RegionIdx] = PressureAfter;
1551}
1552
1553unsigned
1554GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1555 DenseMap<unsigned, unsigned> &ReadyCycles,
1556 const TargetSchedModel &SM) {
1557 unsigned ReadyCycle = CurrCycle;
1558 for (auto &D : SU.Preds) {
1559 if (D.isAssignedRegDep()) {
1560 MachineInstr *DefMI = D.getSUnit()->getInstr();
1561 unsigned Latency = SM.computeInstrLatency(DefMI);
1562 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1563 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1564 }
1565 }
1566 ReadyCycles[SU.NodeNum] = ReadyCycle;
1567 return ReadyCycle;
1568}
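// Worked example (illustrative): for an SU with two register-dependence preds,
// one ready at cycle 2 whose def has a latency of 4 and one ready at cycle 5
// with a latency of 1, the SU's ready cycle is max(CurrCycle, 2 + 4, 5 + 1),
// i.e. 6 when CurrCycle <= 6.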
1569
1570#ifndef NDEBUG
1572 bool operator()(std::pair<MachineInstr *, unsigned> A,
1573 std::pair<MachineInstr *, unsigned> B) const {
1574 return A.second < B.second;
1575 }
1576};
1577
1578static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1579 EarlierIssuingCycle> &ReadyCycles) {
1580 if (ReadyCycles.empty())
1581 return;
1582 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1583 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1584 << " ##################\n# Cycle #\t\t\tInstruction "
1585 " "
1586 " \n";
1587 unsigned IPrev = 1;
1588 for (auto &I : ReadyCycles) {
1589 if (I.second > IPrev + 1)
1590 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1591 << " CYCLES DETECTED ******************************\n\n";
1592 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1593 IPrev = I.second;
1594 }
1595}
1596#endif
1597
1599GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1600#ifndef NDEBUG
1601 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1602 ReadyCyclesSorted;
1603#endif
1604 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1605 unsigned SumBubbles = 0;
1606 DenseMap<unsigned, unsigned> ReadyCycles;
1607 unsigned CurrCycle = 0;
1608 for (auto &SU : InputSchedule) {
1609 unsigned ReadyCycle =
1610 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1611 SumBubbles += ReadyCycle - CurrCycle;
1612#ifndef NDEBUG
1613 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1614#endif
1615 CurrCycle = ++ReadyCycle;
1616 }
1617#ifndef NDEBUG
1618 LLVM_DEBUG(
1619 printScheduleModel(ReadyCyclesSorted);
1620 dbgs() << "\n\t"
1621 << "Metric: "
1622 << (SumBubbles
1623 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1624 : 1)
1625 << "\n\n");
1626#endif
1627
1628 return ScheduleMetrics(CurrCycle, SumBubbles);
1629}
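// As printed in the debug output above, the resulting metric is roughly
// SumBubbles * ScheduleMetrics::ScaleFactor / CurrCycle, i.e. the percentage
// of issue cycles spent in bubbles; lower values indicate a denser schedule.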
1630
1633#ifndef NDEBUG
1634 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1635 ReadyCyclesSorted;
1636#endif
1637 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1638 unsigned SumBubbles = 0;
1639 DenseMap<unsigned, unsigned> ReadyCycles;
1640 unsigned CurrCycle = 0;
1641 for (auto &MI : DAG) {
1642 SUnit *SU = DAG.getSUnit(&MI);
1643 if (!SU)
1644 continue;
1645 unsigned ReadyCycle =
1646 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1647 SumBubbles += ReadyCycle - CurrCycle;
1648#ifndef NDEBUG
1649 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1650#endif
1651 CurrCycle = ++ReadyCycle;
1652 }
1653#ifndef NDEBUG
1654 LLVM_DEBUG(
1655 printScheduleModel(ReadyCyclesSorted);
1656 dbgs() << "\n\t"
1657 << "Metric: "
1658 << (SumBubbles
1659 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1660 : 1)
1661 << "\n\n");
1662#endif
1663
1664 return ScheduleMetrics(CurrCycle, SumBubbles);
1665}
1666
1667bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1668 if (WavesAfter < DAG.MinOccupancy)
1669 return true;
1670
1671 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1672 if (DAG.MFI.isDynamicVGPREnabled()) {
1673 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1674 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1675 PressureBefore.getVGPRNum(false));
1676 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1677 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1678 PressureAfter.getVGPRNum(false));
1679 if (BlocksAfter > BlocksBefore)
1680 return true;
1681 }
1682
1683 return false;
1684}
1685
1688 return false;
1689
1691 return true;
1692
1693 if (mayCauseSpilling(WavesAfter))
1694 return true;
1695
1696 return false;
1697}
1698
1700 // If RP is not reduced in the unclustered reschedule stage, revert to the
1701 // old schedule.
1702 if ((WavesAfter <=
1703 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
1704 mayCauseSpilling(WavesAfter)) ||
1706 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1707 return true;
1708 }
1709
1710 // Do not attempt to relax the schedule even more if we are already spilling.
1711 if (isRegionWithExcessRP())
1712 return false;
1713
1714 LLVM_DEBUG(
1715 dbgs()
1716 << "\n\t *** In shouldRevertScheduling ***\n"
1717 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1718 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1719 LLVM_DEBUG(
1720 dbgs()
1721 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1722 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1723 unsigned OldMetric = MBefore.getMetric();
1724 unsigned NewMetric = MAfter.getMetric();
1725 unsigned WavesBefore = std::min(
1726 S.getTargetOccupancy(),
1727 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
1728 unsigned Profit =
1729 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1730 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1731 NewMetric) /
1732 ScheduleMetrics::ScaleFactor;
1733 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1734 << MAfter << "Profit: " << Profit << "\n");
1735 return Profit < ScheduleMetrics::ScaleFactor;
1736}
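// Worked example of the profit check above (invented numbers, default
// amdgpu-schedule-metric-bias of 10): with WavesBefore = 4, WavesAfter = 5,
// OldMetric = 50 and NewMetric = 80:
//   Profit = (((5 * 100) / 4) * ((50 + 10) * 100) / 80) / 100
//          = (125 * 6000 / 80) / 100 = 93
// 93 < ScaleFactor (100), so the extra wave does not pay for the latency
// regression and the unclustered schedule is reverted.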
1737
1738 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1739 if (PressureAfter == PressureBefore)
1740 return false;
1741
1742 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1743 return true;
1744
1745 if (mayCauseSpilling(WavesAfter))
1746 return true;
1747
1748 return false;
1749}
1750
1751 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1752 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1753 mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1754}
1755
1756 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1757 if (mayCauseSpilling(WavesAfter))
1758 return true;
1759
1760 return false;
1761}
1762
1763 bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
1764 unsigned WavesAfter) {
1765 return mayCauseSpilling(WavesAfter);
1766}
1767
1768bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1769 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1770 !PressureAfter.less(MF, PressureBefore)) {
1771 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1772 return true;
1773 }
1774
1775 return false;
1776}
1777
1778 void GCNSchedStage::revertScheduling() {
1779 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1780 DAG.RegionEnd = DAG.RegionBegin;
1781 int SkippedDebugInstr = 0;
1782 for (MachineInstr *MI : Unsched) {
1783 if (MI->isDebugInstr()) {
1784 ++SkippedDebugInstr;
1785 continue;
1786 }
1787
1788 if (MI->getIterator() != DAG.RegionEnd) {
1789 DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
1790 if (!MI->isDebugInstr())
1791 DAG.LIS->handleMove(*MI, true);
1792 }
1793
1794 // Reset read-undef flags and update them later.
1795 for (auto &Op : MI->all_defs())
1796 Op.setIsUndef(false);
1797 RegisterOperands RegOpers;
1798 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1799 if (!MI->isDebugInstr()) {
1800 if (DAG.ShouldTrackLaneMasks) {
1801 // Adjust liveness and add missing dead+read-undef flags.
1802 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1803 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1804 } else {
1805 // Adjust for missing dead-def flags.
1806 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1807 }
1808 }
1809 DAG.RegionEnd = MI->getIterator();
1810 ++DAG.RegionEnd;
1811 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1812 }
1813
1814 // After reverting schedule, debug instrs will now be at the end of the block
1815 // and RegionEnd will point to the first debug instr. Increment RegionEnd
1816 // past debug instrs to the actual end of the scheduling region.
1817 while (SkippedDebugInstr-- > 0)
1818 ++DAG.RegionEnd;
1819
1820 // If Unsched.front() instruction is a debug instruction, this will actually
1821 // shrink the region since we moved all debug instructions to the end of the
1822 // block. Find the first instruction that is not a debug instruction.
1823 DAG.RegionBegin = Unsched.front()->getIterator();
1824 if (DAG.RegionBegin->isDebugInstr()) {
1825 for (MachineInstr *MI : Unsched) {
1826 if (MI->isDebugInstr())
1827 continue;
1828 DAG.RegionBegin = MI->getIterator();
1829 break;
1830 }
1831 }
1832
1833 // Then move the debug instructions back into their correct place and set
1834 // RegionBegin and RegionEnd if needed.
1835 DAG.placeDebugValues();
1836
1837 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1838}
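// Note (editorial): the reversion above restores the original order purely by
// splicing each recorded instruction back in front of RegionEnd; LIS->handleMove
// rewrites slot indexes as instructions move, dead/undef flags are re-derived
// from liveness, and placeDebugValues() finally puts DBG_VALUEs back next to the
// instructions they describe before the region bounds are re-recorded.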
1839
1840bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1841 const Function &F = MF.getFunction();
1842
1843 // Maps optimizable regions (i.e., regions at minimum and register-limited
1844 // occupancy, or regions with spilling) to the target RP we would like to
1845 // reach.
1846 DenseMap<unsigned, GCNRPTarget> OptRegions;
1847 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
1848 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
1849 auto ResetTargetRegions = [&]() {
1850 OptRegions.clear();
1851 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1852 const GCNRegPressure &RP = DAG.Pressure[I];
1853 GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
1854 if (!Target.satisfied())
1855 OptRegions.insert({I, Target});
1856 }
1857 };
1858
1859 ResetTargetRegions();
1860 if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
1861 // In addition to register usage being above addressable limits, occupancy
1862 // below the minimum is considered like "spilling" as well.
1863 TargetOcc = std::nullopt;
1864 } else {
1865 // There is no spilling and room to improve occupancy; set up "increased
1866 // occupancy targets" for all regions.
1867 TargetOcc = DAG.MinOccupancy + 1;
1868 unsigned VGPRBlockSize =
1869 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1870 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
1871 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
1872 ResetTargetRegions();
1873 }
1874 REMAT_DEBUG({
1875 dbgs() << "Analyzing ";
1876 MF.getFunction().printAsOperand(dbgs(), false);
1877 dbgs() << ": ";
1878 if (OptRegions.empty()) {
1879 dbgs() << "no objective to achieve, occupancy is maximal at "
1880 << MFI.getMaxWavesPerEU();
1881 } else if (!TargetOcc) {
1882 dbgs() << "reduce spilling (minimum target occupancy is "
1883 << MFI.getMinWavesPerEU() << ')';
1884 } else {
1885 dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
1886 << TargetOcc;
1887 }
1888 dbgs() << '\n';
1889 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1890 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
1891 dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
1892 << '\n';
1893 }
1894 }
1895 });
1896 if (OptRegions.empty())
1897 return false;
1898
1899 // Accounts for a reduction in RP in an optimizable region. Returns whether we
1900 // estimate that we have identified enough rematerialization opportunities to
1901 // achieve our goal, and sets Progress to true when this particular reduction
1902 // in pressure was helpful toward that goal.
1903 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1904 bool &Progress) -> bool {
1905 GCNRPTarget &Target = OptIt->getSecond();
1906 if (!Target.isSaveBeneficial(Reg))
1907 return false;
1908 Progress = true;
1909 Target.saveReg(Reg, Mask, DAG.MRI);
1910 if (Target.satisfied())
1911 OptRegions.erase(OptIt->getFirst());
1912 return OptRegions.empty();
1913 };
1914
1915 // We need up-to-date live-out info. to query live-out register masks in
1916 // regions containing rematerializable instructions.
1917 DAG.RegionLiveOuts.buildLiveRegMap();
1918
1919 // Cache set of registers that are going to be rematerialized.
1920 DenseSet<unsigned> RematRegs;
1921
1922 // Identify rematerializable instructions in the function.
1923 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1924 auto Region = DAG.Regions[I];
1925 for (auto MI = Region.first; MI != Region.second; ++MI) {
1926 // The instruction must be rematerializable.
1927 MachineInstr &DefMI = *MI;
1928 if (!isReMaterializable(DefMI))
1929 continue;
1930
1931 // We only support rematerializing virtual registers with one definition.
1932 Register Reg = DefMI.getOperand(0).getReg();
1933 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
1934 continue;
1935
1936 // We only care to rematerialize the instruction if it has a single
1937 // non-debug user in a different region. The using MI may not belong to a
1938 // region if it is a lone region terminator.
1939 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1940 if (!UseMI)
1941 continue;
1942 auto UseRegion = MIRegion.find(UseMI);
1943 if (UseRegion != MIRegion.end() && UseRegion->second == I)
1944 continue;
1945
1946 // Do not rematerialize an instruction if it uses or is used by an
1947 // instruction that we have designated for rematerialization.
1948 // FIXME: Allow for rematerialization chains: this requires 1. updating
1949 // remat points to account for uses that are rematerialized, and 2. either
1950 // rematerializing the candidates in careful ordering, or deferring the
1951 // MBB RP walk until the entire chain has been rematerialized.
1952 if (Rematerializations.contains(UseMI) ||
1953 llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
1954 return MO.isReg() && RematRegs.contains(MO.getReg());
1955 }))
1956 continue;
1957
1958 // Do not rematerialize an instruction if it uses registers that aren't
1959 // available at its use. This ensures that we are not extending any live
1960 // range while rematerializing.
1961 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1962 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
1963 *DAG.TII))
1964 continue;
1965
1966 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1967 RematInstruction &Remat =
1968 Rematerializations.try_emplace(&DefMI, UseMI).first->second;
1969
1970 bool RematUseful = false;
1971 if (auto It = OptRegions.find(I); It != OptRegions.end()) {
1972 // Optimistically consider that moving the instruction out of its
1973 // defining region will reduce RP in the latter; this assumes that
1974 // maximum RP in the region is reached somewhere between the defining
1975 // instruction and the end of the region.
1976 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1977 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1978 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1979 return true;
1980 }
1981
1982 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1983 // We are only collecting regions in which the register is a live-in
1984 // (and may be live-through).
1985 auto It = DAG.LiveIns[LIRegion].find(Reg);
1986 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1987 continue;
1988 Remat.LiveInRegions.insert(LIRegion);
1989
1990 // Account for the reduction in RP due to the rematerialization in an
1991 // optimizable region in which the defined register is a live-in. This
1992 // is exact for live-through regions but optimistic in the using region,
1993 // where RP is actually reduced only if maximum RP is reached somewhere
1994 // between the beginning of the region and the rematerializable
1995 // instruction's use.
1996 if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
1997 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1998 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1999 RematUseful))
2000 return true;
2001 }
2002 }
2003
2004 // If the instruction is not a live-in or live-out in any optimizable
2005 // region then there is no point in rematerializing it.
2006 if (!RematUseful) {
2007 Rematerializations.pop_back();
2008 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
2009 } else {
2010 RematRegs.insert(Reg);
2011 }
2012 }
2013 }
2014
2015 if (TargetOcc) {
2016 // We were trying to increase occupancy but failed, abort the stage.
2017 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
2018 Rematerializations.clear();
2019 return false;
2020 }
2021 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
2022 return !Rematerializations.empty();
2023}
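// Summary of the selection loop above (editorial): only single-def virtual
// registers with a single non-debug user in another region are considered,
// chains of rematerializable instructions are skipped, and every candidate is
// credited against each still-unsatisfied region where its register is live-in
// or live-out. The function succeeds as soon as the estimated savings cover all
// region targets; when the goal was one extra wave of occupancy, a partial
// result is discarded rather than applied.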
2024
2025void PreRARematStage::rematerialize() {
2026 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2027
2028 // Collect regions whose RP changes in an unpredictable way; we will have to
2029 // fully recompute their RP after all rematerializations.
2030 DenseSet<unsigned> RecomputeRP;
2031
2032 // Rematerialize all instructions.
2033 for (auto &[DefMI, Remat] : Rematerializations) {
2034 MachineBasicBlock::iterator InsertPos(Remat.UseMI);
2035 Register Reg = DefMI->getOperand(0).getReg();
2036 unsigned DefRegion = MIRegion.at(DefMI);
2037
2038 // Rematerialize DefMI to its use block.
2039 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
2040 AMDGPU::NoSubRegister, *DefMI);
2041 Remat.RematMI = &*std::prev(InsertPos);
2042 DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
2043
2044 // Update region boundaries in regions we sunk from (remove defining MI)
2045 // and to (insert MI rematerialized in use block). Only then can we erase
2046 // the original MI.
2047 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
2048 auto UseRegion = MIRegion.find(Remat.UseMI);
2049 if (UseRegion != MIRegion.end()) {
2050 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
2051 Remat.RematMI);
2052 }
2053 DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
2054 DefMI->eraseFromParent();
2055
2056 // Collect all regions impacted by the rematerialization and update their
2057 // live-in/RP information.
2058 for (unsigned I : Remat.LiveInRegions) {
2059 ImpactedRegions.insert({I, DAG.Pressure[I]});
2060 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
2061
2062#ifdef EXPENSIVE_CHECKS
2063 // All uses are known to be available / live at the remat point. Thus, the
2064 // uses should already be live in to the region.
2065 for (MachineOperand &MO : DefMI->operands()) {
2066 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2067 continue;
2068
2069 Register UseReg = MO.getReg();
2070 if (!UseReg.isVirtual())
2071 continue;
2072
2073 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
2074 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
2075 if (LI.hasSubRanges() && MO.getSubReg())
2076 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
2077
2078 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
2079 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
2080 // If this register has lanes not covered by the LiveIns, be sure they
2081 // do not map to any subrange. ref:
2082 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
2083 if (UncoveredLanes.any()) {
2084 assert(LI.hasSubRanges());
2085 for (LiveInterval::SubRange &SR : LI.subranges())
2086 assert((SR.LaneMask & UncoveredLanes).none());
2087 }
2088 }
2089#endif
2090
2091 // The register is no longer a live-in in all regions but the one that
2092 // contains the single use. In live-through regions, maximum register
2093 // pressure decreases predictably so we can directly update it. In the
2094 // using region, maximum RP may or may not decrease, so we will mark it
2095 // for re-computation after all materializations have taken place.
2096 LaneBitmask PrevMask = RegionLiveIns[Reg];
2097 RegionLiveIns.erase(Reg);
2098 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
2099 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
2100 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
2101 else
2102 RecomputeRP.insert(I);
2103 }
2104 // RP in the region from which the instruction was rematerialized may or may
2105 // not decrease.
2106 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
2107 RecomputeRP.insert(DefRegion);
2108
2109 // Recompute live interval to reflect the register's rematerialization.
2110 Register RematReg = Remat.RematMI->getOperand(0).getReg();
2111 DAG.LIS->removeInterval(RematReg);
2112 DAG.LIS->createAndComputeVirtRegInterval(RematReg);
2113 }
2114
2115 // All regions impacted by at least one rematerialization must be rescheduled.
2116 // Maximum pressure must also be recomputed for all regions where it changed
2117 // unpredictably and checked against the target occupancy.
2118 unsigned DynamicVGPRBlockSize =
2119 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
2120 AchievedOcc = MFI.getMaxWavesPerEU();
2121 for (auto &[I, OriginalRP] : ImpactedRegions) {
2122 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
2123 RescheduleRegions[I] = !IsEmptyRegion;
2124 if (!RecomputeRP.contains(I))
2125 continue;
2126
2127 GCNRegPressure RP;
2128 if (IsEmptyRegion) {
2129 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2130 } else {
2131 GCNDownwardRPTracker RPT(*DAG.LIS);
2132 auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
2133 DAG.Regions[I].second);
2134 if (NonDbgMI == DAG.Regions[I].second) {
2135 // Region is non-empty but contains only debug instructions.
2136 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2137 } else {
2138 RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
2139 RPT.advance(DAG.Regions[I].second);
2140 RP = RPT.moveMaxPressure();
2141 }
2142 }
2143 DAG.Pressure[I] = RP;
2144 AchievedOcc =
2145 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
2146 }
2147 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2148}
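// Note on the recomputation above (editorial): live-through regions merely lose
// the rematerialized register from their live-in set, so their cached pressure
// is adjusted in place (the inc call with an empty new mask is a decrement).
// The defining and using regions are re-walked with a GCNDownwardRPTracker
// because their point of maximum pressure may have moved.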
2149
2150// Copied from MachineLICM
2151bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
2152 if (!DAG.TII->isReMaterializable(MI))
2153 return false;
2154
2155 for (const MachineOperand &MO : MI.all_uses()) {
2156 // We can't remat physreg uses, unless it is a constant or an ignorable
2157 // use (e.g. implicit exec use on VALU instructions)
2158 if (MO.getReg().isPhysical()) {
2159 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2160 continue;
2161 return false;
2162 }
2163 }
2164
2165 return true;
2166}
2167
2168 void PreRARematStage::finalizeGCNSchedStage() {
2169 // We consider that reducing spilling is always beneficial, so we never roll
2170 // back rematerializations in such cases. It's also possible that rescheduling
2171 // lowers occupancy below what was achieved through the remats alone, in which
2172 // case we do not want to roll back either (the rescheduling was already
2173 // reverted in PreRARematStage::shouldRevertScheduling in such cases).
2174 unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2175 if (!TargetOcc || MaxOcc >= *TargetOcc)
2176 return;
2177
2178 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2179 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2180
2181 // Rollback the rematerializations.
2182 for (const auto &[DefMI, Remat] : Rematerializations) {
2183 MachineInstr &RematMI = *Remat.RematMI;
2184 unsigned DefRegion = MIRegion.at(DefMI);
2185 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2186 MachineBasicBlock *MBB = RegionBB[DefRegion];
2187 Register Reg = RematMI.getOperand(0).getReg();
2188
2189 // Re-rematerialize MI at the end of its original region. Note that it may
2190 // not be rematerialized exactly in the same position as originally within
2191 // the region, but it should not matter much.
2192 TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI);
2193 MachineInstr *NewMI = &*std::prev(InsertPos);
2194 DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2195
2196 auto UseRegion = MIRegion.find(Remat.UseMI);
2197 if (UseRegion != MIRegion.end()) {
2198 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2199 nullptr);
2200 }
2201 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2202
2203 // Erase rematerialized MI.
2204 DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2205 RematMI.eraseFromParent();
2206
2207 // Recompute live interval for the re-rematerialized register
2208 DAG.LIS->removeInterval(Reg);
2209 DAG.LIS->createAndComputeVirtRegInterval(Reg);
2210
2211 // Re-add the register as a live-in in all regions it used to be one in.
2212 for (unsigned LIRegion : Remat.LiveInRegions)
2213 DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2214 }
2215
2216 // Reset RP in all impacted regions.
2217 for (auto &[I, OriginalRP] : ImpactedRegions)
2218 DAG.Pressure[I] = OriginalRP;
2219
2220 GCNSchedStage::finalizeGCNSchedStage();
2221}
2222
2223void GCNScheduleDAGMILive::updateRegionBoundaries(
2224 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2225 MachineInstr *NewMI) {
2226 assert((!NewMI || NewMI != RegionBounds.second) &&
2227 "cannot remove at region end");
2228
2229 if (RegionBounds.first == RegionBounds.second) {
2230 assert(NewMI && "cannot remove from an empty region");
2231 RegionBounds.first = NewMI;
2232 return;
2233 }
2234
2235 // We only care for modifications at the beginning of a non-empty region since
2236 // the upper region boundary is exclusive.
2237 if (MI != RegionBounds.first)
2238 return;
2239 if (!NewMI)
2240 RegionBounds.first = std::next(MI); // Removal
2241 else
2242 RegionBounds.first = NewMI; // Insertion
2243}
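// Example of the boundary update above (hypothetical region [A, B) where A is
// its first instruction): removing A moves RegionBegin to the next instruction,
// while inserting a rematerialized NewMI right before A makes NewMI the new
// RegionBegin. Changes deeper inside the region need no bookkeeping because the
// exclusive upper bound is unaffected.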
2244
2245 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2246 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2247 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2248 return SII->isIGLPMutationOnly(MI->getOpcode());
2249 });
2250}
2251
2252 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2253 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2254 bool RemoveKillFlags)
2255 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2256
2257 void GCNPostScheduleDAGMILive::schedule() {
2258 HasIGLPInstrs = hasIGLPInstrs(this);
2259 if (HasIGLPInstrs) {
2260 SavedMutations.clear();
2261 SavedMutations.swap(Mutations);
2262 addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2263 }
2264
2265 ScheduleDAGMI::schedule();
2266}
2267
2268 void GCNPostScheduleDAGMILive::finalizeSchedule() {
2269 if (HasIGLPInstrs)
2270 SavedMutations.swap(Mutations);
2271
2272 ScheduleDAGMI::finalizeSchedule();
2273}