1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
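//
// For orientation (a summary of the stage implementations below, see
// GCNScheduleDAGMILive::createSchedStage): the stages implemented in this
// file are OccInitialScheduleStage, RewriteMFMAFormStage,
// UnclusteredHighRPStage, ClusteredLowOccStage, PreRARematStage,
// ILPInitialScheduleStage and MemoryClauseInitialScheduleStage. Which of them
// actually run depends on the chosen scheduling strategy and on each stage's
// initGCNSchedStage() check.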
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/STLExtras.h"
35#include "llvm/MC/LaneBitmask.h"
37
38#define DEBUG_TYPE "machine-scheduler"
39
40using namespace llvm;
41
43 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
44 cl::desc("Disable unclustered high register pressure "
45 "reduction scheduling stage."),
46 cl::init(false));
47
49 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
50 cl::desc("Disable clustered low occupancy "
51 "rescheduling for ILP scheduling stage."),
52 cl::init(false));
53
55 "amdgpu-schedule-metric-bias", cl::Hidden,
57 "Sets the bias which adds weight to occupancy vs latency. Set it to "
58 "100 to chase the occupancy only."),
59 cl::init(10));
60
61static cl::opt<bool>
62 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
63 cl::desc("Relax occupancy targets for kernels which are memory "
64 "bound (amdgpu-membound-threshold), or "
65 "Wave Limited (amdgpu-limit-wave-threshold)."),
66 cl::init(false));
67
69 "amdgpu-use-amdgpu-trackers", cl::Hidden,
70 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
71 cl::init(false));
72
74 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
76 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
77 cl::init(256));
78
79#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
80#define DUMP_MAX_REG_PRESSURE
82 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
83 cl::desc("Print a list of live registers along with their def/uses at the "
84 "point of maximum register pressure before scheduling."),
85 cl::init(false));
86
88 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
89 cl::desc("Print a list of live registers along with their def/uses at the "
90 "point of maximum register pressure after scheduling."),
91 cl::init(false));
92#endif
93
95 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
96     cl::desc("Disable the MFMA form rewrite scheduling stage."), cl::init(true));
97
98const unsigned ScheduleMetrics::ScaleFactor = 100;
99
104
107
108 MF = &DAG->MF;
109
110 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
111
113 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
115 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
116
118  // Set the initial TargetOccupancy to the maximum occupancy that we can
119  // achieve for this function. This effectively sets a lower bound on the
120  // 'Critical' register limits in the scheduler.
121  // Allow for lower occupancy targets if the kernel is wave limited or memory
122  // bound and the relaxed occupancy feature is enabled.
126 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
127
128 if (!KnownExcessRP) {
129 VGPRCriticalLimit = std::min(
130 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
132 } else {
133    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
134    // it returns a reasonably small number for targets with lots of VGPRs, such
135 // as GFX10 and GFX11.
136 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
137 "VGPRCriticalLimit calculation method.\n");
138 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
139 unsigned Granule =
140 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
141 unsigned Addressable =
142 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
143 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
144 VGPRBudget = std::max(VGPRBudget, Granule);
145 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
146 }
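  // Worked example with illustrative numbers (not taken from any particular
  // subtarget): with Addressable = 256, Granule = 8 and TargetOccupancy = 5,
  // VGPRBudget = alignDown(256 / 5, 8) = alignDown(51, 8) = 48, which is then
  // clamped below by Granule and above by VGPRExcessLimit.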
147
148 // Subtract error margin and bias from register limits and avoid overflow.
153
154 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
155 << ", VGPRExcessLimit = " << VGPRExcessLimit
156 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
157 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
158}
159
160/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
161/// current register pressure.
162///
163/// This works for the common case, but it has a few exceptions that have been
164/// observed through trial and error:
165/// - Explicit physical register operands
166/// - Subregister definitions
167///
168/// In both of those cases, PressureDiff doesn't represent the actual pressure,
169/// and querying LiveIntervals through the RegPressureTracker is needed to get
170/// an accurate value.
171///
172/// We should eventually only use PressureDiff for maximum performance, but this
173/// already allows 80% of SUs to take the fast path without changing scheduling
174/// at all. Further changes would either change scheduling, or require a lot
175/// more logic to recover an accurate pressure estimate from the PressureDiffs.
176static bool canUsePressureDiffs(const SUnit &SU) {
177 if (!SU.isInstr())
178 return false;
179
180  // Cannot use pressure diffs for subregister defs or with physregs; it's
181  // imprecise in both cases.
182 for (const auto &Op : SU.getInstr()->operands()) {
183 if (!Op.isReg() || Op.isImplicit())
184 continue;
185 if (Op.getReg().isPhysical() ||
186 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
187 return false;
188 }
189 return true;
190}
191
193 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
194 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
195 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
196 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
197 // getDownwardPressure() and getUpwardPressure() make temporary changes to
198  // the tracker, so we need to pass those functions a non-const copy.
199 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
200 if (!GCNTrackers) {
201 AtTop
202 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
203 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
204
205 return;
206 }
207
208 // GCNTrackers
209 Pressure.resize(4, 0);
210 MachineInstr *MI = SU->getInstr();
211 GCNRegPressure NewPressure;
212 if (AtTop) {
213 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
214 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
215 } else {
216 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
217 TempUpwardTracker.recede(*MI);
218 NewPressure = TempUpwardTracker.getPressure();
219 }
220 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
221 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
222 NewPressure.getArchVGPRNum();
223 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
224}
225
227 bool AtTop,
228 const RegPressureTracker &RPTracker,
229 const SIRegisterInfo *SRI,
230 unsigned SGPRPressure,
231 unsigned VGPRPressure, bool IsBottomUp) {
232 Cand.SU = SU;
233 Cand.AtTop = AtTop;
234
235 if (!DAG->isTrackingPressure())
236 return;
237
238 Pressure.clear();
239 MaxPressure.clear();
240
241 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
242 // possible over querying the RegPressureTracker.
243 //
244 // RegPressureTracker will make a lot of LIS queries which are very
245 // expensive, it is considered a slow function in this context.
246 //
247 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
248 // trivial lookup into an array. It is pretty much free.
249 //
250 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
251 // PressureDiffs.
252 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
253 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
255 } else {
256 // Reserve 4 slots.
257 Pressure.resize(4, 0);
258 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
259 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
260
261 for (const auto &Diff : DAG->getPressureDiff(SU)) {
262 if (!Diff.isValid())
263 continue;
264 // PressureDiffs is always bottom-up so if we're working top-down we need
265 // to invert its sign.
266 Pressure[Diff.getPSet()] +=
267 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
268 }
269
270#ifdef EXPENSIVE_CHECKS
271 std::vector<unsigned> CheckPressure, CheckMaxPressure;
272 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
274 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
275 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
276 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
277 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
278 errs() << "Register Pressure is inaccurate when calculated through "
279 "PressureDiff\n"
280 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
281 << ", expected "
282 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
283 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
284 << ", expected "
285 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
286 report_fatal_error("inaccurate register pressure calculation");
287 }
288#endif
289 }
290
291 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
292 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
293
294 // If two instructions increase the pressure of different register sets
295 // by the same amount, the generic scheduler will prefer to schedule the
296 // instruction that increases the set with the least amount of registers,
297 // which in our case would be SGPRs. This is rarely what we want, so
298 // when we report excess/critical register pressure, we do it either
299 // only for VGPRs or only for SGPRs.
300
301 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
302 const unsigned MaxVGPRPressureInc = 16;
303 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
304 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
305
306 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
307 // to increase the likelihood we don't go over the limits. We should improve
308 // the analysis to look through dependencies to find the path with the least
309 // register pressure.
310
311 // We only need to update the RPDelta for instructions that increase register
312 // pressure. Instructions that decrease or keep reg pressure the same will be
313 // marked as RegExcess in tryCandidate() when they are compared with
314 // instructions that increase the register pressure.
315 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
316 HasHighPressure = true;
317 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
318 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
319 }
320
321 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
322 HasHighPressure = true;
323 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
324 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
325 }
326
327 // Register pressure is considered 'CRITICAL' if it is approaching a value
328 // that would reduce the wave occupancy for the execution unit. When
329 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
330 // has the same cost, so we don't need to prefer one over the other.
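  // For instance (hypothetical numbers): with SGPRCriticalLimit = 96 and
  // VGPRCriticalLimit = 120, a candidate raising pressure to 90 SGPRs and
  // 124 VGPRs gives SGPRDelta = -6 and VGPRDelta = 4, so CriticalMax records
  // the VGPR_32 pressure set with a unit increment of 4.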
331
332 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
333 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
334
335 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
336 HasHighPressure = true;
337 if (SGPRDelta > VGPRDelta) {
338 Cand.RPDelta.CriticalMax =
339 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
340 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
341 } else {
342 Cand.RPDelta.CriticalMax =
343 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
344 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
345 }
346 }
347}
348
350 const TargetSchedModel *SchedModel) {
351 bool HasBufferedModel =
352 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
353 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
354 return Combined <= PendingQueueLimit && HasBufferedModel;
355}
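// shouldCheckPending() example (with the default
// -amdgpu-scheduler-pending-queue-limit of 256): a zone holding 200 available
// and 80 pending SUs (280 combined) skips the pending-queue scan, while one
// holding 150 and 100 (250 combined) inspects it, provided the subtarget's
// scheduling model has a micro-op buffer.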
356
358 const TargetSchedModel *SchedModel) {
359 // pickOnlyChoice() releases pending instructions and checks for new hazards.
360 SUnit *OnlyChoice = Zone.pickOnlyChoice();
361 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
362 return OnlyChoice;
363
364 return nullptr;
365}
366
368 const SchedCandidate &Preferred) {
369 LLVM_DEBUG({
370 dbgs() << "Prefer:\t\t";
371 DAG->dumpNode(*Preferred.SU);
372
373 if (Current.SU) {
374 dbgs() << "Not:\t";
375 DAG->dumpNode(*Current.SU);
376 }
377
378 dbgs() << "Reason:\t\t";
379 traceCandidate(Preferred);
380 });
381}
382
383// This function is mostly cut and pasted from
384// GenericScheduler::pickNodeFromQueue()
386 const CandPolicy &ZonePolicy,
387 const RegPressureTracker &RPTracker,
388 SchedCandidate &Cand, bool &IsPending,
389 bool IsBottomUp) {
390 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
392 unsigned SGPRPressure = 0;
393 unsigned VGPRPressure = 0;
394 IsPending = false;
395 if (DAG->isTrackingPressure()) {
396 if (!GCNTrackers) {
397 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
398 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
399 } else {
400 GCNRPTracker *T = IsBottomUp
401 ? static_cast<GCNRPTracker *>(&UpwardTracker)
402 : static_cast<GCNRPTracker *>(&DownwardTracker);
403 SGPRPressure = T->getPressure().getSGPRNum();
404 VGPRPressure = T->getPressure().getArchVGPRNum();
405 }
406 }
407 LLVM_DEBUG(dbgs() << "Available Q:\n");
408 ReadyQueue &AQ = Zone.Available;
409 for (SUnit *SU : AQ) {
410
411 SchedCandidate TryCand(ZonePolicy);
412 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
413 VGPRPressure, IsBottomUp);
414 // Pass SchedBoundary only when comparing nodes from the same boundary.
415 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
416 tryCandidate(Cand, TryCand, ZoneArg);
417 if (TryCand.Reason != NoCand) {
418 // Initialize resource delta if needed in case future heuristics query it.
419 if (TryCand.ResDelta == SchedResourceDelta())
420 TryCand.initResourceDelta(Zone.DAG, SchedModel);
421 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
422 Cand.setBest(TryCand);
423 } else {
424 printCandidateDecision(TryCand, Cand);
425 }
426 }
427
428 if (!shouldCheckPending(Zone, SchedModel))
429 return;
430
431 LLVM_DEBUG(dbgs() << "Pending Q:\n");
432 ReadyQueue &PQ = Zone.Pending;
433 for (SUnit *SU : PQ) {
434
435 SchedCandidate TryCand(ZonePolicy);
436 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
437 VGPRPressure, IsBottomUp);
438 // Pass SchedBoundary only when comparing nodes from the same boundary.
439 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
440 tryPendingCandidate(Cand, TryCand, ZoneArg);
441 if (TryCand.Reason != NoCand) {
442 // Initialize resource delta if needed in case future heuristics query it.
443 if (TryCand.ResDelta == SchedResourceDelta())
444 TryCand.initResourceDelta(Zone.DAG, SchedModel);
445 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
446 IsPending = true;
447 Cand.setBest(TryCand);
448 } else {
449 printCandidateDecision(TryCand, Cand);
450 }
451 }
452}
453
454// This function is mostly cut and pasted from
455// GenericScheduler::pickNodeBidirectional()
457 bool &PickedPending) {
458 // Schedule as far as possible in the direction of no choice. This is most
459 // efficient, but also provides the best heuristics for CriticalPSets.
460 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
461 IsTopNode = false;
462 return SU;
463 }
464 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
465 IsTopNode = true;
466 return SU;
467 }
468 // Set the bottom-up policy based on the state of the current bottom zone
469 // and the instructions outside the zone, including the top zone.
470 CandPolicy BotPolicy;
471 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
472 // Set the top-down policy based on the state of the current top zone and
473 // the instructions outside the zone, including the bottom zone.
474 CandPolicy TopPolicy;
475 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
476
477 bool BotPending = false;
478 // See if BotCand is still valid (because we previously scheduled from Top).
479 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
480 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
481 BotCand.Policy != BotPolicy) {
482 BotCand.reset(CandPolicy());
483 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
484 BotPending,
485 /*IsBottomUp=*/true);
486 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
487 } else {
489#ifndef NDEBUG
490 if (VerifyScheduling) {
491 SchedCandidate TCand;
492 TCand.reset(CandPolicy());
493 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
494 BotPending,
495 /*IsBottomUp=*/true);
496 assert(TCand.SU == BotCand.SU &&
497 "Last pick result should correspond to re-picking right now");
498 }
499#endif
500 }
501
502 bool TopPending = false;
503 // Check if the top Q has a better candidate.
504 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
505 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
506 TopCand.Policy != TopPolicy) {
507 TopCand.reset(CandPolicy());
508 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
509 TopPending,
510 /*IsBottomUp=*/false);
511 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
512 } else {
514#ifndef NDEBUG
515 if (VerifyScheduling) {
516 SchedCandidate TCand;
517 TCand.reset(CandPolicy());
518 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
519 TopPending,
520 /*IsBottomUp=*/false);
521 assert(TCand.SU == TopCand.SU &&
522 "Last pick result should correspond to re-picking right now");
523 }
524#endif
525 }
526
527 // Pick best from BotCand and TopCand.
528 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
529 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
530 SchedCandidate Cand = BotPending ? TopCand : BotCand;
531 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
532 PickedPending = BotPending && TopPending;
533
534 TryCand.Reason = NoCand;
535 if (BotPending || TopPending) {
536 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
537 } else {
538 tryCandidate(Cand, TryCand, nullptr);
539 }
540
541 if (TryCand.Reason != NoCand) {
542 Cand.setBest(TryCand);
543 }
544
545 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
546
547 IsTopNode = Cand.AtTop;
548 return Cand.SU;
549}
550
551// This function is mostly cut and pasted from
552// GenericScheduler::pickNode()
554 if (DAG->top() == DAG->bottom()) {
555 assert(Top.Available.empty() && Top.Pending.empty() &&
556 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
557 return nullptr;
558 }
559 bool PickedPending;
560 SUnit *SU;
561 do {
562 PickedPending = false;
563 if (RegionPolicy.OnlyTopDown) {
565 if (!SU) {
566 CandPolicy NoPolicy;
567 TopCand.reset(NoPolicy);
568 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
569 PickedPending,
570 /*IsBottomUp=*/false);
571 assert(TopCand.Reason != NoCand && "failed to find a candidate");
572 SU = TopCand.SU;
573 }
574 IsTopNode = true;
575 } else if (RegionPolicy.OnlyBottomUp) {
577 if (!SU) {
578 CandPolicy NoPolicy;
579 BotCand.reset(NoPolicy);
580 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
581 PickedPending,
582 /*IsBottomUp=*/true);
583 assert(BotCand.Reason != NoCand && "failed to find a candidate");
584 SU = BotCand.SU;
585 }
586 IsTopNode = false;
587 } else {
588 SU = pickNodeBidirectional(IsTopNode, PickedPending);
589 }
590 } while (SU->isScheduled);
591
592 if (PickedPending) {
593 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
594 SchedBoundary &Zone = IsTopNode ? Top : Bot;
595 unsigned CurrentCycle = Zone.getCurrCycle();
596 if (ReadyCycle > CurrentCycle)
597 Zone.bumpCycle(ReadyCycle);
598
599 // FIXME: checkHazard() doesn't give information about which cycle the
600 // hazard will resolve so just keep bumping the cycle by 1. This could be
601 // made more efficient if checkHazard() returned more details.
602 while (Zone.checkHazard(SU))
603 Zone.bumpCycle(Zone.getCurrCycle() + 1);
604
605 Zone.releasePending();
606 }
607
608 if (SU->isTopReady())
609 Top.removeReady(SU);
610 if (SU->isBottomReady())
611 Bot.removeReady(SU);
612
613 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
614 << *SU->getInstr());
615 return SU;
616}
617
618void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
619 if (GCNTrackers) {
620 MachineInstr *MI = SU->getInstr();
621 IsTopNode ? (void)DownwardTracker.advance(MI, false)
622 : UpwardTracker.recede(*MI);
623 }
624
625 return GenericScheduler::schedNode(SU, IsTopNode);
626}
627
632
635 if (!CurrentStage)
636 CurrentStage = SchedStages.begin();
637 else
638 CurrentStage++;
639
640 return CurrentStage != SchedStages.end();
641}
642
645 return std::next(CurrentStage) != SchedStages.end();
646}
647
649 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
650 return *std::next(CurrentStage);
651}
652
654 SchedCandidate &TryCand,
655 SchedBoundary *Zone) const {
656 // Initialize the candidate if needed.
657 if (!Cand.isValid()) {
658 TryCand.Reason = NodeOrder;
659 return true;
660 }
661
662  // Bias PhysReg Defs and copies to their uses and defs, respectively.
663 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
664 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
665 return TryCand.Reason != NoCand;
666
667 // Avoid exceeding the target's limit.
668 if (DAG->isTrackingPressure() &&
669 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
670 RegExcess, TRI, DAG->MF))
671 return TryCand.Reason != NoCand;
672
673 // Avoid increasing the max critical pressure in the scheduled region.
674 if (DAG->isTrackingPressure() &&
676 TryCand, Cand, RegCritical, TRI, DAG->MF))
677 return TryCand.Reason != NoCand;
678
679 bool SameBoundary = Zone != nullptr;
680 if (SameBoundary) {
683 TryCand, Cand, ResourceReduce))
684 return TryCand.Reason != NoCand;
686 Cand.ResDelta.DemandedResources, TryCand, Cand,
688 return TryCand.Reason != NoCand;
689 }
690
691 return false;
692}
693
705
710
712 SchedCandidate &TryCand,
713 SchedBoundary *Zone) const {
714 // Initialize the candidate if needed.
715 if (!Cand.isValid()) {
716 TryCand.Reason = NodeOrder;
717 return true;
718 }
719
720 // Avoid spilling by exceeding the register limit.
721 if (DAG->isTrackingPressure() &&
722 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
723 RegExcess, TRI, DAG->MF))
724 return TryCand.Reason != NoCand;
725
726  // Bias PhysReg Defs and copies to their uses and defs, respectively.
727 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
728 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
729 return TryCand.Reason != NoCand;
730
731 bool SameBoundary = Zone != nullptr;
732 if (SameBoundary) {
733 // Prioritize instructions that read unbuffered resources by stall cycles.
734 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
735 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
736 return TryCand.Reason != NoCand;
737
738 // Avoid critical resource consumption and balance the schedule.
741 TryCand, Cand, ResourceReduce))
742 return TryCand.Reason != NoCand;
744 Cand.ResDelta.DemandedResources, TryCand, Cand,
746 return TryCand.Reason != NoCand;
747
748 // Unconditionally try to reduce latency.
749 if (tryLatency(TryCand, Cand, *Zone))
750 return TryCand.Reason != NoCand;
751
752 // Weak edges are for clustering and other constraints.
753 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
754 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
755 return TryCand.Reason != NoCand;
756 }
757
758 // Keep clustered nodes together to encourage downstream peephole
759 // optimizations which may reduce resource requirements.
760 //
761 // This is a best effort to set things up for a post-RA pass. Optimizations
762 // like generating loads of multiple registers should ideally be done within
763 // the scheduler pass by combining the loads during DAG postprocessing.
764 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
765 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
766 bool CandIsClusterSucc =
767 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
768 bool TryCandIsClusterSucc =
769 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
770 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
771 Cluster))
772 return TryCand.Reason != NoCand;
773
774 // Avoid increasing the max critical pressure in the scheduled region.
775 if (DAG->isTrackingPressure() &&
777 TryCand, Cand, RegCritical, TRI, DAG->MF))
778 return TryCand.Reason != NoCand;
779
780 // Avoid increasing the max pressure of the entire region.
781 if (DAG->isTrackingPressure() &&
782 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
783 Cand, RegMax, TRI, DAG->MF))
784 return TryCand.Reason != NoCand;
785
786 if (SameBoundary) {
787 // Fall through to original instruction order.
788 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
789 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
790 TryCand.Reason = NodeOrder;
791 return true;
792 }
793 }
794 return false;
795}
796
802
803/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
804/// much as possible. This is achieved by:
805// 1. Prioritize clustered operations before stall latency heuristic.
806// 2. Prioritize long-latency-load before stall latency heuristic.
807///
808/// \param Cand provides the policy and current best candidate.
809/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
810/// \param Zone describes the scheduled zone that we are extending, or nullptr
811/// if Cand is from a different zone than TryCand.
812/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
814 SchedCandidate &TryCand,
815 SchedBoundary *Zone) const {
816 // Initialize the candidate if needed.
817 if (!Cand.isValid()) {
818 TryCand.Reason = NodeOrder;
819 return true;
820 }
821
822  // Bias PhysReg Defs and copies to their uses and defs, respectively.
823 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
824 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
825 return TryCand.Reason != NoCand;
826
827 if (DAG->isTrackingPressure()) {
828 // Avoid exceeding the target's limit.
829 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
830 RegExcess, TRI, DAG->MF))
831 return TryCand.Reason != NoCand;
832
833 // Avoid increasing the max critical pressure in the scheduled region.
835 TryCand, Cand, RegCritical, TRI, DAG->MF))
836 return TryCand.Reason != NoCand;
837 }
838
839 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
840 // get more benefit from clausing these memory instructions.
841 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
842 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
843 bool CandIsClusterSucc =
844 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
845 bool TryCandIsClusterSucc =
846 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
847 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
848 Cluster))
849 return TryCand.Reason != NoCand;
850
851 // We only compare a subset of features when comparing nodes between
852  // Top and Bottom boundary. Some properties are simply incomparable; in many
853 // other instances we should only override the other boundary if something
854 // is a clear good pick on one boundary. Skip heuristics that are more
855 // "tie-breaking" in nature.
856 bool SameBoundary = Zone != nullptr;
857 if (SameBoundary) {
858 // For loops that are acyclic path limited, aggressively schedule for
859    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
860 // heuristics to take precedence.
861 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
862 tryLatency(TryCand, Cand, *Zone))
863 return TryCand.Reason != NoCand;
864
865 // MaxMemoryClause-specific: Prioritize long latency memory load
866 // instructions in top-bottom order to hide more latency. The mayLoad check
867 // is used to exclude store-like instructions, which we do not want to
868    // schedule too early.
869 bool TryMayLoad =
870 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
871 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
872
873 if (TryMayLoad || CandMayLoad) {
874 bool TryLongLatency =
875 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
876 bool CandLongLatency =
877 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
878
879 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
880 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
881 Cand, Stall))
882 return TryCand.Reason != NoCand;
883 }
884 // Prioritize instructions that read unbuffered resources by stall cycles.
885 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
886 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
887 return TryCand.Reason != NoCand;
888 }
889
890 if (SameBoundary) {
891 // Weak edges are for clustering and other constraints.
892 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
893 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
894 return TryCand.Reason != NoCand;
895 }
896
897 // Avoid increasing the max pressure of the entire region.
898 if (DAG->isTrackingPressure() &&
899 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
900 Cand, RegMax, TRI, DAG->MF))
901 return TryCand.Reason != NoCand;
902
903 if (SameBoundary) {
904 // Avoid critical resource consumption and balance the schedule.
907 TryCand, Cand, ResourceReduce))
908 return TryCand.Reason != NoCand;
910 Cand.ResDelta.DemandedResources, TryCand, Cand,
912 return TryCand.Reason != NoCand;
913
914 // Avoid serializing long latency dependence chains.
915 // For acyclic path limited loops, latency was already checked above.
916 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
917 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
918 return TryCand.Reason != NoCand;
919
920 // Fall through to original instruction order.
921 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
922 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
923 TryCand.Reason = NodeOrder;
924 return true;
925 }
926 }
927
928 return false;
929}
930
932 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
933 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
934 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
935 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
936 RegionLiveOuts(this, /*IsLiveOut=*/true) {
937
938 // We want regions with a single MI to be scheduled so that we can reason
939 // about them correctly during scheduling stages that move MIs between regions
940 // (e.g., rematerialization).
942 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
943 if (RelaxedOcc) {
944 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
945 if (MinOccupancy != StartingOccupancy)
946 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
947 << ".\n");
948 }
949}
950
951std::unique_ptr<GCNSchedStage>
952GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
953 switch (SchedStageID) {
955 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
957 return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
959 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
961 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
963 return std::make_unique<PreRARematStage>(SchedStageID, *this);
965 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
967 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
968 *this);
969 }
970
971 llvm_unreachable("Unknown SchedStageID.");
972}
973
975 // Collect all scheduling regions. The actual scheduling is performed in
976 // GCNScheduleDAGMILive::finalizeSchedule.
977 Regions.push_back(std::pair(RegionBegin, RegionEnd));
978}
979
981GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
983 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
984 &LiveIns[RegionIdx]);
985 return RPTracker.moveMaxPressure();
986}
987
989 MachineBasicBlock::iterator RegionEnd) {
990 assert(RegionBegin != RegionEnd && "Region must not be empty");
991 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
992}
993
994void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
995 const MachineBasicBlock *MBB) {
996 GCNDownwardRPTracker RPTracker(*LIS);
997
998  // If the block has only one successor then the live-ins of that successor
999  // are the live-outs of the current block. We can reuse the calculated live
1000  // set if the successor will be sent to scheduling past the current block.
1001
1002  // However, due to a bug in LiveInterval analysis it may happen that two
1003  // predecessors of the same successor block have different lane bitmasks for
1004  // a live-out register. Work around that by sticking to a one-to-one
1005  // relationship, i.e. one predecessor with one successor block.
1006 const MachineBasicBlock *OnlySucc = nullptr;
1007 if (MBB->succ_size() == 1) {
1008 auto *Candidate = *MBB->succ_begin();
1009 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1010 SlotIndexes *Ind = LIS->getSlotIndexes();
1011 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1012 OnlySucc = Candidate;
1013 }
1014 }
1015
1016 // Scheduler sends regions from the end of the block upwards.
1017 size_t CurRegion = RegionIdx;
1018 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1019 if (Regions[CurRegion].first->getParent() != MBB)
1020 break;
1021 --CurRegion;
1022
1023 auto I = MBB->begin();
1024 auto LiveInIt = MBBLiveIns.find(MBB);
1025 auto &Rgn = Regions[CurRegion];
1026 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1027 if (LiveInIt != MBBLiveIns.end()) {
1028 auto LiveIn = std::move(LiveInIt->second);
1029 RPTracker.reset(*MBB->begin(), &LiveIn);
1030 MBBLiveIns.erase(LiveInIt);
1031 } else {
1032 I = Rgn.first;
1033 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1034#ifdef EXPENSIVE_CHECKS
1035 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1036#endif
1037 RPTracker.reset(*I, &LRS);
1038 }
1039
1040 for (;;) {
1041 I = RPTracker.getNext();
1042
1043 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1044 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1045 RPTracker.clearMaxPressure();
1046 }
1047
1048 if (Regions[CurRegion].second == I) {
1049 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1050 if (CurRegion-- == RegionIdx)
1051 break;
1052 auto &Rgn = Regions[CurRegion];
1053 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1054 }
1055 RPTracker.advanceToNext();
1056 RPTracker.advanceBeforeNext();
1057 }
1058
1059 if (OnlySucc) {
1060 if (I != MBB->end()) {
1061 RPTracker.advanceToNext();
1062 RPTracker.advance(MBB->end());
1063 }
1064 RPTracker.advanceBeforeNext();
1065 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1066 }
1067}
1068
1070GCNScheduleDAGMILive::getRegionLiveInMap() const {
1071 assert(!Regions.empty());
1072 std::vector<MachineInstr *> RegionFirstMIs;
1073 RegionFirstMIs.reserve(Regions.size());
1074 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1075 RegionFirstMIs.push_back(
1077
1078 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1079}
1080
1082GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1083 assert(!Regions.empty());
1084 std::vector<MachineInstr *> RegionLastMIs;
1085 RegionLastMIs.reserve(Regions.size());
1086 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1087 // Skip empty regions.
1088 if (RegionBegin == RegionEnd)
1089 continue;
1090 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1091 }
1092 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1093}
1094
1096 IdxToInstruction.clear();
1097
1098 RegionLiveRegMap =
1099 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1100 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1101 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1102 // Skip empty regions.
1103 if (RegionBegin == RegionEnd)
1104 continue;
1105 MachineInstr *RegionKey =
1106 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1107 IdxToInstruction[I] = RegionKey;
1108 }
1109}
1110
1112 // Start actual scheduling here. This function is called by the base
1113 // MachineScheduler after all regions have been recorded by
1114 // GCNScheduleDAGMILive::schedule().
1115 LiveIns.resize(Regions.size());
1116 Pressure.resize(Regions.size());
1117 RegionsWithHighRP.resize(Regions.size());
1118 RegionsWithExcessRP.resize(Regions.size());
1119 RegionsWithIGLPInstrs.resize(Regions.size());
1120 RegionsWithHighRP.reset();
1121 RegionsWithExcessRP.reset();
1122 RegionsWithIGLPInstrs.reset();
1123
1124 runSchedStages();
1125}
1126
1127void GCNScheduleDAGMILive::runSchedStages() {
1128 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1129
1130 if (!Regions.empty()) {
1131 BBLiveInMap = getRegionLiveInMap();
1132 if (GCNTrackers)
1133 RegionLiveOuts.buildLiveRegMap();
1134 }
1135
1136#ifdef DUMP_MAX_REG_PRESSURE
1140 LIS->dump();
1141 }
1142#endif
1143
1144 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1145 while (S.advanceStage()) {
1146 auto Stage = createSchedStage(S.getCurrentStage());
1147 if (!Stage->initGCNSchedStage())
1148 continue;
1149
1150 for (auto Region : Regions) {
1151 RegionBegin = Region.first;
1152 RegionEnd = Region.second;
1153 // Setup for scheduling the region and check whether it should be skipped.
1154 if (!Stage->initGCNRegion()) {
1155 Stage->advanceRegion();
1156 exitRegion();
1157 continue;
1158 }
1159
1160 if (GCNTrackers) {
1161 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1162 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1163 GCNRPTracker::LiveRegSet *RegionLiveIns =
1164 &LiveIns[Stage->getRegionIdx()];
1165
1166 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1167 ->reset(MRI, *RegionLiveIns);
1168 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1169 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
1170 Stage->getRegionIdx()));
1171 }
1172
1174 Stage->finalizeGCNRegion();
1175 }
1176
1177 Stage->finalizeGCNSchedStage();
1178 }
1179
1180#ifdef DUMP_MAX_REG_PRESSURE
1184 LIS->dump();
1185 }
1186#endif
1187}
1188
1189#ifndef NDEBUG
1191 switch (StageID) {
1193 OS << "Max Occupancy Initial Schedule";
1194 break;
1196 OS << "Instruction Rewriting Reschedule";
1197 break;
1199 OS << "Unclustered High Register Pressure Reschedule";
1200 break;
1202 OS << "Clustered Low Occupancy Reschedule";
1203 break;
1205 OS << "Pre-RA Rematerialize";
1206 break;
1208 OS << "Max ILP Initial Schedule";
1209 break;
1211 OS << "Max memory clause Initial Schedule";
1212 break;
1213 }
1214
1215 return OS;
1216}
1217#endif
1218
1222
1224 if (!DAG.LIS)
1225 return false;
1226
1227 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1228 return true;
1229}
1230
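/// Collects in \p DefIdxs the slot indexes of the definitions of \p UseMO's
/// register that can reach \p UseMO. For a non-PHI value this is the single
/// defining instruction; otherwise predecessor blocks are walked backwards
/// through PHI-joined values until concrete (non-PHI) defs are found.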
1231void RewriteMFMAFormStage::findReachingDefs(
1232 MachineOperand &UseMO, LiveIntervals *LIS,
1233 SmallVectorImpl<SlotIndex> &DefIdxs) {
1234 MachineInstr *UseMI = UseMO.getParent();
1235 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1236 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1237
1238 // If the def is not a PHI, then it must be the only reaching def.
1239 if (!VNI->isPHIDef()) {
1240 DefIdxs.push_back(VNI->def);
1241 return;
1242 }
1243
1244 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1245 SmallVector<MachineBasicBlock *, 8> Worklist;
1246
1247 // Mark the predecessor blocks for traversal
1248 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1249 Worklist.push_back(PredMBB);
1250 Visited.insert(PredMBB);
1251 }
1252
1253 while (!Worklist.empty()) {
1254 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1255
1256 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1257 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1258
1259 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1260
1261 // If there is a def in this block, then add it to the list. This is the
1262 // reaching def of this path.
1263 if (!VNI->isPHIDef()) {
1264 DefIdxs.push_back(VNI->def);
1265 continue;
1266 }
1267
1268 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1269 if (Visited.insert(PredMBB).second)
1270 Worklist.push_back(PredMBB);
1271 }
1272 }
1273}
1274
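/// Collects in \p ReachingUses every non-debug use of \p DefMI's result
/// register for which \p DefMI is one of the reaching definitions.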
1275void RewriteMFMAFormStage::findReachingUses(
1277 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1278 SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
1279 for (MachineOperand &UseMO :
1280 DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
1281 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1282 findReachingDefs(UseMO, LIS, ReachingDefIndexes);
1283
1284 // If we find a use that contains this DefMI in its reachingDefs, then it is
1285 // a reaching use.
1286 if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
1287 return SlotIndex::isSameInstr(RDIdx, DefIdx);
1288 }))
1289 ReachingUses.push_back(&UseMO);
1290 }
1291}
1292
1294 // We only need to run this pass if the architecture supports AGPRs.
1295 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1296 // is no need for this pass in that case, either.
1297 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1298 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1299 return false;
1300
1301 RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
1302 RegionsWithExcessArchVGPR.reset();
1303 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1305 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1306 RegionsWithExcessArchVGPR[Region] = true;
1307 }
1308
1309 if (RegionsWithExcessArchVGPR.none())
1310 return false;
1311
1312 TII = ST.getInstrInfo();
1313 SRI = ST.getRegisterInfo();
1314
1315 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1318
1319 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1320 return false;
1321
1322 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1323
1324 // If we haven't found the beneficial conditions, prefer the VGPR form which
1325  // may result in fewer cross-RC copies.
1326 if (Cost > 0)
1327 return false;
1328
1329 return rewrite(RewriteCands);
1330}
1331
1334 return false;
1335
1337 return false;
1338
1339 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1340 return false;
1341
1342 SavedMutations.swap(DAG.Mutations);
1343 DAG.addMutation(
1345
1346 InitialOccupancy = DAG.MinOccupancy;
1347 // Aggressively try to reduce register pressure in the unclustered high RP
1348 // stage. Temporarily increase occupancy target in the region.
1349 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1350 ? InitialOccupancy + 1
1351 : InitialOccupancy;
1352 IsAnyRegionScheduled = false;
1353 S.SGPRLimitBias = S.HighRPSGPRBias;
1354 S.VGPRLimitBias = S.HighRPVGPRBias;
1355
1356 LLVM_DEBUG(
1357 dbgs()
1358 << "Retrying function scheduling without clustering. "
1359 "Aggressively try to reduce register pressure to achieve occupancy "
1360 << TempTargetOccupancy << ".\n");
1361
1362 return true;
1363}
1364
1367 return false;
1368
1370 return false;
1371
1372 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1373 // been dropped. All regions will have already been scheduled with the ideal
1374 // occupancy targets.
1375 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1376 return false;
1377
1378 LLVM_DEBUG(
1379 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1380 << DAG.MinOccupancy << ".\n");
1381 return true;
1382}
1383
1384/// Allows easy filtering of this stage's debug output.
1385#define REMAT_PREFIX "[PreRARemat] "
1386#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1387
1389 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1390  // regions in between the defs and the region we sank the def to. This will
1391  // need to be fixed if another pass runs after this one.
1392 assert(!S.hasNextStage());
1393
1394 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
1395 return false;
1396
1397 // Before performing any IR modification record the parent region of each MI
1398 // and the parent MBB of each region.
1399 const unsigned NumRegions = DAG.Regions.size();
1400 RegionBB.reserve(NumRegions);
1401 for (unsigned I = 0; I < NumRegions; ++I) {
1402 RegionBoundaries Region = DAG.Regions[I];
1403 for (auto MI = Region.first; MI != Region.second; ++MI)
1404 MIRegion.insert({&*MI, I});
1405 RegionBB.push_back(Region.first->getParent());
1406 }
1407
1408 if (!canIncreaseOccupancyOrReduceSpill())
1409 return false;
1410
1411 // Rematerialize identified instructions and update scheduler's state.
1412 rematerialize();
1413 if (GCNTrackers)
1414 DAG.RegionLiveOuts.buildLiveRegMap();
1415 REMAT_DEBUG({
1416 dbgs() << "Retrying function scheduling with new min. occupancy of "
1417 << AchievedOcc << " from rematerializing (original was "
1418 << DAG.MinOccupancy;
1419 if (TargetOcc)
1420 dbgs() << ", target was " << *TargetOcc;
1421 dbgs() << ")\n";
1422 });
1423
1424 if (AchievedOcc > DAG.MinOccupancy) {
1425 DAG.MinOccupancy = AchievedOcc;
1427 MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1428 }
1429 return true;
1430}
1431
1433 DAG.finishBlock();
1434 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1435}
1436
1438 SavedMutations.swap(DAG.Mutations);
1439 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1440 if (DAG.MinOccupancy > InitialOccupancy) {
1441 assert(IsAnyRegionScheduled);
1443 << " stage successfully increased occupancy to "
1444 << DAG.MinOccupancy << '\n');
1445 } else if (!IsAnyRegionScheduled) {
1446 assert(DAG.MinOccupancy == InitialOccupancy);
1448 << ": No regions scheduled, min occupancy stays at "
1449 << DAG.MinOccupancy << ", MFI occupancy stays at "
1450 << MFI.getOccupancy() << ".\n");
1451 }
1452
1454}
1455
1457 // Check whether this new region is also a new block.
1458 if (DAG.RegionBegin->getParent() != CurrentMBB)
1459 setupNewBlock();
1460
1461 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1462 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1463
1464 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1465 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1466 return false;
1467
1468 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1469 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1470 << " " << CurrentMBB->getName()
1471 << "\n From: " << *DAG.begin() << " To: ";
1472 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1473 else dbgs() << "End";
1474 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1475
1476 // Save original instruction order before scheduling for possible revert.
1477 Unsched.clear();
1478 Unsched.reserve(DAG.NumRegionInstrs);
1481 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1482 for (auto &I : DAG) {
1483 Unsched.push_back(&I);
1484 if (SII->isIGLPMutationOnly(I.getOpcode()))
1485 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1486 }
1487 } else {
1488 for (auto &I : DAG)
1489 Unsched.push_back(&I);
1490 }
1491
1492 PressureBefore = DAG.Pressure[RegionIdx];
1493
1494 LLVM_DEBUG(
1495 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1496 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1497 << "Region live-in pressure: "
1498 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1499 << "Region register pressure: " << print(PressureBefore));
1500
1501 S.HasHighPressure = false;
1502 S.KnownExcessRP = isRegionWithExcessRP();
1503
1504 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1506 SavedMutations.clear();
1507 SavedMutations.swap(DAG.Mutations);
1508 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1510 DAG.addMutation(createIGroupLPDAGMutation(
1511 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1513 }
1514
1515 return true;
1516}
1517
1519 // Only reschedule regions that have excess register pressure (i.e. spilling)
1520 // or had minimum occupancy at the beginning of the stage (as long as
1521 // rescheduling of previous regions did not make occupancy drop back down to
1522 // the initial minimum).
1523 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1524 // If no region has been scheduled yet, the DAG has not yet been updated with
1525 // the occupancy target. So retrieve it from the temporary.
1526 unsigned CurrentTargetOccupancy =
1527 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1528 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1529 (CurrentTargetOccupancy <= InitialOccupancy ||
1530 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1531 InitialOccupancy))
1532 return false;
1533
1534 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1535 // If this is the first region scheduled during this stage, make the target
1536 // occupancy changes in the DAG and MFI.
1537 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1538 IsAnyRegionScheduled = true;
1539 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
1540 DAG.MinOccupancy = TempTargetOccupancy;
1541 MFI.increaseOccupancy(MF, TempTargetOccupancy);
1542 }
1543 }
1544 return IsSchedulingThisRegion;
1545}
1546
1548 // We may need to reschedule this region if it wasn't rescheduled in the last
1549 // stage, or if we found it was testing critical register pressure limits in
1550  // the unclustered reschedule stage. The latter is because we may not have been
1551  // able to raise the min occupancy in the previous stage, so the region may be
1552 // overly constrained even if it was already rescheduled.
1553 if (!DAG.RegionsWithHighRP[RegionIdx])
1554 return false;
1555
1557}
1558
1560 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1561}
1562
1564 if (CurrentMBB)
1565 DAG.finishBlock();
1566
1567 CurrentMBB = DAG.RegionBegin->getParent();
1568 DAG.startBlock(CurrentMBB);
1569  // Get real RP for the region if it hasn't been calculated before. After the
1570  // initial schedule stage, real RP will be collected after scheduling.
1574 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1575}
1576
1578 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1579 if (S.HasHighPressure)
1580 DAG.RegionsWithHighRP[RegionIdx] = true;
1581
1582 // Revert scheduling if we have dropped occupancy or there is some other
1583 // reason that the original schedule is better.
1585
1586 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1588 SavedMutations.swap(DAG.Mutations);
1589
1590 DAG.exitRegion();
1591 advanceRegion();
1592}
1593
1595 // Check the results of scheduling.
1596 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1597
1598 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1599 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1600
1601 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1602
1603 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1604 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1605 DAG.Pressure[RegionIdx] = PressureAfter;
1606
1607 // Early out if we have achieved the occupancy target.
1608 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1609 return;
1610 }
1611
1612 unsigned TargetOccupancy = std::min(
1613 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1614 unsigned WavesAfter = std::min(
1615 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1616 unsigned WavesBefore = std::min(
1617 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1618 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1619 << ", after " << WavesAfter << ".\n");
1620
1621  // We may not be able to keep the current target occupancy because of the
1622  // just-scheduled region. We might still be able to revert scheduling if the
1623  // occupancy before was higher, or if the current schedule has register
1624  // pressure higher than the excess limits, which could lead to more spilling.
1625 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1626
1627 // Allow memory bound functions to drop to 4 waves if not limited by an
1628 // attribute.
1629 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1630 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1631 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1632 << MFI.getMinAllowedOccupancy() << " waves\n");
1633 NewOccupancy = WavesAfter;
1634 }
1635
1636 if (NewOccupancy < DAG.MinOccupancy) {
1637 DAG.MinOccupancy = NewOccupancy;
1638 MFI.limitOccupancy(DAG.MinOccupancy);
1639 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1640 << DAG.MinOccupancy << ".\n");
1641 }
1642 // The maximum number of arch VGPR on non-unified register file, or the
1643 // maximum VGPR + AGPR in the unified register file case.
1644 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1645 // The maximum number of arch VGPR for both unified and non-unified register
1646 // file.
1647 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1648 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1649
1650 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1651 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1652 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1653 PressureAfter.getSGPRNum() > MaxSGPRs) {
1654 DAG.RegionsWithHighRP[RegionIdx] = true;
1655 DAG.RegionsWithExcessRP[RegionIdx] = true;
1656 }
1657
1658 // Revert if this region's schedule would cause a drop in occupancy or
1659 // spilling.
1660 if (shouldRevertScheduling(WavesAfter))
1662 else
1663 DAG.Pressure[RegionIdx] = PressureAfter;
1664}
1665
1666unsigned
1667GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1668 DenseMap<unsigned, unsigned> &ReadyCycles,
1669 const TargetSchedModel &SM) {
1670 unsigned ReadyCycle = CurrCycle;
1671 for (auto &D : SU.Preds) {
1672 if (D.isAssignedRegDep()) {
1673 MachineInstr *DefMI = D.getSUnit()->getInstr();
1674 unsigned Latency = SM.computeInstrLatency(DefMI);
1675 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1676 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1677 }
1678 }
1679 ReadyCycles[SU.NodeNum] = ReadyCycle;
1680 return ReadyCycle;
1681}
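// computeSUnitReadyCycle() example (hypothetical latencies): if CurrCycle is 6
// and the SU has two assigned register dependences whose defining instructions
// became ready at cycles 3 and 5 with latencies 4 and 2 respectively, then
// ReadyCycle = max(6, 3 + 4, 5 + 2) = 7.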
1682
1683#ifndef NDEBUG
1685 bool operator()(std::pair<MachineInstr *, unsigned> A,
1686 std::pair<MachineInstr *, unsigned> B) const {
1687 return A.second < B.second;
1688 }
1689};
1690
1691static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1692 EarlierIssuingCycle> &ReadyCycles) {
1693 if (ReadyCycles.empty())
1694 return;
1695 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1696 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1697 << " ##################\n# Cycle #\t\t\tInstruction "
1698 " "
1699 " \n";
1700 unsigned IPrev = 1;
1701 for (auto &I : ReadyCycles) {
1702 if (I.second > IPrev + 1)
1703 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1704 << " CYCLES DETECTED ******************************\n\n";
1705 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1706 IPrev = I.second;
1707 }
1708}
1709#endif
1710
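// Builds an idealized, latency-only issue-cycle model for the given schedule
// and returns the total cycle count together with the accumulated stall
// ("bubble") cycles. As the debug output below shows, the derived metric is
// SumBubbles * ScheduleMetrics::ScaleFactor / CurrCycle (or 1 when there are
// no bubbles), i.e. the percentage of cycles spent waiting on latency.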
1711ScheduleMetrics
1712GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1713#ifndef NDEBUG
1714 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1715 ReadyCyclesSorted;
1716#endif
1717 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1718 unsigned SumBubbles = 0;
1719 DenseMap<unsigned, unsigned> ReadyCycles;
1720 unsigned CurrCycle = 0;
1721 for (auto &SU : InputSchedule) {
1722 unsigned ReadyCycle =
1723 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1724 SumBubbles += ReadyCycle - CurrCycle;
1725#ifndef NDEBUG
1726 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1727#endif
1728 CurrCycle = ++ReadyCycle;
1729 }
1730#ifndef NDEBUG
1731 LLVM_DEBUG(
1732 printScheduleModel(ReadyCyclesSorted);
1733 dbgs() << "\n\t"
1734 << "Metric: "
1735 << (SumBubbles
1736 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1737 : 1)
1738 << "\n\n");
1739#endif
1740
1741 return ScheduleMetrics(CurrCycle, SumBubbles);
1742}
1743
1744ScheduleMetrics
1745GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1746#ifndef NDEBUG
1747 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1748 ReadyCyclesSorted;
1749#endif
1750 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1751 unsigned SumBubbles = 0;
1752 DenseMap<unsigned, unsigned> ReadyCycles;
1753 unsigned CurrCycle = 0;
1754 for (auto &MI : DAG) {
1755 SUnit *SU = DAG.getSUnit(&MI);
1756 if (!SU)
1757 continue;
1758 unsigned ReadyCycle =
1759 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1760 SumBubbles += ReadyCycle - CurrCycle;
1761#ifndef NDEBUG
1762 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1763#endif
1764 CurrCycle = ++ReadyCycle;
1765 }
1766#ifndef NDEBUG
1767 LLVM_DEBUG(
1768 printScheduleModel(ReadyCyclesSorted);
1769 dbgs() << "\n\t"
1770 << "Metric: "
1771 << (SumBubbles
1772 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1773 : 1)
1774 << "\n\n");
1775#endif
1776
1777 return ScheduleMetrics(CurrCycle, SumBubbles);
1778}
1779
1780bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1781 if (WavesAfter < DAG.MinOccupancy)
1782 return true;
1783
1784 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1785 if (DAG.MFI.isDynamicVGPREnabled()) {
1786 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1787 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1788 PressureBefore.getVGPRNum(false));
1789 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1790 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1791 PressureAfter.getVGPRNum(false));
1792 if (BlocksAfter > BlocksBefore)
1793 return true;
1794 }
1795
1796 return false;
1797}
1798
1799bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1800 if (PressureAfter == PressureBefore)
1801 return false;
1802
1803 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1804 return true;
1805
1806 if (mayCauseSpilling(WavesAfter))
1807 return true;
1808
1809 return false;
1810}
1811
1812bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1813 // If RP is not reduced in the unclustered reschedule stage, revert to the
1814 // old schedule.
1815 if ((WavesAfter <=
1816 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
1817 mayCauseSpilling(WavesAfter)) ||
1818 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1819 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1820 return true;
1821 }
1822
1823 // Do not attempt to relax schedule even more if we are already spilling.
1824 if (isRegionWithExcessRP())
1825 return false;
1826
1827 LLVM_DEBUG(
1828 dbgs()
1829 << "\n\t *** In shouldRevertScheduling ***\n"
1830 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1831 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1832 LLVM_DEBUG(
1833 dbgs()
1834 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1835 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1836 unsigned OldMetric = MBefore.getMetric();
1837 unsigned NewMetric = MAfter.getMetric();
1838 unsigned WavesBefore = std::min(
1839 S.getTargetOccupancy(),
1840 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
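 // Profit folds the occupancy ratio (WavesAfter / WavesBefore) and the latency
 // metric ratio ((OldMetric + ScheduleMetricBias) / NewMetric) into one
 // fixed-point value scaled by ScaleFactor (100). For example, with
 // WavesAfter == WavesBefore, OldMetric == 40, NewMetric == 60 and the default
 // bias of 10, Profit is (100 * (50 * 100) / 60) / 100 = 83, which is below
 // ScaleFactor, so the unclustered schedule is reverted.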
1841 unsigned Profit =
1842 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1843 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1844 NewMetric) /
1845 ScheduleMetrics::ScaleFactor;
1846 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1847 << MAfter << "Profit: " << Profit << "\n");
1848 return Profit < ScheduleMetrics::ScaleFactor;
1849}
1850
1851bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1852 if (PressureAfter == PressureBefore)
1853 return false;
1854
1855 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1856 return true;
1857
1858 if (mayCauseSpilling(WavesAfter))
1859 return true;
1860
1861 return false;
1862}
1863
1864bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1865 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1866 mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1867}
1868
1869bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1870 if (mayCauseSpilling(WavesAfter))
1871 return true;
1872
1873 return false;
1874}
1875
1876bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
1877 unsigned WavesAfter) {
1878 return mayCauseSpilling(WavesAfter);
1879}
1880
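// Heuristic spilling check: with occupancy already at the minimum waves per EU
// and the region marked as having excess register pressure, a schedule whose
// pressure is not lower than the input schedule's is assumed to spill more.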
1881bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1882 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1883 !PressureAfter.less(MF, PressureBefore)) {
1884 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1885 return true;
1886 }
1887
1888 return false;
1889}
1890
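// Restores the pre-scheduling instruction order recorded in Unsched, moving
// debug instructions out of the way first, then fixing up LiveIntervals,
// read-undef/dead-def flags and the recorded region boundaries.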
1891void GCNSchedStage::revertScheduling() {
1892 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1893 DAG.RegionEnd = DAG.RegionBegin;
1894 int SkippedDebugInstr = 0;
1895 for (MachineInstr *MI : Unsched) {
1896 if (MI->isDebugInstr()) {
1897 ++SkippedDebugInstr;
1898 continue;
1899 }
1900
1901 MachineBasicBlock::iterator MII = MI->getIterator();
1902 if (MII != DAG.RegionEnd) {
1903 // Will subsequent splice move MI up past a non-debug instruction?
1904 bool NonDebugReordered =
1905 skipDebugInstructionsForward(DAG.RegionEnd, MII) != MII;
1906 DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
1907 // Only update LiveIntervals information if non-debug instructions are
1908 // reordered. Otherwise debug instructions could cause code generation to
1909 // change.
1910 if (NonDebugReordered)
1911 DAG.LIS->handleMove(*MI, true);
1912 }
1913
1914 // Reset read-undef flags and update them later.
1915 for (auto &Op : MI->all_defs())
1916 Op.setIsUndef(false);
1917 RegisterOperands RegOpers;
1918 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1919 if (!MI->isDebugInstr()) {
1920 if (DAG.ShouldTrackLaneMasks) {
1921 // Adjust liveness and add missing dead+read-undef flags.
1922 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1923 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1924 } else {
1925 // Adjust for missing dead-def flags.
1926 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1927 }
1928 }
1929 DAG.RegionEnd = MI->getIterator();
1930 ++DAG.RegionEnd;
1931 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1932 }
1933
1934 // After reverting schedule, debug instrs will now be at the end of the block
1935 // and RegionEnd will point to the first debug instr. Increment RegionEnd
1936 past debug instrs to the actual end of the scheduling region.
1937 while (SkippedDebugInstr-- > 0)
1938 ++DAG.RegionEnd;
1939
1940 // If Unsched.front() instruction is a debug instruction, this will actually
1941 // shrink the region since we moved all debug instructions to the end of the
1942 // block. Find the first instruction that is not a debug instruction.
1943 DAG.RegionBegin = Unsched.front()->getIterator();
1944 if (DAG.RegionBegin->isDebugInstr()) {
1945 for (MachineInstr *MI : Unsched) {
1946 if (MI->isDebugInstr())
1947 continue;
1948 DAG.RegionBegin = MI->getIterator();
1949 break;
1950 }
1951 }
1952
1953 // Then move the debug instructions back into their correct place and set
1954 // RegionBegin and RegionEnd if needed.
1955 DAG.placeDebugValues();
1956
1957 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1958}
1959
1960bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
1961
1962 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
1963 return false;
1964 return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
1965}
1966
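// First pass over the function: provisionally flips every candidate MFMA to
// its AGPR (src2/dst) form, records which non-MFMA defs and uses would need
// VGPR<->AGPR copies, and temporarily reclassifies the dst/src2 registers so
// register pressure can be re-evaluated with the AGPR form in place.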
1967bool RewriteMFMAFormStage::initHeuristics(
1968 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
1969 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
1970 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
1971 bool Changed = false;
1972
1973 // Prepare for the heuristics
1974 for (MachineBasicBlock &MBB : MF) {
1975 for (MachineInstr &MI : MBB) {
1976 if (!isRewriteCandidate(&MI))
1977 continue;
1978
1979 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
1980 assert(ReplacementOp != -1);
1981
1982 RewriteCands.push_back({&MI, MI.getOpcode()});
1983 MI.setDesc(TII->get(ReplacementOp));
1984
1985 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1986 if (Src2->isReg()) {
1987 SmallVector<SlotIndex, 8> Src2ReachingDefs;
1988 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
1989
1990 // For any definition of the src2 register which is non-MFMA, we
1991 // insert a copy.
1992 for (SlotIndex RDIdx : Src2ReachingDefs) {
1993 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
1994 if (!TII->isMAI(*RD))
1995 CopyForDef.insert(RD);
1996 }
1997 }
1998
1999 MachineOperand &Dst = MI.getOperand(0);
2000 SmallVector<MachineOperand *, 8> DstReachingUses;
2001
2002 findReachingUses(&MI, DAG.LIS, DstReachingUses);
2003
2004 for (MachineOperand *RUOp : DstReachingUses) {
2005 if (TII->isMAI(*RUOp->getParent()))
2006 continue;
2007
2008 // For any user of the result of the MFMA which is not an MFMA, we
2009 // insert a copy. For a given register, we will only insert one copy
2010 // per user block.
2011 CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
2012
2013 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2014 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2015
2016 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2017 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2018 if (TII->isMAI(*RD))
2019 continue;
2020
2021 // For any definition of the user of the MFMA which is not an MFMA,
2022 // we insert a copy. We do this to transform all the reaching defs
2023 // of this use to AGPR. By doing this, we can insert a copy from
2024 // AGPR to VGPR at the user rather than after the MFMA.
2025 CopyForDef.insert(RD);
2026 }
2027 }
2028
2029 // Do the rewrite to allow for updated RP calculation.
2030 const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
2031 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
2032 DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
2033 if (Src2->isReg())
2034 DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
2035 Changed = true;
2036 }
2037 }
2038
2039 return Changed;
2040}
2041
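// Estimates the profitability of the rewrite: a negative contribution for the
// largest block-frequency-weighted reduction in expected VGPR spills across
// regions with excess ArchVGPR pressure, plus a positive contribution for each
// VGPR<->AGPR copy the rewrite would need, also weighted by relative block
// frequency. Bails out early with a positive cost if any region would spill
// more, and restores the provisional rewrites done by initHeuristics before
// returning.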
2042int64_t RewriteMFMAFormStage::getRewriteCost(
2043 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2044 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2045 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2046 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2047
2048 int64_t BestSpillCost = 0;
2049 int64_t Cost = 0;
2050 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2051
2052 std::pair<unsigned, unsigned> MaxVectorRegs =
2053 ST.getMaxNumVectorRegs(MF.getFunction());
2054 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2055 unsigned AGPRThreshold = MaxVectorRegs.second;
2056 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2057
2058 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2059 if (!RegionsWithExcessArchVGPR[Region])
2060 continue;
2061
2062 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2063 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2064 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2065
2066 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2067 // addressable limit), rewriting alone should bring pressure to manageable
2068 // level. If we find any such region, then the rewrite is potentially
2069 // beneficial.
2070 GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
2071 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2072 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2073
2074 uint64_t BlockFreq =
2075 MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
2076 .getFrequency();
2077
2078 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2079 uint64_t RelativeFreq = EntryFreq && BlockFreq
2080 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2081 : BlockFreq / EntryFreq)
2082 : 1;
2083
2084 // This assumes perfect spilling / splitting -- using one spill / copy
2085 // instruction and one restoreFrom / copy for each excess register.
2086 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2087
2088 // Also account for the block frequency.
2089 if (RelativeFreqIsDenom)
2090 SpillCost /= (int64_t)RelativeFreq;
2091 else
2092 SpillCost *= (int64_t)RelativeFreq;
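 // For example, a region inside a loop whose block frequency is 8x the entry
 // frequency has its spill-cost delta multiplied by 8, while a cold region at
 // 1/4 of the entry frequency has it divided by 4.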
2093
2094 // If we have increased spilling in any block, just bail.
2095 if (SpillCost > 0)
2096 return SpillCost;
2097
2098 if (SpillCost < BestSpillCost)
2099 BestSpillCost = SpillCost;
2100 }
2101
2102 // Set the cost to the largest decrease in spill cost in order to not double
2103 // count spill reductions.
2104 Cost = BestSpillCost;
2105 assert(Cost <= 0);
2106
2107 unsigned CopyCost = 0;
2108
2109 // For each CopyForDef, increase the cost by the register size while
2110 // accounting for block frequency.
2111 for (MachineInstr *DefMI : CopyForDef) {
2112 Register DefReg = DefMI->getOperand(0).getReg();
2113 uint64_t DefFreq =
2114 EntryFreq
2115 ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
2116 : 1;
2117
2118 const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
2119 CopyCost += RC->getCopyCost() * DefFreq;
2120 }
2121
2122 // Account for CopyForUse copies in each block that the register is used.
2123 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2124 uint64_t UseFreq =
2125 EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
2126
2127 for (Register UseReg : UseRegs) {
2128 const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
2129 CopyCost += RC->getCopyCost() * UseFreq;
2130 }
2131 }
2132
2133 // Reset the classes that were changed to AGPR for better RB analysis.
2134 // We must do rewriting after copy-insertion, as some defs of the register
2135 // may require VGPR. Additionally, if we bail out and don't perform the
2136 // rewrite then these need to be restored anyway.
2137 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2138 assert(TII->isMAI(*MI));
2139 const TargetRegisterClass *AGPRRC =
2140 DAG.MRI.getRegClass(MI->getOperand(0).getReg());
2141 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
2142
2143 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2144 assert(Src2);
2145
2146 if (Src2->isReg())
2147 DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
2148 DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
2149 MI->setDesc(TII->get(OriginalOpcode));
2150 }
2151
2152 return Cost + CopyCost;
2153}
2154
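// Performs the actual rewrite of the candidate MFMAs to their AGPR form,
// inserting the cross-class copies described above and keeping LiveIntervals,
// region boundaries and the cached live-in/pressure maps up to date. Returns
// false if an MFMA operand uses a non-virtual register.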
2155bool RewriteMFMAFormStage::rewrite(
2156 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2157 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2158 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2159
2160 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2161 RegionBoundaries Entry = DAG.Regions[Region];
2162 if (Entry.first == Entry.second)
2163 continue;
2164
2165 FirstMIToRegion[&*Entry.first] = Region;
2166 if (Entry.second != Entry.first->getParent()->end())
2167 LastMIToRegion[&*Entry.second] = Region;
2168 }
2169
2170 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2171 // The general assumption of the algorithm (and the previous cost calculation)
2172 // is that it is better to insert the copies in the MBB of the def of the src2
2173 // operands, and in the MBB of the user of the dest operands. This is based on
2174 // the assumption that the MFMAs are likely to appear in loop bodies, while
2175 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2176 // design, the algorithm for finding copy insertion points is more
2177 // complicated.
2178 //
2179 // There are three main cases to handle: 1. the reaching defs of the src2
2180 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2181 // defs of the reaching uses of the dst operand.
2182 //
2183 // In the first case, we simply insert copies after each of the reaching
2184 // definitions. In the second case, we collect all the uses of a given dest
2185 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2186 // earliest use. Since the use may have multiple reaching defs, and since we
2187 // want to replace the register it is using with the result of the copy, we
2188 // must handle case 3. In the third case, we simply insert a copy after each
2189 // of the reaching defs to connect to the copy of the reaching uses of the dst
2190 // reg. This allows us to avoid inserting copies next to the MFMAs.
2191 //
2192 // While inserting the copies, we maintain a map of operands which will use
2193 // different regs (i.e. the result of the copies). For example, a case 1 src2
2194 // operand will use the register result of the copies after the reaching defs,
2195 // as opposed to the original register. Now that we have completed our copy
2196 // analysis and placement, we can bulk update the registers. We do this
2197 // separately as to avoid complicating the reachingDef and reachingUse
2198 // queries.
2199 //
2200 // While inserting the copies, we also maintain a list of registers which we
2201 // will want to reclassify as AGPR. After doing the copy insertion and the
2202 // register replacement, we can finally do the reclassification. This uses the
2203 // redef map, as the registers we are interested in reclassifying may be
2204 // replaced by the result of a copy. We must do this after the copy analysis
2205 // and placement as we must have an accurate redef map -- otherwise we may end
2206 // up creating illegal instructions.
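 //
 // Schematic example of the three cases (illustrative only, not actual MIR):
 //
 //   bb.0:
 //     %a = <non-MFMA def>         ; case 1: "%a' = COPY %a" inserted here so
 //                                 ;         the MFMA src2 can become AGPR
 //   bb.1 (loop):
 //     %d = MFMA ..., %a'          ; rewritten to the src2/dst-AGPR form
 //   bb.2:
 //     %d = <non-MFMA def>         ; case 3: another reaching def of the use
 //                                 ;         below; a copy is inserted after it
 //     ... = <non-MFMA use of %d>  ; case 2: one copy per using block, placed
 //                                 ;         before the earliest use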
2207
2208 // The original registers of the MFMA that need to be reclassified as AGPR.
2209 DenseSet<Register> RewriteRegs;
2210 // The map of an original register in the MFMA to a new register (result of a
2211 // copy) that it should be replaced with.
2212 DenseMap<Register, Register> RedefMap;
2213 // The map of the original MFMA registers to the relevant MFMA operands.
2214 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2215 // The map of reaching defs for a given register -- to avoid duplicate copies.
2216 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2217 // The map of reaching uses for a given register by basic block -- to avoid
2218 // duplicate copies and to calculate per MBB insert pts.
2219 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2220 ReachingUseTracker;
2221
2222 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2223 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2224 if (ReplacementOp == -1)
2225 continue;
2226 MI->setDesc(TII->get(ReplacementOp));
2227
2228 // Case 1: insert copies for the reaching defs of the Src2Reg.
2229 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2230 if (Src2->isReg()) {
2231 Register Src2Reg = Src2->getReg();
2232 if (!Src2Reg.isVirtual())
2233 return false;
2234
2235 Register MappedReg = Src2->getReg();
2236 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2237 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2238 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2239
2240 for (SlotIndex RDIndex : Src2ReachingDefs) {
2241 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2242 if (TII->isMAI(*RD))
2243 continue;
2244
2245 // If there is a non-MFMA reaching def, then we need a copy.
2246 Src2DefsReplace.insert(RD);
2247 }
2248
2249 if (!Src2DefsReplace.empty()) {
2250 DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
2251 if (RI != RedefMap.end()) {
2252 MappedReg = RI->second;
2253 } else {
2254 assert(!ReachingDefCopyMap.contains(Src2Reg));
2255 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2256 const TargetRegisterClass *VGPRRC =
2257 SRI->getEquivalentVGPRClass(Src2RC);
2258
2259 // Track the mapping of the original register to the new register.
2260 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2261 RedefMap[Src2Reg] = MappedReg;
2262 }
2263
2264 // If none exists, create a copy from this reaching def.
2265 // We may have inserted a copy already in an earlier iteration.
2266 for (MachineInstr *RD : Src2DefsReplace) {
2267 // Do not create redundant copies.
2268 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2269 MachineInstrBuilder VGPRCopy =
2270 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2271 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2272 .addDef(MappedReg, {}, 0)
2273 .addUse(Src2Reg, {}, 0);
2274 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2275
2276 // If this reaching def was the last MI in the region, update the
2277 // region boundaries.
2278 if (LastMIToRegion.contains(RD)) {
2279 unsigned UpdateRegion = LastMIToRegion[RD];
2280 DAG.Regions[UpdateRegion].second = VGPRCopy;
2281 LastMIToRegion.erase(RD);
2282 }
2283 }
2284 }
2285 }
2286
2287 // Track the register for reclassification
2288 RewriteRegs.insert(Src2Reg);
2289
2290 // Always insert the operand for replacement. If this corresponds with a
2291 // chain of tied-def we may not see the VGPR requirement until later.
2292 ReplaceMap[Src2Reg].insert(Src2);
2293 }
2294
2295 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2296 // and after the reaching defs of the reaching uses of the dsts.
2297
2298 MachineOperand *Dst = &MI->getOperand(0);
2299 Register DstReg = Dst->getReg();
2300 if (!DstReg.isVirtual())
2301 return false;
2302
2303 Register MappedReg = DstReg;
2304 SmallVector<MachineOperand *, 8> DstReachingUses;
2305
2306 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2307 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2308
2309 findReachingUses(MI, DAG.LIS, DstReachingUses);
2310
2311 for (MachineOperand *RUOp : DstReachingUses) {
2312 if (TII->isMAI(*RUOp->getParent()))
2313 continue;
2314
2315 // If there is a non-MFMA reaching use, then we need a copy.
2316 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2317 DstReachingUseCopies.push_back(RUOp);
2318 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2319 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2320
2321 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2322 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2323 if (TII->isMAI(*RD))
2324 continue;
2325
2326 // If there is a non-MFMA reaching def of this reaching use, then we will
2327 // need a copy.
2328 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2329 DstUseDefsReplace.push_back(RD);
2330 }
2331 }
2332
2333 if (!DstUseDefsReplace.empty()) {
2334 DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
2335 if (RI != RedefMap.end()) {
2336 MappedReg = RI->second;
2337 } else {
2338 assert(!ReachingDefCopyMap.contains(DstReg));
2339 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2340 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2341
2342 // Track the mapping of the original register to the new register.
2343 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2344 RedefMap[DstReg] = MappedReg;
2345 }
2346
2347 // If none exists, create a copy from this reaching def.
2348 // We may have inserted a copy already in an earlier iteration.
2349 for (MachineInstr *RD : DstUseDefsReplace) {
2350 // Do not create redundant copies.
2351 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2352 MachineInstrBuilder VGPRCopy =
2353 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2354 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2355 .addDef(MappedReg, {}, 0)
2356 .addUse(DstReg, {}, 0);
2357 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2358
2359 // If this reaching def was the last MI in the region, update the
2360 // region boundaries.
2361 DenseMap<MachineInstr *, unsigned>::iterator LMI =
2362 LastMIToRegion.find(RD);
2363 if (LMI != LastMIToRegion.end()) {
2364 unsigned UpdateRegion = LMI->second;
2365 DAG.Regions[UpdateRegion].second = VGPRCopy;
2366 LastMIToRegion.erase(RD);
2367 }
2368 }
2369 }
2370 }
2371
2372 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2373 for (MachineOperand *RU : DstReachingUseCopies) {
2374 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2375 // Just keep track of the reaching use of this register by block. After we
2376 // have scanned all the MFMAs we can find optimal insert pts.
2377 if (RUBlock != MI->getParent()) {
2378 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2379 continue;
2380 }
2381
2382 // Special case, the use is in the same block as the MFMA. Insert the copy
2383 // just before the use.
2384 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2385 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2386 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2387 MachineInstr *UseInst = RU->getParent();
2388 MachineInstrBuilder VGPRCopy =
2389 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2390 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2391 .addDef(NewUseReg, {}, 0)
2392 .addUse(DstReg, {}, 0);
2393 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2394 // Since we know this use has only one reaching def, we can replace the
2395 // use reg.
2396 RU->setReg(NewUseReg);
2397 // Track the copy source operand for replacement.
2398 DstRegSet.insert(&VGPRCopy->getOperand(1));
2399 }
2400
2401 // Track the register for reclassification
2402 RewriteRegs.insert(DstReg);
2403
2404 // Insert the dst operand for replacement. If this dst is in a chain of
2405 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2406 // all the corresponding operands need to be replaced.
2407 DstRegSet.insert(Dst);
2408 }
2409
2410 // Handle the copies for dst uses.
2411 using RUBType =
2412 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2413 for (RUBType RUBlockEntry : ReachingUseTracker) {
2414 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2415 for (RUDType RUDst : RUBlockEntry.second) {
2416 MachineOperand *OpBegin = *RUDst.second.begin();
2417 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2418
2419 // Find the earliest use in this block.
2420 for (MachineOperand *User : RUDst.second) {
2421 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2422 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2423 InstPt = NewInstPt;
2424 }
2425
2426 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2427 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2428 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2429 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2430
2431 MachineInstrBuilder VGPRCopy =
2432 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2433 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2434 .addDef(NewUseReg, {}, 0)
2435 .addUse(RUDst.first, {}, 0);
2436 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2437
2438 // If this UseInst was the first MI in the region, update the region
2439 // boundaries.
2440 DenseMap<MachineInstr *, unsigned>::iterator FI =
2441 FirstMIToRegion.find(UseInst);
2442 if (FI != FirstMIToRegion.end()) {
2443 unsigned UpdateRegion = FI->second;
2444 DAG.Regions[UpdateRegion].first = VGPRCopy;
2445 FirstMIToRegion.erase(UseInst);
2446 }
2447
2448 // Replace the operand for all users.
2449 for (MachineOperand *User : RUDst.second) {
2450 User->setReg(NewUseReg);
2451 }
2452
2453 // Track the copy source operand for replacement.
2454 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2455 }
2456 }
2457
2458 // We may have needed to insert copies after the reaching defs of the MFMAs.
2459 // Replace the original register with the result of the copy for all relevant
2460 // operands.
2461 for (std::pair<Register, Register> NewDef : RedefMap) {
2462 Register OldReg = NewDef.first;
2463 Register NewReg = NewDef.second;
2464
2465 // Replace the register for any associated operand in the MFMA chain.
2466 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2467 ReplaceOp->setReg(NewReg);
2468 }
2469
2470 // Finally, do the reclassification of the MFMA registers.
2471 for (Register RewriteReg : RewriteRegs) {
2472 Register RegToRewrite = RewriteReg;
2473
2474 // Be sure to update the replacement register and not the original.
2475 DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
2476 if (RI != RedefMap.end())
2477 RegToRewrite = RI->second;
2478
2479 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2480 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2481
2482 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2483 }
2484
2485 // Bulk update the LIS.
2486 DAG.LIS->reanalyze(DAG.MF);
2487 // Live-ins may have been modified for cross-RC copies.
2488 RegionPressureMap LiveInUpdater(&DAG, false);
2489 LiveInUpdater.buildLiveRegMap();
2490
2491 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2492 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2493
2494 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2495
2496 return true;
2497}
2498
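// Scans all scheduling regions and decides whether rematerializing trivially
// rematerializable defs could either reduce vector-register spilling or raise
// the achievable occupancy by one wave (TargetOcc). Populates
// Rematerializations with the selected candidates and returns whether enough
// opportunities were found for the stage to be worth running.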
2499bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
2500 const Function &F = MF.getFunction();
2501
2502 // Maps optimizable regions (i.e., regions at minimum and register-limited
2503 // occupancy, or regions with spilling) to the target RP we would like to
2504 // reach.
2505 DenseMap<unsigned, GCNRPTarget> OptRegions;
2506 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2507 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2508 bool HasVectorRegisterExcess;
2509
2510 auto ResetTargetRegions = [&]() {
2511 OptRegions.clear();
2512 HasVectorRegisterExcess = false;
2513 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2514 const GCNRegPressure &RP = DAG.Pressure[I];
2515 GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
2516 if (!Target.satisfied())
2517 OptRegions.insert({I, Target});
2518 // We are willing to consider a RP that does not satisfy the target
2519 // as long as it doesn't result in spilling to memory. We
2520 // tolerate SGPR spills to VGPR since these have a lower impact on the
2521 // performance.
2522 // VGPR/AGPR excess could be alleviated by copying to an AGPR/VGPR.
2523 // Currently, we consider any spilling as bad (we should consider this
2524 // case in the future).
2525 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2526 }
2527 };
2528
2529 ResetTargetRegions();
2530 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2531 // In addition to register usage being above addressable limits, occupancy
2532 // below the minimum is considered like "spilling" as well.
2533 TargetOcc = std::nullopt;
2534 } else {
2535 // There is no spilling and room to improve occupancy; set up "increased
2536 // occupancy targets" for all regions.
2537 TargetOcc = DAG.MinOccupancy + 1;
2538 unsigned VGPRBlockSize =
2539 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
2540 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2541 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2542 ResetTargetRegions();
2543 }
2544 REMAT_DEBUG({
2545 dbgs() << "Analyzing ";
2546 MF.getFunction().printAsOperand(dbgs(), false);
2547 dbgs() << ": ";
2548 if (OptRegions.empty()) {
2549 dbgs() << "no objective to achieve, occupancy is maximal at "
2550 << MFI.getMaxWavesPerEU();
2551 } else if (!TargetOcc) {
2552 dbgs() << "reduce spilling (minimum target occupancy is "
2553 << MFI.getMinWavesPerEU() << ')';
2554 } else {
2555 dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
2556 << TargetOcc;
2557 }
2558 dbgs() << '\n';
2559 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2560 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
2561 dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
2562 << '\n';
2563 }
2564 }
2565 });
2566 if (OptRegions.empty())
2567 return false;
2568
2569 // Accounts for a reduction in RP in an optimizable region. Returns whether we
2570 // estimate that we have identified enough rematerialization opportunities to
2571 // achieve our goal, and sets Progress to true when this particular reduction
2572 // in pressure was helpful toward that goal.
2573 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
2574 bool &Progress) -> bool {
2575 GCNRPTarget &Target = OptIt->getSecond();
2576 if (!Target.isSaveBeneficial(Reg))
2577 return false;
2578 Progress = true;
2579 Target.saveReg(Reg, Mask, DAG.MRI);
2580 if (Target.satisfied())
2581 OptRegions.erase(OptIt->getFirst());
2582 return OptRegions.empty();
2583 };
2584
2585 // We need up-to-date live-out info. to query live-out register masks in
2586 // regions containing rematerializable instructions.
2587 DAG.RegionLiveOuts.buildLiveRegMap();
2588
2589 // Cache set of registers that are going to be rematerialized.
2590 DenseSet<unsigned> RematRegs;
2591
2592 // Identify rematerializable instructions in the function.
2593 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2594 auto Region = DAG.Regions[I];
2595 for (auto MI = Region.first; MI != Region.second; ++MI) {
2596 // The instruction must be rematerializable.
2597 MachineInstr &DefMI = *MI;
2598 if (!isReMaterializable(DefMI))
2599 continue;
2600
2601 // We only support rematerializing virtual registers with one definition.
2602 Register Reg = DefMI.getOperand(0).getReg();
2603 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
2604 continue;
2605
2606 // We only care to rematerialize the instruction if it has a single
2607 // non-debug user in a different region. The using MI may not belong to a
2608 // region if it is a lone region terminator.
2609 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
2610 if (!UseMI)
2611 continue;
2612 auto UseRegion = MIRegion.find(UseMI);
2613 if (UseRegion != MIRegion.end() && UseRegion->second == I)
2614 continue;
2615
2616 // Do not rematerialize an instruction if it uses or is used by an
2617 // instruction that we have designated for rematerialization.
2618 // FIXME: Allow for rematerialization chains: this requires 1. updating
2619 // remat points to account for uses that are rematerialized, and 2. either
2620 // rematerializing the candidates in careful ordering, or deferring the
2621 // MBB RP walk until the entire chain has been rematerialized.
2622 if (Rematerializations.contains(UseMI) ||
2623 llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
2624 return MO.isReg() && RematRegs.contains(MO.getReg());
2625 }))
2626 continue;
2627
2628 // Do not rematerialize an instruction if it uses registers that aren't
2629 // available at its use. This ensures that we are not extending any live
2630 // range while rematerializing.
2631 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
2632 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
2633 *DAG.TII))
2634 continue;
2635
2636 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
2637 RematInstruction &Remat =
2638 Rematerializations.try_emplace(&DefMI, UseMI).first->second;
2639
2640 bool RematUseful = false;
2641 if (auto It = OptRegions.find(I); It != OptRegions.end()) {
2642 // Optimistically consider that moving the instruction out of its
2643 // defining region will reduce RP in the latter; this assumes that
2644 // maximum RP in the region is reached somewhere between the defining
2645 // instruction and the end of the region.
2646 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
2647 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
2648 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
2649 return true;
2650 }
2651
2652 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
2653 // We are only collecting regions in which the register is a live-in
2654 // (and may be live-through).
2655 auto It = DAG.LiveIns[LIRegion].find(Reg);
2656 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
2657 continue;
2658 Remat.LiveInRegions.insert(LIRegion);
2659
2660 // Account for the reduction in RP due to the rematerialization in an
2661 // optimizable region in which the defined register is a live-in. This
2662 // is exact for live-through region but optimistic in the using region,
2663 // where RP is actually reduced only if maximum RP is reached somewhere
2664 // between the beginning of the region and the rematerializable
2665 // instruction's use.
2666 if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
2667 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
2668 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
2669 RematUseful))
2670 return true;
2671 }
2672 }
2673
2674 // If the instruction is not a live-in or live-out in any optimizable
2675 // region then there is no point in rematerializing it.
2676 if (!RematUseful) {
2677 Rematerializations.pop_back();
2678 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
2679 } else {
2680 RematRegs.insert(Reg);
2681 }
2682 }
2683 }
2684
2685 if (TargetOcc) {
2686 // We were trying to increase occupancy but failed, abort the stage.
2687 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
2688 Rematerializations.clear();
2689 return false;
2690 }
2691 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
2692 return !Rematerializations.empty();
2693}
2694
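// Moves every selected instruction next to its single use, erasing the
// original def, updating region boundaries, live-ins and cached register
// pressure for all impacted regions, and recording the occupancy actually
// achieved after rematerialization.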
2695void PreRARematStage::rematerialize() {
2696 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2697
2698 // Collect regions whose RP changes in an unpredictable way; we will have to
2699 // fully recompute their RP after all rematerializations.
2700 DenseSet<unsigned> RecomputeRP;
2701
2702 // Rematerialize all instructions.
2703 for (auto &[DefMI, Remat] : Rematerializations) {
2704 MachineBasicBlock::iterator InsertPos(Remat.UseMI);
2705 Register Reg = DefMI->getOperand(0).getReg();
2706 unsigned DefRegion = MIRegion.at(DefMI);
2707
2708 // Rematerialize DefMI to its use block.
2709 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
2710 AMDGPU::NoSubRegister, *DefMI);
2711 Remat.RematMI = &*std::prev(InsertPos);
2712 DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
2713
2714 // Update region boundaries in regions we sank from (remove defining MI)
2715 // and to (insert MI rematerialized in use block). Only then we can erase
2716 // the original MI.
2717 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
2718 auto UseRegion = MIRegion.find(Remat.UseMI);
2719 if (UseRegion != MIRegion.end()) {
2720 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
2721 Remat.RematMI);
2722 }
2723 DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
2724 DefMI->eraseFromParent();
2725
2726 // Collect all regions impacted by the rematerialization and update their
2727 // live-in/RP information.
2728 for (unsigned I : Remat.LiveInRegions) {
2729 ImpactedRegions.insert({I, DAG.Pressure[I]});
2730 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
2731
2732#ifdef EXPENSIVE_CHECKS
2733 // All uses are known to be available / live at the remat point. Thus, the
2734 // uses should already be live in to the region.
2735 for (MachineOperand &MO : DefMI->operands()) {
2736 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2737 continue;
2738
2739 Register UseReg = MO.getReg();
2740 if (!UseReg.isVirtual())
2741 continue;
2742
2743 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
2744 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
2745 if (LI.hasSubRanges() && MO.getSubReg())
2746 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
2747
2748 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
2749 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
2750 // If this register has lanes not covered by the LiveIns, be sure they
2751 // do not map to any subrange. ref:
2752 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
2753 if (UncoveredLanes.any()) {
2754 assert(LI.hasSubRanges());
2755 for (LiveInterval::SubRange &SR : LI.subranges())
2756 assert((SR.LaneMask & UncoveredLanes).none());
2757 }
2758 }
2759#endif
2760
2761 // The register is no longer a live-in in all regions but the one that
2762 // contains the single use. In live-through regions, maximum register
2763 // pressure decreases predictably so we can directly update it. In the
2764 // using region, maximum RP may or may not decrease, so we will mark it
2765 // for re-computation after all materializations have taken place.
2766 LaneBitmask PrevMask = RegionLiveIns[Reg];
2767 RegionLiveIns.erase(Reg);
2768 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
2769 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
2770 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
2771 else
2772 RecomputeRP.insert(I);
2773 }
2774 // RP in the region from which the instruction was rematerialized may or may
2775 // not decrease.
2776 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
2777 RecomputeRP.insert(DefRegion);
2778
2779 // Recompute live interval to reflect the register's rematerialization.
2780 Register RematReg = Remat.RematMI->getOperand(0).getReg();
2781 DAG.LIS->removeInterval(RematReg);
2782 DAG.LIS->createAndComputeVirtRegInterval(RematReg);
2783 }
2784
2785 // All regions impacted by at least one rematerialization must be rescheduled.
2786 // Maximum pressure must also be recomputed for all regions where it changed
2787 // non-predictably and checked against the target occupancy.
2788 unsigned DynamicVGPRBlockSize =
2789 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
2790 AchievedOcc = MFI.getMaxWavesPerEU();
2791 for (auto &[I, OriginalRP] : ImpactedRegions) {
2792 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
2793 RescheduleRegions[I] = !IsEmptyRegion;
2794 if (!RecomputeRP.contains(I))
2795 continue;
2796
2797 GCNRegPressure RP;
2798 if (IsEmptyRegion) {
2799 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2800 } else {
2801 GCNDownwardRPTracker RPT(*DAG.LIS);
2802 auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
2803 DAG.Regions[I].second);
2804 if (NonDbgMI == DAG.Regions[I].second) {
2805 // Region is non-empty but contains only debug instructions.
2806 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2807 } else {
2808 RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
2809 RPT.advance(DAG.Regions[I].second);
2810 RP = RPT.moveMaxPressure();
2811 }
2812 }
2813 DAG.Pressure[I] = RP;
2814 AchievedOcc =
2815 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
2816 }
2817 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2818}
2819
2820// Copied from MachineLICM
2821bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
2822 if (!DAG.TII->isReMaterializable(MI))
2823 return false;
2824
2825 for (const MachineOperand &MO : MI.all_uses()) {
2826 // We can't remat physreg uses, unless it is a constant or an ignorable
2827 // use (e.g. implicit exec use on VALU instructions)
2828 if (MO.getReg().isPhysical()) {
2829 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2830 continue;
2831 return false;
2832 }
2833 }
2834
2835 return true;
2836}
2837
2838void PreRARematStage::finalizeGCNSchedStage() {
2839 // We consider that reducing spilling is always beneficial so we never
2840 // rollback rematerializations in such cases. It's also possible that
2841 // rescheduling lowers occupancy over the one achieved just through remats, in
2842 // which case we do not want to rollback either (the rescheduling was already
2843 // reverted in PreRARematStage::shouldRevertScheduling in such cases).
2844 unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2845 if (!TargetOcc || MaxOcc >= *TargetOcc)
2846 return;
2847
2848 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2849 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2850
2851 // Rollback the rematerializations.
2852 for (const auto &[DefMI, Remat] : Rematerializations) {
2853 MachineInstr &RematMI = *Remat.RematMI;
2854 unsigned DefRegion = MIRegion.at(DefMI);
2855 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2856 MachineBasicBlock *MBB = RegionBB[DefRegion];
2857 Register Reg = RematMI.getOperand(0).getReg();
2858
2859 // Re-rematerialize MI at the end of its original region. Note that it may
2860 // not be rematerialized exactly in the same position as originally within
2861 // the region, but it should not matter much.
2862 TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI);
2863 MachineInstr *NewMI = &*std::prev(InsertPos);
2864 DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2865
2866 auto UseRegion = MIRegion.find(Remat.UseMI);
2867 if (UseRegion != MIRegion.end()) {
2868 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2869 nullptr);
2870 }
2871 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2872
2873 // Erase rematerialized MI.
2874 DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2875 RematMI.eraseFromParent();
2876
2877 // Recompute live interval for the re-rematerialized register
2878 DAG.LIS->removeInterval(Reg);
2879 DAG.LIS->createAndComputeVirtRegInterval(Reg);
2880
2881 // Re-add the register as a live-in in all regions it used to be one in.
2882 for (unsigned LIRegion : Remat.LiveInRegions)
2883 DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2884 }
2885
2886 // Reset RP in all impacted regions.
2887 for (auto &[I, OriginalRP] : ImpactedRegions)
2888 DAG.Pressure[I] = OriginalRP;
2889
2890 GCNSchedStage::finalizeGCNSchedStage();
2891}
2892
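// Updates a region's [begin, end) boundaries when MI is removed (NewMI is
// null) or when NewMI is inserted at MI's position; only changes at the
// region's first instruction matter since the upper bound is exclusive.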
2893void GCNScheduleDAGMILive::updateRegionBoundaries(
2894 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2895 MachineInstr *NewMI) {
2896 assert((!NewMI || NewMI != RegionBounds.second) &&
2897 "cannot remove at region end");
2898
2899 if (RegionBounds.first == RegionBounds.second) {
2900 assert(NewMI && "cannot remove from an empty region");
2901 RegionBounds.first = NewMI;
2902 return;
2903 }
2904
2905 // We only care for modifications at the beginning of a non-empty region since
2906 // the upper region boundary is exclusive.
2907 if (MI != RegionBounds.first)
2908 return;
2909 if (!NewMI)
2910 RegionBounds.first = std::next(MI); // Removal
2911 else
2912 RegionBounds.first = NewMI; // Insertion
2913}
2914
2915static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2916 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2917 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2918 return SII->isIGLPMutationOnly(MI->getOpcode());
2919 });
2920}
2921
2922GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2923 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2924 bool RemoveKillFlags)
2925 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2926
2927void GCNPostScheduleDAGMILive::schedule() {
2928 HasIGLPInstrs = hasIGLPInstrs(this);
2929 if (HasIGLPInstrs) {
2930 SavedMutations.clear();
2931 SavedMutations.swap(Mutations);
2932 addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2933 }
2934
2935 ScheduleDAGMI::schedule();
2936}
2937
2938void GCNPostScheduleDAGMILive::finalizeSchedule() {
2939 if (HasIGLPInstrs)
2940 SavedMutations.swap(Mutations);
2941
2942 ScheduleDAGMI::finalizeSchedule();
2943}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static SUnit * pickOnlyChoice(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(true))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Definition DenseMap.h:74
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
GCNRegPressure getPressure() const
DenseMap< unsigned, LaneBitmask > LiveRegSet
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getBlockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
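A short sketch of the operand-scanning pattern behind the liveness queries in this file; MI is an assumed MachineInstr reference.

    for (const MachineOperand &MO : MI.operands()) {
      if (!MO.isReg() || !MO.readsReg())
        continue;                 // skip non-register and def-only operands
      Register Reg = MO.getReg();
      if (!Reg.isVirtual())
        continue;                 // pressure tracking is about virtual registers
      // ...record the use of Reg (and MO.getSubReg(), if any) in a LiveRegSet...
    }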
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
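A sketch of how these two queries are typically paired with the scratch Pressure/MaxPressure vectors listed above; SU is the candidate node being evaluated, IsBottomUp the current direction, and the AMDGPU pressure-set indices are an assumption of this sketch.

    Pressure.clear();
    MaxPressure.clear();
    if (IsBottomUp)
      RPTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
    else
      RPTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
    // Read back the pressure sets of interest for SGPRs and VGPRs.
    unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];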
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
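A hedged sketch of the collect-then-adjust sequence the pressure trackers rely on; MI, TRI, MRI and LIS stand for the instruction, TargetRegisterInfo, MachineRegisterInfo and LiveIntervals objects available inside the scheduler.

    RegisterOperands RegOpers;
    RegOpers.collect(MI, *TRI, MRI, /*TrackLaneMasks=*/true, /*IgnoreDead=*/false);
    // Refine lane masks and dead flags using the liveness at MI's register slot.
    RegOpers.adjustLaneLiveness(LIS, MRI, LIS.getInstructionIndex(MI).getRegSlot());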
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
bool isIGLPMutationOnly(unsigned Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
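A simplified sketch, in the spirit of computeSUnitReadyCycle above, of estimating when a node becomes ready from its predecessors; it uses the static SUnit::Latency field rather than querying the TargetSchedModel, and ReadyCycles/CurrCycle match that helper's parameters.

    unsigned ReadyCycle = CurrCycle;
    for (const SDep &Dep : SU.Preds) {
      const SUnit *Pred = Dep.getSUnit();
      if (!Pred->isInstr())
        continue;                            // only machine-instruction predecessors
      auto It = ReadyCycles.find(Pred->NodeNum);
      if (It != ReadyCycles.end())
        ReadyCycle = std::max(ReadyCycle, It->second + Pred->Latency);
    }
    ReadyCycles[SU.NodeNum] = ReadyCycle;    // remember for this node's successors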
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
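A sketch of the usual shape of a pickNode-style helper consulting its boundary before running any heuristics; Zone is the active SchedBoundary.

    if (SUnit *OnlyChoice = Zone.pickOnlyChoice())
      return OnlyChoice;            // a single hazard-free option, no heuristics needed
    for (SUnit *SU : Zone.Available) {
      if (Zone.checkHazard(SU))
        continue;                   // issuing SU now would stall the current group
      // ...evaluate SU with initCandidate() and tryCandidate()/tryPendingCandidate()...
    }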
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
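A small sketch of how slot indexes order instructions; DefMI and UseMI are hypothetical MachineInstr references and LIS a LiveIntervals reference.

    SlotIndex DefIdx = LIS.getInstructionIndex(DefMI);
    SlotIndex UseIdx = LIS.getInstructionIndex(UseMI);
    if (SlotIndex::isSameInstr(DefIdx, UseIdx)) {
      // def and use belong to the same instruction
    } else if (SlotIndex::isEarlierInstr(DefIdx, UseIdx)) {
      // the def comes strictly before the use in program order
    }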
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READONLY int getMFMASrcCVDstAGPROp(uint16_t Opcode)
unsigned getDynamicVGPRBlockSize(const Function &F)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
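A few worked values for alignDown, plus a hedged usage line (NumVGPRs and Granule are hypothetical variables):

    // alignDown(17, 8)    == 16   // largest multiple of 8 that is <= 17
    // alignDown(24, 8)    == 24   // already-aligned values are unchanged
    // alignDown(17, 8, 3) == 11   // largest value <= 17 that is 3 mod 8
    unsigned Rounded = alignDown(NumVGPRs, Granule);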
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries, i.e. its RegionBegin and RegionEnd iterators.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster IDs are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1915
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
Creates a map MachineInstr -> LiveRegSet. R - range of iterators on instructions; After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
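A sketch of seeding a pressure estimate from liveness at a program point, combining this helper with the getRegPressure entry above; MI is the instruction of interest.

    GCNRPTracker::LiveRegSet LiveBefore = getLiveRegsBefore(MI, LIS);
    GCNRegPressure RP = getRegPressure(MRI, LiveBefore);
    // RP.getSGPRNum(), RP.getArchVGPRNum() and RP.getAGPRNum() then feed the
    // occupancy calculation for the region.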
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
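A hedged sketch of how a tryCandidate-style override chains these helpers; each helper returns true once it has decided the order, setting TryCand.Reason to NoCand when the incumbent candidate should be kept. MyStrategy is a hypothetical GenericScheduler subclass.

    bool MyStrategy::tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
                                  SchedBoundary *Zone) const {
      if (!Cand.isValid()) {
        TryCand.Reason = NodeOrder;          // the first candidate always wins
        return true;
      }
      // Prefer shortening physical register live ranges.
      if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                     biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
        return TryCand.Reason != NoCand;
      // Then prefer the candidate that helps latency within the active zone.
      if (Zone && tryLatency(TryCand, Cand, *Zone))
        return TryCand.Reason != NoCand;
      return false;                          // keep Cand, in original order
    }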
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...