//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and to check
// where an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
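// For illustration only (an added sketch, not part of the original header):
// with a three-stage schedule S0/S1/S2 and iterations i = 0..n, the emitted
// code has the shape
//
//   prolog:  S0(0)
//            S0(1) S1(0)
//   kernel:  S0(i) S1(i-1) S2(i-2)   ; one kernel iteration every II cycles
//   epilog:  S1(n) S2(n-1)
//                  S2(n)
//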
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <deque>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "pipeliner"

STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");

/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument to limit the minimum initiation interval for
/// pipelining.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."),
                              cl::Hidden, cl::init(27));

/// A command line argument to force the pipeliner to use the specified
/// initiation interval.
static cl::opt<int> SwpForceII("pipeliner-force-ii",
                               cl::desc("Force pipeliner to use specified II."),
                               cl::Hidden, cl::init(-1));

/// A command line argument to limit the number of stages in the pipeline.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated schedule."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

#ifndef NDEBUG
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
#endif

static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                     cl::ReallyHidden,
                                     cl::desc("Ignore RecMII"));

static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
                                    cl::init(false));
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
                                      cl::init(false));

static cl::opt<bool> EmitTestAnnotations(
    "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
    cl::desc("Instead of emitting the pipelined code, annotate instructions "
             "with the generated schedule for feeding into the "
             "-modulo-schedule-test pass"));

static cl::opt<bool> ExperimentalCodeGen(
    "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
    cl::desc(
        "Use the experimental peeling code generator for software pipelining"));

static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
                                     cl::desc("Range to search for II"),
                                     cl::Hidden, cl::init(10));

static cl::opt<bool>
    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
                     cl::desc("Limit register pressure of scheduled loop"));

static cl::opt<int>
    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
                      cl::init(5),
                      cl::desc("Margin representing the unused percentage of "
                               "the register pressure limit"));

static cl::opt<bool>
    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
               cl::desc("Use the MVE code generator for software pipelining"));

/// A command line argument to limit the number of store instructions in the
/// target basic block.
static cl::opt<unsigned> SwpMaxNumStores(
    "pipeliner-max-num-stores",
    cl::desc("Maximum number of stores allowed in the target loop."),
    cl::Hidden, cl::init(200));

namespace llvm {

// A command line option to enable the CopyToPhi DAG mutation.
cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
                                 cl::init(true),
                                 cl::desc("Enable CopyToPhi DAG Mutation"));

/// A command line argument to force the pipeliner to use the specified issue
/// width.
cl::opt<int> SwpForceIssueWidth(
    "pipeliner-force-issue-width",
    cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
    cl::init(-1));

/// A command line argument to set the window scheduling option.
cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
    "window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On),
    cl::desc("Set how to use window scheduling algorithm."),
    cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
                          "Turn off window algorithm."),
               clEnumValN(WindowSchedulingFlag::WS_On, "on",
                          "Use window algorithm after SMS algorithm fails."),
               clEnumValN(WindowSchedulingFlag::WS_Force, "force",
                          "Use window algorithm instead of SMS algorithm.")));

} // end namespace llvm

unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
int MachinePipeliner::NumTries = 0;
#endif
char &llvm::MachinePipelinerID = MachinePipeliner::ID;

INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
                      "Modulo Software Pipelining", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                    "Modulo Software Pipelining", false, false)

namespace {

/// This class holds an SUnit corresponding to a memory operation and other
/// information related to the instruction.
struct SUnitWithMemInfo {
  SUnit *SU;
  SmallVector<const Value *, 2> UnderlyingObjs;

  /// The value of a memory operand.
  const Value *MemOpValue = nullptr;

  /// The offset of a memory operand.
  int64_t MemOpOffset = 0;

  AAMDNodes AATags;

  /// True if all the underlying objects are identified.
  bool IsAllIdentified = false;

  SUnitWithMemInfo(SUnit *SU);

  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;

  bool isUnknown() const { return MemOpValue == nullptr; }

private:
  bool getUnderlyingObjects();
};

/// Add loop-carried chain dependencies. This class handles the same type of
/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
/// account dependencies across iterations.
class LoopCarriedOrderDepsTracker {
  // Type of instruction that is relevant to order-dependencies
  enum class InstrTag {
    Barrier = 0,      ///< A barrier event instruction.
    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
                      ///< not a barrier event.
    FPExceptions = 2, ///< An instruction that does not match above, but may
                      ///< raise floating-point exceptions.
  };

  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
    TaggedSUnit(SUnit *SU, InstrTag Tag)
        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}

    InstrTag getTag() const { return InstrTag(getInt()); }
  };

  /// Holds loads and stores with memory related information.
  struct LoadStoreChunk {
    SmallVector<SUnitWithMemInfo, 4> Loads;
    SmallVector<SUnitWithMemInfo, 4> Stores;

    void append(SUnit *SU);
  };

  SwingSchedulerDAG *DAG;
  BatchAAResults *BAA;
  std::vector<SUnit> &SUnits;

  /// The size of SUnits, for convenience.
  const unsigned N;

  /// Loop-carried Edges.
  std::vector<BitVector> LoopCarried;

  /// Instructions related to chain dependencies. They are one of the
  /// following:
  ///
  ///  1. Barrier event.
  ///  2. Load, but neither a barrier event, invariant load, nor may load trap
  ///     value.
  ///  3. Store, but not a barrier event.
  ///  4. None of them, but may raise floating-point exceptions.
  ///
  /// This is used when analyzing loop-carried dependencies that access global
  /// barrier instructions.
  std::vector<TaggedSUnit> TaggedSUnits;

  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

public:
  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA,
                              const TargetInstrInfo *TII,
                              const TargetRegisterInfo *TRI);

  /// The main function to compute loop-carried order-dependencies.
  void computeDependencies();

  const BitVector &getLoopCarried(unsigned Idx) const {
    return LoopCarried[Idx];
  }

private:
  /// Returns the tag for \p SU if the instruction may affect
  /// order-dependencies.
  std::optional<InstrTag> getInstrTag(SUnit *SU) const;

  void addLoopCarriedDependenciesForChunks(const LoadStoreChunk &From,
                                           const LoadStoreChunk &To);

  /// Add a loop-carried order dependency between \p Src and \p Dst if we
  /// cannot prove they are independent. When \p PerformCheapCheck is true, a
  /// lightweight dependency test (referred to as "cheap check" below) is
  /// performed first. Note that the cheap check is retained to maintain the
  /// existing behavior and is not expected to be used anymore.
  ///
  /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
  void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
                                 const SUnitWithMemInfo &Dst,
                                 bool PerformCheapCheck = false);

  void computeDependenciesAux();
};

} // end anonymous namespace

/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
  if (skipFunction(mf.getFunction()))
    return false;

  if (!EnableSWP)
    return false;

  if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
      !EnableSWPOptSize.getPosition())
    return false;

  if (!mf.getSubtarget().enableMachinePipeliner())
    return false;

  // Cannot pipeline loops without instruction itineraries if we are using
  // DFA for the pipeliner.
  if (mf.getSubtarget().useDFAforSMS() &&
      (!mf.getSubtarget().getInstrItineraryData() ||
       mf.getSubtarget().getInstrItineraryData()->isEmpty()))
    return false;

  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
  TII = MF->getSubtarget().getInstrInfo();
  RegClassInfo.runOnMachineFunction(*MF);

  for (const auto &L : *MLI)
    scheduleLoop(*L);

  return false;
}

/// Attempt to perform the SMS algorithm on the specified loop. This function
/// is the main entry point for the algorithm. The function identifies
/// candidate loops, calculates the minimum initiation interval, and attempts
/// to schedule the loop.
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
  bool Changed = false;
  for (const auto &InnerLoop : L)
    Changed |= scheduleLoop(*InnerLoop);

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = SwpLoopLimit;
  if (Limit >= 0) {
    if (NumTries >= SwpLoopLimit)
      return Changed;
    NumTries++;
  }
#endif

  setPragmaPipelineOptions(L);
  if (!canPipelineLoop(L)) {
    LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
    ORE->emit([&]() {
      return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
                                             L.getStartLoc(), L.getHeader())
             << "Failed to pipeline loop";
    });

    LI.LoopPipelinerInfo.reset();
    return Changed;
  }

  ++NumTrytoPipeline;
  if (useSwingModuloScheduler())
    Changed = swingModuloScheduler(L);

  if (useWindowScheduler(Changed))
    Changed = runWindowScheduler(L);

  LI.LoopPipelinerInfo.reset();
  return Changed;
}

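/// Read pipeliner-related pragma hints from the loop metadata. As an added
/// illustrative sketch (not part of the original source), the IR that the
/// string comparisons below expect looks roughly like:
/// \code
///   br i1 %cond, label %header, label %exit, !llvm.loop !0
///   !0 = distinct !{!0, !1, !2}
///   !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 10}
///   !2 = !{!"llvm.loop.pipeline.disable"}
/// \endcode
/// where either hint may appear on its own.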
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    MDNode *MD = dyn_cast<MDNode>(MDO);

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two "
             "operands.");
      II_setByPragma =
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 &&
             "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}

/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  if (L.getNumBlocks() != 1) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Not a single basic block: "
             << ore::NV("NumBlocks", L.getNumBlocks());
    });
    return false;
  }

  if (disabledByPragma) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Disabled by Pragma.";
    });
    return false;
  }

  // If the loop branch cannot be analyzed, we cannot pipeline the loop.
  LI.TBB = nullptr;
  LI.FBB = nullptr;
  LI.BrCond.clear();
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
    NumFailBranch++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The branch can't be understood";
    });
    return false;
  }

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
  if (!LI.LoopPipelinerInfo) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
    NumFailLoop++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The loop structure is not supported";
    });
    return false;
  }

  if (!L.getLoopPreheader()) {
    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
    NumFailPreheader++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "No loop preheader found";
    });
    return false;
  }

  unsigned NumStores = 0;
  for (MachineInstr &MI : *L.getHeader())
    if (MI.mayStore())
      ++NumStores;
  if (NumStores > SwpMaxNumStores) {
    LLVM_DEBUG(dbgs() << "Too many stores\n");
    NumFailTooManyStores++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Too many store instructions in the loop: "
             << ore::NV("NumStores", NumStores) << " > "
             << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
    });
    return false;
  }

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
  return true;
}

void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots =
      *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i + 1).getMBB();
      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                      .addReg(RegOp.getReg(), getRegState(RegOp),
                              RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}

/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to schedule the loop.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  SwingSchedulerDAG SMS(
      *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
                                   E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}

void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addRequired<LiveIntervalsWrapperPass>();
  AU.addRequired<MachineOptimizationRemarkEmitterPass>();
  AU.addRequired<TargetPassConfig>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
  MachineSchedContext Context;
  Context.MF = MF;
  Context.MLI = MLI;
  Context.MDT = MDT;
  Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  Context.RegClassInfo->runOnMachineFunction(*MF);
  WindowScheduler WS(&Context, L);
  return WS.run();
}

bool MachinePipeliner::useSwingModuloScheduler() {
  // SwingModuloScheduler does not work when WindowScheduler is forced.
  return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
}

bool MachinePipeliner::useWindowScheduler(bool Changed) {
  // WindowScheduler does not work in the following cases:
  // 1. when it is off.
  // 2. when SwingModuloScheduler has successfully scheduled the loop.
  // 3. when a pragma-specified II is enabled.
  if (II_setByPragma) {
    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
                         "llvm.loop.pipeline.initiationinterval is set.\n");
    return false;
  }

  return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
         (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
}

void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
  if (SwpForceII > 0)
    MII = SwpForceII;
  else if (II_setByPragma > 0)
    MII = II_setByPragma;
  else
    MII = std::max(ResMII, RecMII);
}

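/// Set the upper bound for the candidate II search. A brief worked example
/// (added for illustration): with the defaults above, MII = 4 and
/// -pipeliner-ii-search-range left at 10 give MAX_II = 14, so scheduling is
/// attempted for II = 4, 5, ..., 14 unless an II is forced or set by pragma.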
void SwingSchedulerDAG::setMAX_II() {
  if (SwpForceII > 0)
    MAX_II = SwpForceII;
  else if (II_setByPragma > 0)
    MAX_II = II_setByPragma;
  else
    MAX_II = MII + SwpIISearchRange;
}

/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
void SwingSchedulerDAG::schedule() {
  buildSchedGraph(AA);
  const LoopCarriedEdges LCE = addLoopCarriedDependences();
  updatePhiDependences();
  Topo.InitDAGTopologicalSorting();
  changeDependences();
  postProcessDAG();
  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
  LLVM_DEBUG({
    dump();
    dbgs() << "===== Loop Carried Edges Begin =====\n";
    for (SUnit &SU : SUnits)
      LCE.dump(&SU, TRI, &MRI);
    dbgs() << "===== Loop Carried Edges End =====\n";
  });

  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;

  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  fuseRecs(NodeSets);

  // This flag is used for testing and can cause correctness problems.
  if (SwpIgnoreRecMII)
    RecMII = 0;

  setMII(ResMII, RecMII);
  setMAX_II();

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.
  if (MII == 0) {
    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
    NumFailZeroMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Invalid Minimal Initiation Interval: 0";
    });
    return;
  }

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                      << ", we don't pipeline large loops\n");
    NumFailLargeMaxMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Minimal Initiation Interval too large: "
             << ore::NV("MII", (int)MII) << " > "
             << ore::NV("SwpMaxMii", SwpMaxMii)
             << ". Refer to -pipeliner-max-mii.";
    });
    return;
  }

  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  Rec NodeSet ";
      I.dump();
    }
  });

  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  NodeSet ";
      I.dump();
    }
  });

  computeNodeOrder(NodeSets);

  // Check for node order issues.
  checkValidNodeOrder(Circuits);

  SMSchedule Schedule(Pass.MF, this);
  Scheduled = schedulePipeline(Schedule);

  if (!Scheduled) {
    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
    NumFailNoSchedule++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Unable to find schedule";
    });
    return;
  }

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate pipeline if there are no overlapped iterations.
  if (numStages == 0) {
    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
    NumFailZeroStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "No need to pipeline - no overlapped iterations in schedule.";
    });
    return;
  }
  // Check that the maximum stage count is less than the user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                      << " : too many stages, abort\n");
    NumFailLargeMaxStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Too many stages in schedule: "
             << ore::NV("numStages", (int)numStages) << " > "
             << ore::NV("SwpMaxStages", SwpMaxStages)
             << ". Refer to -pipeliner-max-stages.";
    });
    return;
  }

  Pass.ORE->emit([&]() {
    return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
                                     Loop.getHeader())
           << "Pipelined successfully!";
  });

  // Generate the schedule as a ModuloSchedule.
  DenseMap<MachineInstr *, int> Cycles, Stages;
  std::vector<MachineInstr *> OrderedInsts;
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
      OrderedInsts.push_back(SU->getInstr());
      Cycles[SU->getInstr()] = Cycle;
      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
    }
  }
  DenseMap<MachineInstr *, std::pair<Register, int64_t>> NewInstrChanges;
  for (auto &KV : NewMIs) {
    Cycles[KV.first] = Cycles[KV.second];
    Stages[KV.first] = Stages[KV.second];
    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
  }

  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                    std::move(Stages));
  if (EmitTestAnnotations) {
    assert(NewInstrChanges.empty() &&
           "Cannot serialize a schedule with InstrChanges!");
    ModuloScheduleTestAnnotater MSTI(MF, MS);
    MSTI.annotate();
    return;
  }
  // The experimental code generator can't work if there are InstrChanges.
  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
    MSE.expand();
  } else if (MVECodeGen && NewInstrChanges.empty() &&
             LoopPipelinerInfo->isMVEExpanderSupported() &&
             ModuloScheduleExpanderMVE::canApply(Loop)) {
    ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
    MSE.expand();
  } else {
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
    MSE.expand();
    MSE.cleanup();
  }
  ++NumPipelined;
}

/// Clean up after the software pipeliner runs.
void SwingSchedulerDAG::finishBlock() {
  for (auto &KV : NewMIs)
    MF.deleteMachineInstr(KV.second);
  NewMIs.clear();

  // Call the superclass.
  ScheduleDAGInstrs::finishBlock();
}

/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
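/// As an added illustrative example, for the machine PHI
/// \code
///   %v = PHI %init, %bb.preheader, %next, %bb.loop
/// \endcode
/// InitVal is set to %init and LoopVal to %next (the register and block names
/// are hypothetical).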
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
                       Register &InitVal, Register &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = Register();
  LoopVal = Register();
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal && LoopVal && "Unexpected Phi structure.");
}

/// Return the Phi register value that comes from the loop block.
static Register getLoopPhiReg(const MachineInstr &Phi,
                              const MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
      return Phi.getOperand(i).getReg();
  return Register();
}

/// Return true if SUb can be reached from SUa following the chain edges.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  SmallPtrSet<const SUnit *, 8> Visited;
  SmallVector<const SUnit *, 8> Worklist;
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}

SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
  if (!getUnderlyingObjects())
    return;
  for (const Value *Obj : UnderlyingObjs)
    if (!isIdentifiedObject(Obj)) {
      IsAllIdentified = false;
      break;
    }
}

bool SUnitWithMemInfo::isTriviallyDisjoint(
    const SUnitWithMemInfo &Other) const {
  // If all underlying objects are identified objects and there is no overlap
  // between them, then these two instructions are disjoint.
  if (!IsAllIdentified || !Other.IsAllIdentified)
    return false;
  for (const Value *Obj : UnderlyingObjs)
    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
      return false;
  return true;
}

/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->hasOneMemOperand())
    return false;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return false;
  MemOpValue = MM->getValue();
  MemOpOffset = MM->getOffset();
  llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);

  // TODO: A no alias scope may be valid only in a single iteration. In this
  // case we need to peel it off like LoopAccessAnalysis does.
  AATags = MM->getAAInfo();
  return true;
}

/// Returns true if there is a loop-carried order dependency from \p Src to
/// \p Dst.
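/// A canonical illustrative case (added example): in
/// \code
///   for (int i = 0; i < n; i++)
///     a[i + 1] = a[i] + 1;
/// \endcode
/// the store to a[i + 1] may alias the load of a[i] performed by a later
/// iteration, so a loop-carried order dependency must be assumed between the
/// two memory instructions.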
static bool
hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
                     BatchAAResults &BAA, const TargetInstrInfo *TII,
                     const TargetRegisterInfo *TRI,
                     const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
  if (Src.isTriviallyDisjoint(Dst))
    return false;
  if (isSuccOrder(Src.SU, Dst.SU))
    return false;

  MachineInstr &SrcMI = *Src.SU->getInstr();
  MachineInstr &DstMI = *Dst.SU->getInstr();
  if (PerformCheapCheck) {
    // First, perform the cheaper check that compares the base register.
    // If they are the same and the load offset is less than the store
    // offset, then mark the dependence as potentially loop carried.
    //
    // TODO: This check will be removed.
    const MachineOperand *BaseOp1, *BaseOp2;
    int64_t Offset1, Offset2;
    bool Offset1IsScalable, Offset2IsScalable;
    if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1,
                                     Offset1IsScalable, TRI) &&
        TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2,
                                     Offset2IsScalable, TRI)) {
      if (BaseOp1->isIdenticalTo(*BaseOp2) &&
          Offset1IsScalable == Offset2IsScalable &&
          (int)Offset1 < (int)Offset2) {
        assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
               "What happened to the chain edge?");
        return true;
      }
    }
  }

  if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
    return false;

  // Second, the more expensive check that uses alias analysis on the
  // base registers. If they alias, and the load offset is less than
  // the store offset, then mark the dependence as loop carried.
  if (Src.isUnknown() || Dst.isUnknown())
    return true;
  if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
    return true;

  if (BAA.isNoAlias(
          MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
          MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
    return false;

  // AliasAnalysis sometimes gives up on following the underlying
  // object. In such a case, separate checks for underlying objects may
  // prove that there are no aliases between two accesses.
  for (const Value *SrcObj : Src.UnderlyingObjs)
    for (const Value *DstObj : Dst.UnderlyingObjs)
      if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
                         MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
        return true;

  return false;
}

void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->mayLoadOrStore())
    return;
  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
}

LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
    SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI)
    : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
      LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}

void LoopCarriedOrderDepsTracker::computeDependencies() {
  // Traverse all instructions and extract only what we are targeting.
  for (auto &SU : SUnits) {
    auto Tagged = getInstrTag(&SU);

    // This instruction has no loop-carried order-dependencies.
    if (!Tagged)
      continue;
    TaggedSUnits.emplace_back(&SU, *Tagged);
  }

  computeDependenciesAux();
}

std::optional<LoopCarriedOrderDepsTracker::InstrTag>
LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();
  if (TII->isGlobalMemoryObject(MI))
    return InstrTag::Barrier;

  if (MI->mayStore() ||
      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
    return InstrTag::LoadOrStore;

  if (MI->mayRaiseFPException())
    return InstrTag::FPExceptions;

  return std::nullopt;
}

void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
    const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
    bool PerformCheapCheck) {
  // Avoid self-dependencies.
  if (Src.SU == Dst.SU)
    return;

  if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
    LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
}

void LoopCarriedOrderDepsTracker::addLoopCarriedDependenciesForChunks(
    const LoadStoreChunk &From, const LoadStoreChunk &To) {
  // Add load-to-store dependencies (WAR).
  for (const SUnitWithMemInfo &Src : From.Loads)
    for (const SUnitWithMemInfo &Dst : To.Stores)
      // Perform a cheap check first if this is a forward dependency.
      addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);

  // Add store-to-load dependencies (RAW).
  for (const SUnitWithMemInfo &Src : From.Stores)
    for (const SUnitWithMemInfo &Dst : To.Loads)
      addDependenciesBetweenSUs(Src, Dst);

  // Add store-to-store dependencies (WAW).
  for (const SUnitWithMemInfo &Src : From.Stores)
    for (const SUnitWithMemInfo &Dst : To.Stores)
      addDependenciesBetweenSUs(Src, Dst);
}

void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  SmallVector<LoadStoreChunk, 2> Chunks(1);
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
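  // Added illustration: if the loop body contains
  //   load A; store B; BARRIER; store C
  // then {A, B} and {C} form separate chunks, and loop-carried edges are only
  // considered within each chunk; cross-chunk loop-carried edges are not
  // added here (see the TODO below).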
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDependenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, also add dependencies
  // from the last barrier instruction (or load/store below it) to the first
  // barrier instruction (or load/store above it).
}

/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
/// TODO: Also compute output-dependencies.
LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
  LoopCarriedEdges LCE;

  // Add loop-carried order-dependencies.
  LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
  LCODTracker.computeDependencies();
  for (unsigned I = 0; I != SUnits.size(); I++)
    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);

  LCE.modifySUnits(SUnits, TII);
  return LCE;
}

/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
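/// Added illustrative sketch: given the single-block loop
/// \code
///   %p = PHI %init, %bb.preheader, %add, %bb.body
///   %add = ADD %p, 1
/// \endcode
/// a data edge from the PHI to %add is created, and the loop-carried
/// dependence from %add back to the PHI is modeled as an anti edge
/// (the names above are hypothetical).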
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Set to the register if the instruction has an operand defined by a Phi.
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
        for (MachineRegisterInfo::use_instr_iterator
                 UI = MRI.use_instr_begin(Reg),
                 UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}

/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
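/// Added illustrative sketch: on a target with post-increment loads, given
/// \code
///   %b1 = LOAD_POSTINC %b0   ; loads *(%b0) and defines %b1 = %b0 + 4
///   %v  = LOAD %b1
/// \endcode
/// the second load can instead use %b0 with an adjusted offset, so its
/// dependence moves to the previous iteration's increment (the opcodes and
/// names above are hypothetical).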
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    SmallVector<SDep, 4> Deps;
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}

/// Create an instruction stream that represents a single iteration and stage
/// of each instruction. This function differs from
/// SMSchedule::finalizeSchedule in that it has no side effects on the
/// SwingSchedulerDAG. That is, this function is an approximation of
/// SMSchedule::finalizeSchedule with all non-const operations removed.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,
                                  DenseMap<MachineInstr *, unsigned> &Stages) {
  DenseMap<int, std::deque<SUnit *>> Instrs;

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}

namespace {

// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
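  // Added example: an instruction that can only issue on one specific unit
  // gets minFuncUnits = 1 and sorts before an instruction that can issue on
  // any of four units (minFuncUnits = 4), so the most constrained
  // instructions are placed first.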
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};

/// Calculate the maximum register pressure of the scheduled instruction
/// stream.
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  const unsigned PSetNum;

  // Indexed by PSet ID.
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it is
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID.
  // Upper limit for each register pressure set.
  std::vector<unsigned> PressureSetLimit;

  DenseMap<MachineInstr *, RegisterOperands> ROMap;

  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal to the weight");
      P -= Weight;
    }
  }

  // Return true if Reg is a reserved register, for example, the stack pointer.
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // the following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    DenseSet<Register> Used;
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore a variable that appears on only one side of a phi instruction
        // because it's used only in the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set.
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }

  // There are two patterns of last-use.
  // - by an instruction of the current iteration
  // - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, the following two groups of instructions are executed
  // simultaneously:
  // - next iteration's phi instructions in the i-th stage
  // - current iteration's instructions in the (i+1)-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
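  // Added example: because stages overlap under the II, the stage-0 PHIs of
  // iteration i+1 execute together with the stage-1 instructions of
  // iteration i; this is why the scoring below (Stages[MI] + MI->isPHI())
  // counts a use by a next-iteration PHI as one stage later.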
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // The following virtual registers will be ignored:
    // - live-in ones
    // - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    DenseMap<Register, MachineInstr *> LastUseMI;
    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
        if (!Inserted) {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          if (InstrScore(Orig) < InstrScore(New))
            Ite->second = New;
        }
      }
    }

    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlap.
  //
  // An example of an unrolled loop where #Stage is 4:
  // Iter     i+0 i+1 i+2 i+3
  // --------------------------
  // Stage 0
  // Stage 1   0
  // Stage 2   1   0
  // Stage 3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately,
    // we manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit.
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
    assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
           "the percentage of the margin must be between 0 and 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1802
1803} // end anonymous namespace
1804
1805/// Calculate the resource constrained minimum initiation interval for the
1806/// specified loop. We use the DFA to model the resources needed for
1807/// each instruction, and we ignore dependences. A different DFA is created
1808/// for each cycle that is required. When adding a new instruction, we attempt
1809/// to add it to each existing DFA, until a legal space is found. If the
1810/// instruction cannot be reserved in an existing DFA, we create a new one.
1811unsigned SwingSchedulerDAG::calculateResMII() {
1812 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1813 ResourceManager RM(&MF.getSubtarget(), this);
1814 return RM.calculateResMII();
1815}
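// Sketch of the idea (hypothetical target): if the loop body has six memory
// operations and the itinerary allows at most two per cycle, no packing can
// beat ceil(6 / 2) = 3 cycles, so ResMII = 3. The DFA-based computation
// generalizes this counting argument to arbitrary resource conflicts.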
1816
 1817 /// Calculate the recurrence-constrained minimum initiation interval.
1818/// Iterate over each circuit. Compute the delay(c) and distance(c)
1819/// for each circuit. The II needs to satisfy the inequality
1820/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1821/// II that satisfies the inequality, and the RecMII is the maximum
1822/// of those values.
1823unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1824 unsigned RecMII = 0;
1825
1826 for (NodeSet &Nodes : NodeSets) {
1827 if (Nodes.empty())
1828 continue;
1829
1830 unsigned Delay = Nodes.getLatency();
1831 unsigned Distance = 1;
1832
1833 // ii = ceil(delay / distance)
1834 unsigned CurMII = (Delay + Distance - 1) / Distance;
1835 Nodes.setRecMII(CurMII);
1836 if (CurMII > RecMII)
1837 RecMII = CurMII;
1838 }
1839
1840 return RecMII;
1841}
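// Worked example (hypothetical circuit): a recurrence with total latency
// delay(c) = 7 that spans distance(c) = 2 iterations needs
// II >= ceil(7 / 2) = 4. Note that the loop above conservatively assumes
// Distance = 1 for every circuit, so CurMII reduces to the circuit latency.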
1842
1843/// Create the adjacency structure of the nodes in the graph.
1844void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1845 SwingSchedulerDAG *DAG) {
1846 BitVector Added(SUnits.size());
1847 DenseMap<int, int> OutputDeps;
1848 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1849 Added.reset();
1850 // Add any successor to the adjacency matrix and exclude duplicates.
1851 for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
1852 // Only create a back-edge on the first and last nodes of a dependence
1853 // chain. This records any chains and adds them later.
1854 if (OE.isOutputDep()) {
1855 int N = OE.getDst()->NodeNum;
1856 int BackEdge = i;
1857 auto Dep = OutputDeps.find(BackEdge);
1858 if (Dep != OutputDeps.end()) {
1859 BackEdge = Dep->second;
1860 OutputDeps.erase(Dep);
1861 }
1862 OutputDeps[N] = BackEdge;
1863 }
 1864 // Do not process a boundary node or an artificial edge.
1865 if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
1866 continue;
1867
 1868 // This code is retained to preserve previous behavior and prevent
 1869 // regression. This condition means that anti-dependencies within an
 1870 // iteration are ignored when searching circuits. Therefore it would be
 1871 // natural to consider this dependence as well.
1872 // FIXME: Remove this code if it doesn't have significant impact on
1873 // performance.
1874 if (OE.isAntiDep())
1875 continue;
1876
1877 int N = OE.getDst()->NodeNum;
1878 if (!Added.test(N)) {
1879 AdjK[i].push_back(N);
1880 Added.set(N);
1881 }
1882 }
1883 // A chain edge between a store and a load is treated as a back-edge in the
1884 // adjacency matrix.
1885 for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
1886 SUnit *Src = IE.getSrc();
1887 SUnit *Dst = IE.getDst();
1888 if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
1889 continue;
1890 if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1891 int N = Src->NodeNum;
1892 if (!Added.test(N)) {
1893 AdjK[i].push_back(N);
1894 Added.set(N);
1895 }
1896 }
1897 }
1898 }
1899 // Add back-edges in the adjacency matrix for the output dependences.
1900 for (auto &OD : OutputDeps)
1901 if (!Added.test(OD.second)) {
1902 AdjK[OD.first].push_back(OD.second);
1903 Added.set(OD.second);
1904 }
1905}
1906
1907/// Identify an elementary circuit in the dependence graph starting at the
1908/// specified node.
1909bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1910 const SwingSchedulerDAG *DAG,
1911 bool HasBackedge) {
1912 SUnit *SV = &SUnits[V];
1913 bool F = false;
1914 Stack.insert(SV);
1915 Blocked.set(V);
1916
1917 for (auto W : AdjK[V]) {
1918 if (NumPaths > MaxPaths)
1919 break;
1920 if (W < S)
1921 continue;
1922 if (W == S) {
1923 if (!HasBackedge)
1924 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
1925 F = true;
1926 ++NumPaths;
1927 break;
1928 }
1929 if (!Blocked.test(W)) {
1930 if (circuit(W, S, NodeSets, DAG,
1931 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1932 F = true;
1933 }
1934 }
1935
1936 if (F)
1937 unblock(V);
1938 else {
1939 for (auto W : AdjK[V]) {
1940 if (W < S)
1941 continue;
1942 B[W].insert(SV);
1943 }
1944 }
1945 Stack.pop_back();
1946 return F;
1947}
1948
1949/// Unblock a node in the circuit finding algorithm.
1950void SwingSchedulerDAG::Circuits::unblock(int U) {
1951 Blocked.reset(U);
1952 SmallPtrSet<SUnit *, 4> &BU = B[U];
1953 while (!BU.empty()) {
1954 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
1955 assert(SI != BU.end() && "Invalid B set.");
1956 SUnit *W = *SI;
1957 BU.erase(W);
1958 if (Blocked.test(W->NodeNum))
1959 unblock(W->NodeNum);
1960 }
1961}
1962
1963/// Identify all the elementary circuits in the dependence graph using
1964/// Johnson's circuit algorithm.
1965void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1966 Circuits Cir(SUnits, Topo);
1967 // Create the adjacency structure.
1968 Cir.createAdjacencyStructure(this);
1969 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1970 Cir.reset();
1971 Cir.circuit(I, I, NodeSets, this);
1972 }
1973}
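// Illustrative sketch: given adjacency edges 1->2, 2->3, 3->1, and 2->1,
// the algorithm reports the elementary circuits {1,2,3} and {1,2}; each
// becomes a NodeSet that later constrains the RecMII. The Blocked/B
// bookkeeping avoids re-exploring vertices that cannot currently reach the
// start node, which is what keeps Johnson's algorithm output-sensitive.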
1974
 1975 // Create artificial dependencies between the source of a COPY/REG_SEQUENCE
 1976 // that is loop-carried and the USE in the next iteration. This helps the
 1977 // pipeliner avoid additional copies that are needed across iterations. An
 1978 // artificial dependence edge is added from USE to the SOURCE of the COPY/REG_SEQUENCE.
1979
1980// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
 1981 // SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
1982// PHI-------True-Dep------> USEOfPhi
1983
1984// The mutation creates
1985// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1986
 1987 // Overall, this ensures that USEOfPHI is scheduled before SRCOfCopy (since
 1988 // USE is a predecessor), which implies that the COPY/REG_SEQUENCE is
 1989 // scheduled late enough to avoid additional copies across iterations. The
 1990 // resulting scheduling order would be
 1991 // USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
1992
1993void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1994 for (SUnit &SU : DAG->SUnits) {
1995 // Find the COPY/REG_SEQUENCE instruction.
1996 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1997 continue;
1998
 1999 // Record the loop carried PHIs.
 2000 SmallVector<SUnit *, 4> PHISUs;
 2001 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
 2002 SmallVector<SUnit *, 4> SrcSUs;
2003
2004 for (auto &Dep : SU.Preds) {
2005 SUnit *TmpSU = Dep.getSUnit();
2006 MachineInstr *TmpMI = TmpSU->getInstr();
2007 SDep::Kind DepKind = Dep.getKind();
2008 // Save the loop carried PHI.
2009 if (DepKind == SDep::Anti && TmpMI->isPHI())
2010 PHISUs.push_back(TmpSU);
2011 // Save the source of COPY/REG_SEQUENCE.
 2012 // If the source has no predecessors, we will end up creating cycles.
2013 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
2014 SrcSUs.push_back(TmpSU);
2015 }
2016
2017 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
2018 continue;
2019
2020 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2021 // SUnit to the container.
 2022 SmallVector<SUnit *, 8> UseSUs;
 2023 // Do not use an iterator-based loop here as we are updating the container.
2024 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
2025 for (auto &Dep : PHISUs[Index]->Succs) {
2026 if (Dep.getKind() != SDep::Data)
2027 continue;
2028
2029 SUnit *TmpSU = Dep.getSUnit();
2030 MachineInstr *TmpMI = TmpSU->getInstr();
2031 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
2032 PHISUs.push_back(TmpSU);
2033 continue;
2034 }
2035 UseSUs.push_back(TmpSU);
2036 }
2037 }
2038
2039 if (UseSUs.size() == 0)
2040 continue;
2041
2042 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
2043 // Add the artificial dependencies if it does not form a cycle.
2044 for (auto *I : UseSUs) {
2045 for (auto *Src : SrcSUs) {
2046 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
2047 Src->addPred(SDep(I, SDep::Artificial));
2048 SDAG->Topo.AddPred(Src, I);
2049 }
2050 }
2051 }
2052 }
2053}
2054
 2055 /// Compute several functions needed to order the nodes for scheduling.
2056/// ASAP - Earliest time to schedule a node.
2057/// ALAP - Latest time to schedule a node.
2058/// MOV - Mobility function, difference between ALAP and ASAP.
2059/// D - Depth of each node.
2060/// H - Height of each node.
2061void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2062 ScheduleInfo.resize(SUnits.size());
2063
2064 LLVM_DEBUG({
2065 for (int I : Topo) {
2066 const SUnit &SU = SUnits[I];
2067 dumpNode(SU);
2068 }
2069 });
2070
2071 int maxASAP = 0;
2072 // Compute ASAP and ZeroLatencyDepth.
2073 for (int I : Topo) {
2074 int asap = 0;
2075 int zeroLatencyDepth = 0;
2076 SUnit *SU = &SUnits[I];
2077 for (const auto &IE : DDG->getInEdges(SU)) {
2078 SUnit *Pred = IE.getSrc();
2079 if (IE.getLatency() == 0)
2080 zeroLatencyDepth =
2081 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
2082 if (IE.ignoreDependence(true))
2083 continue;
2084 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
2085 IE.getDistance() * MII));
2086 }
2087 maxASAP = std::max(maxASAP, asap);
2088 ScheduleInfo[I].ASAP = asap;
2089 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2090 }
2091
2092 // Compute ALAP, ZeroLatencyHeight, and MOV.
2093 for (int I : llvm::reverse(Topo)) {
2094 int alap = maxASAP;
2095 int zeroLatencyHeight = 0;
2096 SUnit *SU = &SUnits[I];
2097 for (const auto &OE : DDG->getOutEdges(SU)) {
2098 SUnit *Succ = OE.getDst();
2099 if (Succ->isBoundaryNode())
2100 continue;
2101 if (OE.getLatency() == 0)
2102 zeroLatencyHeight =
2103 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
2104 if (OE.ignoreDependence(true))
2105 continue;
2106 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
2107 OE.getDistance() * MII));
2108 }
2109
2110 ScheduleInfo[I].ALAP = alap;
2111 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2112 }
2113
2114 // After computing the node functions, compute the summary for each node set.
2115 for (NodeSet &I : NodeSets)
2116 I.computeNodeSetInfo(this);
2117
2118 LLVM_DEBUG({
2119 for (unsigned i = 0; i < SUnits.size(); i++) {
2120 dbgs() << "\tNode " << i << ":\n";
2121 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2122 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2123 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2124 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2125 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2126 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2127 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2128 }
2129 });
2130}
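// Worked example (hypothetical chain A -> B -> C with unit latencies):
// ASAP(A)=0, ASAP(B)=1, ASAP(C)=2, so maxASAP = 2; the reverse pass then
// yields ALAP(C)=2, ALAP(B)=1, ALAP(A)=0. Every node has mobility
// MOV = ALAP - ASAP = 0, i.e. exactly one feasible slot, whereas a node off
// the critical path would have positive mobility.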
2131
2132/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2133/// as the predecessors of the elements of NodeOrder that are not also in
2134/// NodeOrder.
 2135 static bool pred_L(SetVector<SUnit *> &NodeOrder,
 2136 SmallSetVector<SUnit *, 8> &Preds, SwingSchedulerDDG *DDG,
 2137 const NodeSet *S = nullptr) {
2138 Preds.clear();
2139
2140 for (SUnit *SU : NodeOrder) {
2141 for (const auto &IE : DDG->getInEdges(SU)) {
2142 SUnit *PredSU = IE.getSrc();
2143 if (S && S->count(PredSU) == 0)
2144 continue;
2145 if (IE.ignoreDependence(true))
2146 continue;
2147 if (NodeOrder.count(PredSU) == 0)
2148 Preds.insert(PredSU);
2149 }
2150
2151 // FIXME: The following loop-carried dependencies may also need to be
2152 // considered.
2153 // - Physical register dependencies (true-dependence and WAW).
2154 // - Memory dependencies.
2155 for (const auto &OE : DDG->getOutEdges(SU)) {
2156 SUnit *SuccSU = OE.getDst();
2157 if (!OE.isAntiDep())
2158 continue;
2159 if (S && S->count(SuccSU) == 0)
2160 continue;
2161 if (NodeOrder.count(SuccSU) == 0)
2162 Preds.insert(SuccSU);
2163 }
2164 }
2165 return !Preds.empty();
2166}
2167
2168/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2169/// as the successors of the elements of NodeOrder that are not also in
2170/// NodeOrder.
 2171 static bool succ_L(SetVector<SUnit *> &NodeOrder,
 2172 SmallSetVector<SUnit *, 8> &Succs, SwingSchedulerDDG *DDG,
 2173 const NodeSet *S = nullptr) {
2174 Succs.clear();
2175
2176 for (SUnit *SU : NodeOrder) {
2177 for (const auto &OE : DDG->getOutEdges(SU)) {
2178 SUnit *SuccSU = OE.getDst();
2179 if (S && S->count(SuccSU) == 0)
2180 continue;
2181 if (OE.ignoreDependence(false))
2182 continue;
2183 if (NodeOrder.count(SuccSU) == 0)
2184 Succs.insert(SuccSU);
2185 }
2186
2187 // FIXME: The following loop-carried dependencies may also need to be
2188 // considered.
 2189 // - Physical register dependencies (true-dependence and WAW).
2190 // - Memory dependencies.
2191 for (const auto &IE : DDG->getInEdges(SU)) {
2192 SUnit *PredSU = IE.getSrc();
2193 if (!IE.isAntiDep())
2194 continue;
2195 if (S && S->count(PredSU) == 0)
2196 continue;
2197 if (NodeOrder.count(PredSU) == 0)
2198 Succs.insert(PredSU);
2199 }
2200 }
2201 return !Succs.empty();
2202}
2203
2204/// Return true if there is a path from the specified node to any of the nodes
2205/// in DestNodes. Keep track and return the nodes in any path.
2206static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2207 SetVector<SUnit *> &DestNodes,
2208 SetVector<SUnit *> &Exclude,
2209 SmallPtrSet<SUnit *, 8> &Visited,
2210 SwingSchedulerDDG *DDG) {
2211 if (Cur->isBoundaryNode())
2212 return false;
2213 if (Exclude.contains(Cur))
2214 return false;
2215 if (DestNodes.contains(Cur))
2216 return true;
2217 if (!Visited.insert(Cur).second)
2218 return Path.contains(Cur);
2219 bool FoundPath = false;
2220 for (const auto &OE : DDG->getOutEdges(Cur))
2221 if (!OE.ignoreDependence(false))
2222 FoundPath |=
2223 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2224 for (const auto &IE : DDG->getInEdges(Cur))
2225 if (IE.isAntiDep() && IE.getDistance() == 0)
2226 FoundPath |=
2227 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2228 if (FoundPath)
2229 Path.insert(Cur);
2230 return FoundPath;
2231}
2232
2233/// Compute the live-out registers for the instructions in a node-set.
 2234 /// The live-out registers are those that are defined in the node-set
 2235 /// but not used within it, except for the use operands of Phis.
 2236 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
 2237 NodeSet &NS) {
 2238 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 2239 MachineRegisterInfo &MRI = MF.getRegInfo();
 2240 SmallVector<VRegMaskOrUnit, 8> LiveOutRegs;
 2241 SmallSet<unsigned, 4> Uses;
2242 for (SUnit *SU : NS) {
2243 const MachineInstr *MI = SU->getInstr();
2244 if (MI->isPHI())
2245 continue;
2246 for (const MachineOperand &MO : MI->all_uses()) {
2247 Register Reg = MO.getReg();
2248 if (Reg.isVirtual())
2249 Uses.insert(Reg);
2250 else if (MRI.isAllocatable(Reg))
2251 Uses.insert_range(TRI->regunits(Reg.asMCReg()));
2252 }
2253 }
2254 for (SUnit *SU : NS)
2255 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2256 if (!MO.isDead()) {
2257 Register Reg = MO.getReg();
2258 if (Reg.isVirtual()) {
2259 if (!Uses.count(Reg))
2260 LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone());
2261 } else if (MRI.isAllocatable(Reg)) {
2262 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2263 if (!Uses.count(Unit))
2264 LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone());
2265 }
2266 }
2267 RPTracker.addLiveRegs(LiveOutRegs);
2268}
2269
2270/// A heuristic to filter nodes in recurrent node-sets if the register
2271/// pressure of a set is too high.
2272void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2273 for (auto &NS : NodeSets) {
2274 // Skip small node-sets since they won't cause register pressure problems.
2275 if (NS.size() <= 2)
2276 continue;
2277 IntervalPressure RecRegPressure;
2278 RegPressureTracker RecRPTracker(RecRegPressure);
2279 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2280 computeLiveOuts(MF, RecRPTracker, NS);
2281 RecRPTracker.closeBottom();
2282
2283 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2284 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2285 return A->NodeNum > B->NodeNum;
2286 });
2287
2288 for (auto &SU : SUnits) {
2289 // Since we're computing the register pressure for a subset of the
2290 // instructions in a block, we need to set the tracker for each
2291 // instruction in the node-set. The tracker is set to the instruction
2292 // just after the one we're interested in.
 2293 MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
 2294 RecRPTracker.setPos(std::next(CurInstI));
2295
2296 RegPressureDelta RPDelta;
2297 ArrayRef<PressureChange> CriticalPSets;
2298 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2299 CriticalPSets,
2300 RecRegPressure.MaxSetPressure);
2301 if (RPDelta.Excess.isValid()) {
2302 LLVM_DEBUG(
2303 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2304 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2305 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2306 NS.setExceedPressure(SU);
2307 break;
2308 }
2309 RecRPTracker.recede();
2310 }
2311 }
2312}
2313
2314/// A heuristic to colocate node sets that have the same set of
2315/// successors.
2316void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2317 unsigned Colocate = 0;
2318 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2319 NodeSet &N1 = NodeSets[i];
2320 SmallSetVector<SUnit *, 8> S1;
2321 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2322 continue;
2323 for (int j = i + 1; j < e; ++j) {
2324 NodeSet &N2 = NodeSets[j];
2325 if (N1.compareRecMII(N2) != 0)
2326 continue;
2327 SmallSetVector<SUnit *, 8> S2;
2328 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2329 continue;
2330 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2331 N1.setColocate(++Colocate);
2332 N2.setColocate(Colocate);
2333 break;
2334 }
2335 }
2336 }
2337}
2338
2339/// Check if the existing node-sets are profitable. If not, then ignore the
2340/// recurrent node-sets, and attempt to schedule all nodes together. This is
2341/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2342/// then it's best to try to schedule all instructions together instead of
2343/// starting with the recurrent node-sets.
2344void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2345 // Look for loops with a large MII.
2346 if (MII < 17)
2347 return;
2348 // Check if the node-set contains only a simple add recurrence.
2349 for (auto &NS : NodeSets) {
2350 if (NS.getRecMII() > 2)
2351 return;
2352 if (NS.getMaxDepth() > MII)
2353 return;
2354 }
2355 NodeSets.clear();
2356 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2357}
2358
2359/// Add the nodes that do not belong to a recurrence set into groups
2360/// based upon connected components.
2361void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2362 SetVector<SUnit *> NodesAdded;
2363 SmallPtrSet<SUnit *, 8> Visited;
2364 // Add the nodes that are on a path between the previous node sets and
2365 // the current node set.
2366 for (NodeSet &I : NodeSets) {
2367 SmallSetVector<SUnit *, 8> N;
2368 // Add the nodes from the current node set to the previous node set.
2369 if (succ_L(I, N, DDG.get())) {
2370 SetVector<SUnit *> Path;
2371 for (SUnit *NI : N) {
2372 Visited.clear();
2373 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2374 }
2375 if (!Path.empty())
2376 I.insert(Path.begin(), Path.end());
2377 }
2378 // Add the nodes from the previous node set to the current node set.
2379 N.clear();
2380 if (succ_L(NodesAdded, N, DDG.get())) {
2381 SetVector<SUnit *> Path;
2382 for (SUnit *NI : N) {
2383 Visited.clear();
2384 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2385 }
2386 if (!Path.empty())
2387 I.insert(Path.begin(), Path.end());
2388 }
2389 NodesAdded.insert_range(I);
2390 }
2391
2392 // Create a new node set with the connected nodes of any successor of a node
2393 // in a recurrent set.
2394 NodeSet NewSet;
2395 SmallSetVector<SUnit *, 8> N;
2396 if (succ_L(NodesAdded, N, DDG.get()))
2397 for (SUnit *I : N)
2398 addConnectedNodes(I, NewSet, NodesAdded);
2399 if (!NewSet.empty())
2400 NodeSets.push_back(NewSet);
2401
2402 // Create a new node set with the connected nodes of any predecessor of a node
2403 // in a recurrent set.
2404 NewSet.clear();
2405 if (pred_L(NodesAdded, N, DDG.get()))
2406 for (SUnit *I : N)
2407 addConnectedNodes(I, NewSet, NodesAdded);
2408 if (!NewSet.empty())
2409 NodeSets.push_back(NewSet);
2410
 2411 // Create new node sets with the connected nodes of any remaining node that
 2412 // has no predecessor.
2413 for (SUnit &SU : SUnits) {
2414 if (NodesAdded.count(&SU) == 0) {
2415 NewSet.clear();
2416 addConnectedNodes(&SU, NewSet, NodesAdded);
2417 if (!NewSet.empty())
2418 NodeSets.push_back(NewSet);
2419 }
2420 }
2421}
2422
2423/// Add the node to the set, and add all of its connected nodes to the set.
2424void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2425 SetVector<SUnit *> &NodesAdded) {
2426 NewSet.insert(SU);
2427 NodesAdded.insert(SU);
2428 for (auto &OE : DDG->getOutEdges(SU)) {
2429 SUnit *Successor = OE.getDst();
2430 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2431 NodesAdded.count(Successor) == 0)
2432 addConnectedNodes(Successor, NewSet, NodesAdded);
2433 }
2434 for (auto &IE : DDG->getInEdges(SU)) {
2435 SUnit *Predecessor = IE.getSrc();
2436 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2437 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2438 }
2439}
2440
2441/// Return true if Set1 contains elements in Set2. The elements in common
2442/// are returned in a different container.
2443static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
 2444 SmallSetVector<SUnit *, 8> &Result) {
 2445 Result.clear();
2446 for (SUnit *SU : Set1) {
2447 if (Set2.count(SU) != 0)
2448 Result.insert(SU);
2449 }
2450 return !Result.empty();
2451}
2452
2453/// Merge the recurrence node sets that have the same initial node.
2454void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2455 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2456 ++I) {
2457 NodeSet &NI = *I;
2458 for (NodeSetType::iterator J = I + 1; J != E;) {
2459 NodeSet &NJ = *J;
2460 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2461 if (NJ.compareRecMII(NI) > 0)
2462 NI.setRecMII(NJ.getRecMII());
2463 for (SUnit *SU : *J)
2464 I->insert(SU);
2465 NodeSets.erase(J);
2466 E = NodeSets.end();
2467 } else {
2468 ++J;
2469 }
2470 }
2471 }
2472}
2473
2474/// Remove nodes that have been scheduled in previous NodeSets.
2475void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2476 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2477 ++I)
2478 for (NodeSetType::iterator J = I + 1; J != E;) {
2479 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2480
2481 if (J->empty()) {
2482 NodeSets.erase(J);
2483 E = NodeSets.end();
2484 } else {
2485 ++J;
2486 }
2487 }
2488}
2489
2490/// Compute an ordered list of the dependence graph nodes, which
2491/// indicates the order that the nodes will be scheduled. This is a
2492/// two-level algorithm. First, a partial order is created, which
2493/// consists of a list of sets ordered from highest to lowest priority.
2494void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2495 SmallSetVector<SUnit *, 8> R;
2496 NodeOrder.clear();
2497
2498 for (auto &Nodes : NodeSets) {
2499 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2500 OrderKind Order;
2501 SmallSetVector<SUnit *, 8> N;
2502 if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
2503 R.insert_range(N);
2504 Order = BottomUp;
2505 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2506 } else if (succ_L(NodeOrder, N, DDG.get()) &&
2507 llvm::set_is_subset(N, Nodes)) {
2508 R.insert_range(N);
2509 Order = TopDown;
2510 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2511 } else if (isIntersect(N, Nodes, R)) {
2512 // If some of the successors are in the existing node-set, then use the
2513 // top-down ordering.
2514 Order = TopDown;
2515 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2516 } else if (NodeSets.size() == 1) {
2517 for (const auto &N : Nodes)
2518 if (N->Succs.size() == 0)
2519 R.insert(N);
2520 Order = BottomUp;
2521 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2522 } else {
2523 // Find the node with the highest ASAP.
2524 SUnit *maxASAP = nullptr;
2525 for (SUnit *SU : Nodes) {
2526 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2527 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2528 maxASAP = SU;
2529 }
2530 R.insert(maxASAP);
2531 Order = BottomUp;
2532 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2533 }
2534
2535 while (!R.empty()) {
2536 if (Order == TopDown) {
2537 // Choose the node with the maximum height. If more than one, choose
 2538 // the node with the maximum ZeroLatencyHeight. If still more than one,
2539 // choose the node with the lowest MOV.
2540 while (!R.empty()) {
2541 SUnit *maxHeight = nullptr;
2542 for (SUnit *I : R) {
2543 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2544 maxHeight = I;
2545 else if (getHeight(I) == getHeight(maxHeight) &&
2546 getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
2547 maxHeight = I;
2548 else if (getHeight(I) == getHeight(maxHeight) &&
2549 getZeroLatencyHeight(I) ==
2550 getZeroLatencyHeight(maxHeight) &&
2551 getMOV(I) < getMOV(maxHeight))
2552 maxHeight = I;
2553 }
2554 NodeOrder.insert(maxHeight);
2555 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2556 R.remove(maxHeight);
2557 for (const auto &OE : DDG->getOutEdges(maxHeight)) {
2558 SUnit *SU = OE.getDst();
2559 if (Nodes.count(SU) == 0)
2560 continue;
2561 if (NodeOrder.contains(SU))
2562 continue;
2563 if (OE.ignoreDependence(false))
2564 continue;
2565 R.insert(SU);
2566 }
2567
2568 // FIXME: The following loop-carried dependencies may also need to be
2569 // considered.
 2570 // - Physical register dependencies (true-dependence and WAW).
2571 // - Memory dependencies.
2572 for (const auto &IE : DDG->getInEdges(maxHeight)) {
2573 SUnit *SU = IE.getSrc();
2574 if (!IE.isAntiDep())
2575 continue;
2576 if (Nodes.count(SU) == 0)
2577 continue;
2578 if (NodeOrder.contains(SU))
2579 continue;
2580 R.insert(SU);
2581 }
2582 }
2583 Order = BottomUp;
2584 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2585 SmallSetVector<SUnit *, 8> N;
2586 if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
2587 R.insert_range(N);
2588 } else {
2589 // Choose the node with the maximum depth. If more than one, choose
2590 // the node with the maximum ZeroLatencyDepth. If still more than one,
2591 // choose the node with the lowest MOV.
2592 while (!R.empty()) {
2593 SUnit *maxDepth = nullptr;
2594 for (SUnit *I : R) {
2595 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2596 maxDepth = I;
2597 else if (getDepth(I) == getDepth(maxDepth) &&
2598 getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
2599 maxDepth = I;
2600 else if (getDepth(I) == getDepth(maxDepth) &&
2601 getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
2602 getMOV(I) < getMOV(maxDepth))
2603 maxDepth = I;
2604 }
2605 NodeOrder.insert(maxDepth);
2606 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2607 R.remove(maxDepth);
2608 if (Nodes.isExceedSU(maxDepth)) {
2609 Order = TopDown;
2610 R.clear();
2611 R.insert(Nodes.getNode(0));
2612 break;
2613 }
2614 for (const auto &IE : DDG->getInEdges(maxDepth)) {
2615 SUnit *SU = IE.getSrc();
2616 if (Nodes.count(SU) == 0)
2617 continue;
2618 if (NodeOrder.contains(SU))
2619 continue;
2620 R.insert(SU);
2621 }
2622
2623 // FIXME: The following loop-carried dependencies may also need to be
2624 // considered.
 2625 // - Physical register dependencies (true-dependence and WAW).
2626 // - Memory dependencies.
2627 for (const auto &OE : DDG->getOutEdges(maxDepth)) {
2628 SUnit *SU = OE.getDst();
2629 if (!OE.isAntiDep())
2630 continue;
2631 if (Nodes.count(SU) == 0)
2632 continue;
2633 if (NodeOrder.contains(SU))
2634 continue;
2635 R.insert(SU);
2636 }
2637 }
2638 Order = TopDown;
2639 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2640 SmallSetVector<SUnit *, 8> N;
2641 if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
2642 R.insert_range(N);
2643 }
2644 }
2645 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2646 }
2647
2648 LLVM_DEBUG({
2649 dbgs() << "Node order: ";
2650 for (SUnit *I : NodeOrder)
2651 dbgs() << " " << I->NodeNum << " ";
2652 dbgs() << "\n";
2653 });
2654}
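// Illustrative sketch: for a recurrence {A, B} that feeds a later node C,
// the first node-set is ordered bottom-up from its highest-ASAP node, and
// the direction then alternates so that C is ordered top-down from its
// already-ordered predecessors. The alternation keeps each newly ordered
// node adjacent in the dependence graph to nodes already in NodeOrder.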
2655
2656/// Process the nodes in the computed order and create the pipelined schedule
2657/// of the instructions, if possible. Return true if a schedule is found.
2658bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2659
 2660 if (NodeOrder.empty()) {
 2661 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n");
2662 return false;
2663 }
2664
2665 bool scheduleFound = false;
2666 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2667 if (LimitRegPressure) {
2668 HRPDetector =
2669 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2670 HRPDetector->init(RegClassInfo);
2671 }
2672 // Keep increasing II until a valid schedule is found.
2673 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2674 Schedule.reset();
2675 Schedule.setInitiationInterval(II);
2676 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2677
 2678 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
 2679 SetVector<SUnit *>::iterator NE = NodeOrder.end();
 2680 do {
2681 SUnit *SU = *NI;
2682
2683 // Compute the schedule time for the instruction, which is based
2684 // upon the scheduled time for any predecessors/successors.
2685 int EarlyStart = INT_MIN;
2686 int LateStart = INT_MAX;
2687 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2688 LLVM_DEBUG({
2689 dbgs() << "\n";
2690 dbgs() << "Inst (" << SU->NodeNum << ") ";
2691 SU->getInstr()->dump();
2692 dbgs() << "\n";
2693 });
2694 LLVM_DEBUG(
2695 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2696
2697 if (EarlyStart > LateStart)
2698 scheduleFound = false;
2699 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2700 scheduleFound =
2701 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2702 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2703 scheduleFound =
2704 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2705 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2706 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2707 // When scheduling a Phi it is better to start at the late cycle and
2708 // go backwards. The default order may insert the Phi too far away
2709 // from its first dependence.
2710 // Also, do backward search when all scheduled predecessors are
2711 // loop-carried output/order dependencies. Empirically, there are also
2712 // cases where scheduling becomes possible with backward search.
2713 if (SU->getInstr()->isPHI() ||
2714 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2715 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2716 else
2717 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2718 } else {
2719 int FirstCycle = Schedule.getFirstCycle();
2720 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2721 FirstCycle + getASAP(SU) + II - 1, II);
2722 }
2723
2724 // Even if we find a schedule, make sure the schedule doesn't exceed the
2725 // allowable number of stages. We keep trying if this happens.
2726 if (scheduleFound)
2727 if (SwpMaxStages > -1 &&
2728 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2729 scheduleFound = false;
2730
2731 LLVM_DEBUG({
2732 if (!scheduleFound)
2733 dbgs() << "\tCan't schedule\n";
2734 });
2735 } while (++NI != NE && scheduleFound);
2736
2737 // If a schedule is found, validate it against the validation-only
2738 // dependencies.
2739 if (scheduleFound)
2740 scheduleFound = DDG->isValidSchedule(Schedule);
2741
2742 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2743 if (scheduleFound)
2744 scheduleFound =
2745 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2746
2747 // If a schedule is found, check if it is a valid schedule too.
2748 if (scheduleFound)
2749 scheduleFound = Schedule.isValidSchedule(this);
2750
2751 // If a schedule was found and the option is enabled, check if the schedule
2752 // might generate additional register spills/fills.
2753 if (scheduleFound && LimitRegPressure)
2754 scheduleFound =
2755 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2756 }
2757
2758 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2759 << " (II=" << Schedule.getInitiationInterval()
2760 << ")\n");
2761
2762 if (scheduleFound) {
2763 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2764 if (!scheduleFound)
2765 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2766 }
2767
2768 if (scheduleFound) {
2769 Schedule.finalizeSchedule(this);
2770 Pass.ORE->emit([&]() {
2771 return MachineOptimizationRemarkAnalysis(
2772 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2773 << "Schedule found with Initiation Interval: "
2774 << ore::NV("II", Schedule.getInitiationInterval())
2775 << ", MaxStageCount: "
2776 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2777 });
2778 } else
2779 Schedule.reset();
2780
2781 return scheduleFound && Schedule.getMaxStageCount() > 0;
2782}
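// Scheduling-window sketch (hypothetical numbers): with II = 3, if the
// scheduled predecessors force EarlyStart = 4 and the successors allow
// LateStart = 9, the node is only tried at cycles 4..6 (EarlyStart to
// EarlyStart + II - 1); probing more than II consecutive cycles would
// revisit the same resource slots modulo II.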
2783
 2784 static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI) {
 2785 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2786 Register Result;
2787 for (const MachineOperand &Use : MI.all_uses()) {
2788 Register Reg = Use.getReg();
2789 if (!Reg.isVirtual())
2790 return Register();
2791 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2792 continue;
2793 if (Result)
2794 return Register();
2795 Result = Reg;
2796 }
2797 return Result;
2798}
2799
2800/// When Op is a value that is incremented recursively in a loop and there is a
2801/// unique instruction that increments it, returns true and sets Value.
 2802 static bool findLoopIncrementValue(const MachineOperand &Op, int &Value) {
 2803 if (!Op.isReg() || !Op.getReg().isVirtual())
2804 return false;
2805
2806 Register OrgReg = Op.getReg();
2807 Register CurReg = OrgReg;
2808 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2809 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2810
2811 const TargetInstrInfo *TII =
2812 LoopBB->getParent()->getSubtarget().getInstrInfo();
2813 const TargetRegisterInfo *TRI =
2814 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2815
2816 MachineInstr *Phi = nullptr;
2817 MachineInstr *Increment = nullptr;
2818
 2819 // Traverse the chain of definitions until it reaches Op again or an
 2820 // instruction that does not satisfy the conditions below.
2821 // Acceptable example:
2822 // bb.0:
2823 // %0 = PHI %3, %bb.0, ...
2824 // %2 = ADD %0, Value
2825 // ... = LOAD %2(Op)
2826 // %3 = COPY %2
2827 while (true) {
2828 if (!CurReg.isValid() || !CurReg.isVirtual())
2829 return false;
2830 MachineInstr *Def = MRI.getVRegDef(CurReg);
2831 if (Def->getParent() != LoopBB)
2832 return false;
2833
2834 if (Def->isCopy()) {
2835 // Ignore copy instructions unless they contain subregisters
2836 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2837 return false;
2838 CurReg = Def->getOperand(1).getReg();
2839 } else if (Def->isPHI()) {
2840 // There must be just one Phi
2841 if (Phi)
2842 return false;
2843 Phi = Def;
2844 CurReg = getLoopPhiReg(*Def, LoopBB);
2845 } else if (TII->getIncrementValue(*Def, Value)) {
2846 // Potentially a unique increment
2847 if (Increment)
2848 // Multiple increments exist
2849 return false;
2850
2851 const MachineOperand *BaseOp;
2852 int64_t Offset;
2853 bool OffsetIsScalable;
2854 if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2855 TRI)) {
2856 // Pre/post increment instruction
2857 CurReg = BaseOp->getReg();
2858 } else {
2859 // If only one of the operands is defined within the loop, it is assumed
2860 // to be an incremented value.
2861 CurReg = findUniqueOperandDefinedInLoop(*Def);
2862 if (!CurReg.isValid())
2863 return false;
2864 }
2865 Increment = Def;
2866 } else {
2867 return false;
2868 }
2869 if (CurReg == OrgReg)
2870 break;
2871 }
2872
2873 if (!Phi || !Increment)
2874 return false;
2875
2876 return true;
2877}
2878
2879/// Return true if we can compute the amount the instruction changes
2880/// during each iteration. Set Delta to the amount of the change.
2881bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2882 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2883 const MachineOperand *BaseOp;
2884 int64_t Offset;
2885 bool OffsetIsScalable;
2886 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2887 return false;
2888
2889 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2890 if (OffsetIsScalable)
2891 return false;
2892
2893 if (!BaseOp->isReg())
2894 return false;
2895
2896 return findLoopIncrementValue(*BaseOp, Delta);
2897}
2898
2899/// Check if we can change the instruction to use an offset value from the
2900/// previous iteration. If so, return true and set the base and offset values
2901/// so that we can rewrite the load, if necessary.
2902/// v1 = Phi(v0, v3)
2903/// v2 = load v1, 0
2904/// v3 = post_store v1, 4, x
2905/// This function enables the load to be rewritten as v2 = load v3, 4.
2906bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2907 unsigned &BasePos,
2908 unsigned &OffsetPos,
2909 Register &NewBase,
2910 int64_t &Offset) {
2911 // Get the load instruction.
2912 if (TII->isPostIncrement(*MI))
2913 return false;
2914 unsigned BasePosLd, OffsetPosLd;
2915 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2916 return false;
2917 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2918
2919 // Look for the Phi instruction.
2920 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2921 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2922 if (!Phi || !Phi->isPHI())
2923 return false;
2924 // Get the register defined in the loop block.
2925 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2926 if (!PrevReg)
2927 return false;
2928
2929 // Check for the post-increment load/store instruction.
2930 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2931 if (!PrevDef || PrevDef == MI)
2932 return false;
2933
2934 if (!TII->isPostIncrement(*PrevDef))
2935 return false;
2936
2937 unsigned BasePos1 = 0, OffsetPos1 = 0;
2938 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2939 return false;
2940
2941 // Make sure that the instructions do not access the same memory location in
2942 // the next iteration.
2943 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2944 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2945 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2946 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2947 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2948 MF.deleteMachineInstr(NewMI);
2949 if (!Disjoint)
2950 return false;
2951
2952 // Set the return value once we determine that we return true.
2953 BasePos = BasePosLd;
2954 OffsetPos = OffsetPosLd;
2955 NewBase = PrevReg;
2956 Offset = StoreOffset;
2957 return true;
2958}
2959
 2960 /// Apply changes to the instruction if needed. The changes are needed
 2961 /// to improve the scheduling and depend upon the final schedule.
 2962 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
 2963 SMSchedule &Schedule) {
2964 SUnit *SU = getSUnit(MI);
 2965 DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
 2966 InstrChanges.find(SU);
2967 if (It != InstrChanges.end()) {
2968 std::pair<Register, int64_t> RegAndOffset = It->second;
2969 unsigned BasePos, OffsetPos;
2970 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2971 return;
2972 Register BaseReg = MI->getOperand(BasePos).getReg();
2973 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2974 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2975 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2976 int BaseStageNum = Schedule.stageScheduled(SU);
2977 int BaseCycleNum = Schedule.cycleScheduled(SU);
2978 if (BaseStageNum < DefStageNum) {
2979 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
2980 int OffsetDiff = DefStageNum - BaseStageNum;
2981 if (DefCycleNum < BaseCycleNum) {
2982 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2983 if (OffsetDiff > 0)
2984 --OffsetDiff;
2985 }
2986 int64_t NewOffset =
2987 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2988 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2989 SU->setInstr(NewMI);
2990 MISUnitMap[NewMI] = SU;
2991 NewMIs[MI] = NewMI;
2992 }
2993 }
2994}
2995
2996/// Return the instruction in the loop that defines the register.
2997/// If the definition is a Phi, then follow the Phi operand to
2998/// the instruction in the loop.
2999MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 3000 SmallPtrSet<MachineInstr *, 8> Visited;
 3001 MachineInstr *Def = MRI.getVRegDef(Reg);
3002 while (Def->isPHI()) {
3003 if (!Visited.insert(Def).second)
3004 break;
3005 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3006 if (Def->getOperand(i + 1).getMBB() == BB) {
3007 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3008 break;
3009 }
3010 }
3011 return Def;
3012}
3013
3014/// Return false if there is no overlap between the region accessed by BaseMI in
3015/// an iteration and the region accessed by OtherMI in subsequent iterations.
 3016 bool SwingSchedulerDAG::mayOverlapInLaterIter(
 3017 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
3018 int DeltaB, DeltaO, Delta;
3019 if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
3020 DeltaB != DeltaO)
3021 return true;
3022 Delta = DeltaB;
3023
3024 const MachineOperand *BaseOpB, *BaseOpO;
3025 int64_t OffsetB, OffsetO;
3026 bool OffsetBIsScalable, OffsetOIsScalable;
3027 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3028 if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
3029 OffsetBIsScalable, TRI) ||
3030 !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
3031 OffsetOIsScalable, TRI))
3032 return true;
3033
3034 if (OffsetBIsScalable || OffsetOIsScalable)
3035 return true;
3036
3037 if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
3038 // Pass cases with different base operands but same initial values.
3039 // Typically for when pre/post increment is used.
3040
3041 if (!BaseOpB->isReg() || !BaseOpO->isReg())
3042 return true;
3043 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
3044 if (!RegB.isVirtual() || !RegO.isVirtual())
3045 return true;
3046
3047 MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
3048 MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
3049 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
3050 return true;
3051
3052 Register InitValB;
3053 Register LoopValB;
3054 Register InitValO;
3055 Register LoopValO;
3056 getPhiRegs(*DefB, BB, InitValB, LoopValB);
3057 getPhiRegs(*DefO, BB, InitValO, LoopValO);
3058 MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
3059 MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
3060
3061 if (!InitDefB->isIdenticalTo(*InitDefO))
3062 return true;
3063 }
3064
3065 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3066 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3067
3068 // This is the main test, which checks the offset values and the loop
3069 // increment value to determine if the accesses may be loop carried.
3070 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
3071 return true;
3072
3073 LLVM_DEBUG({
3074 dbgs() << "Overlap check:\n";
3075 dbgs() << " BaseMI: ";
3076 BaseMI->dump();
3077 dbgs() << " Base + " << OffsetB << " + I * " << Delta
3078 << ", Len: " << AccessSizeB.getValue() << "\n";
3079 dbgs() << " OtherMI: ";
3080 OtherMI->dump();
3081 dbgs() << " Base + " << OffsetO << " + I * " << Delta
3082 << ", Len: " << AccessSizeO.getValue() << "\n";
3083 });
3084
3085 // Excessive overlap may be detected in strided patterns.
3086 // For example, the memory addresses of the store and the load in
3087 // for (i=0; i<n; i+=2) a[i+1] = a[i];
3088 // are assumed to overlap.
3089 if (Delta < 0) {
3090 int64_t BaseMinAddr = OffsetB;
 3091 int64_t OtherNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
 3092 if (BaseMinAddr > OtherNextIterMaxAddr) {
3093 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3094 return false;
3095 }
3096 } else {
3097 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
3098 int64_t OtherNextIterMinAddr = OffsetO + Delta;
3099 if (BaseMaxAddr < OtherNextIterMinAddr) {
3100 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3101 return false;
3102 }
3103 }
3104 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3105 return true;
3106}
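// Worked example (hypothetical values): OffsetB = 0, OffsetO = 4, Delta = 8,
// and both accesses 4 bytes wide. BaseMaxAddr = 0 + 4 - 1 = 3 while the
// other access's minimum address in the next iteration is 4 + 8 = 12; since
// 3 < 12, the regions are disjoint and the function returns false.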
3107
 3108 /// Return true for an order or output dependence that is potentially loop
 3109 /// carried. A dependence is loop carried if the destination defines a value
 3110 /// that may be used or defined by the source in a subsequent iteration.
 3111 bool SwingSchedulerDAG::isLoopCarriedDep(
 3112 const SwingSchedulerDDGEdge &Edge) const {
3113 if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
3114 Edge.getDst()->isBoundaryNode())
3115 return false;
3116
 3117 if (!SwpPruneLoopCarried)
 3118 return true;
3119
3120 if (Edge.isOutputDep())
3121 return true;
3122
3123 MachineInstr *SI = Edge.getSrc()->getInstr();
3124 MachineInstr *DI = Edge.getDst()->getInstr();
3125 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3126
3127 // Assume ordered loads and stores may have a loop carried dependence.
3128 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3129 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
3130 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3131 return true;
3132
3133 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
3134 return false;
3135
3136 // The conservative assumption is that a dependence between memory operations
3137 // may be loop carried. The following code checks when it can be proved that
3138 // there is no loop carried dependence.
3139 return mayOverlapInLaterIter(DI, SI);
3140}
3141
3142void SwingSchedulerDAG::postProcessDAG() {
3143 for (auto &M : Mutations)
3144 M->apply(this);
3145}
3146
3147/// Try to schedule the node at the specified StartCycle and continue
 3148 /// until the node is scheduled or the EndCycle is reached. This function
3149/// returns true if the node is scheduled. This routine may search either
3150/// forward or backward for a place to insert the instruction based upon
3151/// the relative values of StartCycle and EndCycle.
3152bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3153 bool forward = true;
3154 LLVM_DEBUG({
3155 dbgs() << "Trying to insert node between " << StartCycle << " and "
3156 << EndCycle << " II: " << II << "\n";
3157 });
3158 if (StartCycle > EndCycle)
3159 forward = false;
3160
3161 // The terminating condition depends on the direction.
3162 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3163 for (int curCycle = StartCycle; curCycle != termCycle;
3164 forward ? ++curCycle : --curCycle) {
3165
3166 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3167 ProcItinResources.canReserveResources(*SU, curCycle)) {
3168 LLVM_DEBUG({
3169 dbgs() << "\tinsert at cycle " << curCycle << " ";
3170 SU->getInstr()->dump();
3171 });
3172
3173 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3174 ProcItinResources.reserveResources(*SU, curCycle);
3175 ScheduledInstrs[curCycle].push_back(SU);
3176 InstrToCycle.insert(std::make_pair(SU, curCycle));
3177 if (curCycle > LastCycle)
3178 LastCycle = curCycle;
3179 if (curCycle < FirstCycle)
3180 FirstCycle = curCycle;
3181 return true;
3182 }
3183 LLVM_DEBUG({
3184 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3185 SU->getInstr()->dump();
3186 });
3187 }
3188 return false;
3189}
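// Illustrative sketch: insert(SU, 5, 2, II) probes cycles 5, 4, 3, 2 in
// that order (a backward search), while insert(SU, 2, 5, II) probes them
// forward; SU lands at the first cycle where the DFA can reserve its
// resources. Zero-cost instructions skip the resource reservation entirely.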
3190
3191// Return the cycle of the earliest scheduled instruction in the chain.
 3192 int SMSchedule::earliestCycleInChain(const SwingSchedulerDDGEdge &Dep,
 3193 const SwingSchedulerDDG *DDG) {
 3194 SmallPtrSet<SUnit *, 8> Visited;
 3195 SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
 3196 Worklist.push_back(Dep);
3197 int EarlyCycle = INT_MAX;
3198 while (!Worklist.empty()) {
3199 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
3200 SUnit *PrevSU = Cur.getSrc();
3201 if (Visited.count(PrevSU))
3202 continue;
3203 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3204 if (it == InstrToCycle.end())
3205 continue;
3206 EarlyCycle = std::min(EarlyCycle, it->second);
3207 for (const auto &IE : DDG->getInEdges(PrevSU))
3208 if (IE.isOrderDep() || IE.isOutputDep())
3209 Worklist.push_back(IE);
3210 Visited.insert(PrevSU);
3211 }
3212 return EarlyCycle;
3213}
3214
3215// Return the cycle of the latest scheduled instruction in the chain.
 3216 int SMSchedule::latestCycleInChain(const SwingSchedulerDDGEdge &Dep,
 3217 const SwingSchedulerDDG *DDG) {
 3218 SmallPtrSet<SUnit *, 8> Visited;
 3219 SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
 3220 Worklist.push_back(Dep);
3221 int LateCycle = INT_MIN;
3222 while (!Worklist.empty()) {
3223 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
3224 SUnit *SuccSU = Cur.getDst();
3225 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
3226 continue;
3227 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3228 if (it == InstrToCycle.end())
3229 continue;
3230 LateCycle = std::max(LateCycle, it->second);
3231 for (const auto &OE : DDG->getOutEdges(SuccSU))
3232 if (OE.isOrderDep() || OE.isOutputDep())
3233 Worklist.push_back(OE);
3234 Visited.insert(SuccSU);
3235 }
3236 return LateCycle;
3237}
3238
3239/// If an instruction has a use that spans multiple iterations, then
 3240 /// return true. These instructions are characterized by having a back-edge
 3241 /// to a Phi, which contains a reference to another Phi.
 3242 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
 3243 for (auto &P : SU->Preds)
3244 if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
3245 for (auto &S : P.getSUnit()->Succs)
3246 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3247 return P.getSUnit();
3248 return nullptr;
3249}
3250
3251/// Compute the scheduling start slot for the instruction. The start slot
3252/// depends on any predecessor or successor nodes scheduled already.
3253void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3254 int II, SwingSchedulerDAG *DAG) {
3255 const SwingSchedulerDDG *DDG = DAG->getDDG();
3256
3257 // Iterate over each instruction that has been scheduled already. The start
3258 // slot computation depends on whether the previously scheduled instruction
3259 // is a predecessor or successor of the specified instruction.
3260 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3261 for (SUnit *I : getInstructions(cycle)) {
3262 for (const auto &IE : DDG->getInEdges(SU)) {
3263 if (IE.getSrc() == I) {
3264 // FIXME: Add reverse edge to `DDG` instead of calling
3265 // `isLoopCarriedDep`
3266 if (DAG->isLoopCarriedDep(IE)) {
3267 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3268 *MinLateStart = std::min(*MinLateStart, End);
3269 }
3270 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3271 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3272 }
3273 }
3274
3275 for (const auto &OE : DDG->getOutEdges(SU)) {
3276 if (OE.getDst() == I) {
3277 // FIXME: Add reverse edge to `DDG` instead of calling
3278 // `isLoopCarriedDep`
3279 if (DAG->isLoopCarriedDep(OE)) {
3280 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3281 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3282 }
3283 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3284 *MinLateStart = std::min(*MinLateStart, LateStart);
3285 }
3286 }
3287
3288 SUnit *BE = multipleIterations(I, DAG);
3289 for (const auto &Dep : SU->Preds) {
 3290 // For an instruction that requires multiple iterations, make sure that
3291 // the dependent instruction is not scheduled past the definition.
3292 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3293 !SU->isPred(I))
3294 *MinLateStart = std::min(*MinLateStart, cycle);
3295 }
3296 }
3297 }
3298}
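// Worked example (hypothetical edge): with II = 4, a predecessor scheduled
// at cycle 10 with latency 2 and distance 0 yields
// EarlyStart = 10 + 2 - 0 * 4 = 12; with distance 1 (a loop-carried value),
// EarlyStart = 10 + 2 - 4 = 8, since the value is produced one kernel
// iteration earlier.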
3299
 3300 /// Order the instructions within a cycle so that the definitions occur
 3301 /// before the uses. The instruction is added to the front of the list
 3302 /// if it must come before a use, or to the back of the list otherwise.
 3303 void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
 3304 std::deque<SUnit *> &Insts) const {
3305 MachineInstr *MI = SU->getInstr();
3306 bool OrderBeforeUse = false;
3307 bool OrderAfterDef = false;
3308 bool OrderBeforeDef = false;
3309 unsigned MoveDef = 0;
3310 unsigned MoveUse = 0;
3311 int StageInst1 = stageScheduled(SU);
3312 const SwingSchedulerDDG *DDG = SSD->getDDG();
3313
3314 unsigned Pos = 0;
3315 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3316 ++I, ++Pos) {
3317 for (MachineOperand &MO : MI->operands()) {
3318 if (!MO.isReg() || !MO.getReg().isVirtual())
3319 continue;
3320
3321 Register Reg = MO.getReg();
3322 unsigned BasePos, OffsetPos;
3323 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3324 if (MI->getOperand(BasePos).getReg() == Reg)
3325 if (Register NewReg = SSD->getInstrBaseReg(SU))
3326 Reg = NewReg;
3327 bool Reads, Writes;
3328 std::tie(Reads, Writes) =
3329 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3330 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3331 OrderBeforeUse = true;
3332 if (MoveUse == 0)
3333 MoveUse = Pos;
3334 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3335 // Add the instruction after the scheduled instruction.
3336 OrderAfterDef = true;
3337 MoveDef = Pos;
3338 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3339 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3340 OrderBeforeUse = true;
3341 if (MoveUse == 0)
3342 MoveUse = Pos;
3343 } else {
3344 OrderAfterDef = true;
3345 MoveDef = Pos;
3346 }
3347 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3348 OrderBeforeUse = true;
3349 if (MoveUse == 0)
3350 MoveUse = Pos;
3351 if (MoveUse != 0) {
3352 OrderAfterDef = true;
3353 MoveDef = Pos - 1;
3354 }
3355 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3356 // Add the instruction before the scheduled instruction.
3357 OrderBeforeUse = true;
3358 if (MoveUse == 0)
3359 MoveUse = Pos;
3360 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3361 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3362 if (MoveUse == 0) {
3363 OrderBeforeDef = true;
3364 MoveUse = Pos;
3365 }
3366 }
3367 }
3368 // Check for order dependences between instructions. Make sure the source
3369 // is ordered before the destination.
3370 for (auto &OE : DDG->getOutEdges(SU)) {
3371 if (OE.getDst() != *I)
3372 continue;
3373 if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
3374 OrderBeforeUse = true;
3375 if (Pos < MoveUse)
3376 MoveUse = Pos;
3377 }
 3378 // We did not handle HW dependences in the previous loop, and we normally
 3379 // set Latency = 0 for Anti/Output deps, so there may be nodes in the same
 3380 // cycle with Anti/Output dependences on HW registers.
3381 else if ((OE.isAntiDep() || OE.isOutputDep()) &&
3382 stageScheduled(*I) == StageInst1) {
3383 OrderBeforeUse = true;
3384 if ((MoveUse == 0) || (Pos < MoveUse))
3385 MoveUse = Pos;
3386 }
3387 }
3388 for (auto &IE : DDG->getInEdges(SU)) {
3389 if (IE.getSrc() != *I)
3390 continue;
3391 if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
3392 stageScheduled(*I) == StageInst1) {
3393 OrderAfterDef = true;
3394 MoveDef = Pos;
3395 }
3396 }
3397 }
3398
3399 // A circular dependence.
3400 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3401 OrderBeforeUse = false;
3402
 3403 // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
3404 // to a loop-carried dependence.
3405 if (OrderBeforeDef)
3406 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3407
3408 // The uncommon case when the instruction order needs to be updated because
3409 // there is both a use and def.
3410 if (OrderBeforeUse && OrderAfterDef) {
3411 SUnit *UseSU = Insts.at(MoveUse);
3412 SUnit *DefSU = Insts.at(MoveDef);
3413 if (MoveUse > MoveDef) {
3414 Insts.erase(Insts.begin() + MoveUse);
3415 Insts.erase(Insts.begin() + MoveDef);
3416 } else {
3417 Insts.erase(Insts.begin() + MoveDef);
3418 Insts.erase(Insts.begin() + MoveUse);
3419 }
3420 orderDependence(SSD, UseSU, Insts);
3421 orderDependence(SSD, SU, Insts);
3422 orderDependence(SSD, DefSU, Insts);
3423 return;
3424 }
3425 // Put the new instruction first if there is a use in the list. Otherwise,
3426 // put it at the end of the list.
3427 if (OrderBeforeUse)
3428 Insts.push_front(SU);
3429 else
3430 Insts.push_back(SU);
3431}
3432
3433/// Return true if the scheduled Phi has a loop carried operand.
3434 bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
3435                                MachineInstr &Phi) const {
3436 if (!Phi.isPHI())
3437 return false;
3438 assert(Phi.isPHI() && "Expecting a Phi.");
3439 SUnit *DefSU = SSD->getSUnit(&Phi);
3440 unsigned DefCycle = cycleScheduled(DefSU);
3441 int DefStage = stageScheduled(DefSU);
3442
3443 Register InitVal;
3444 Register LoopVal;
3445 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3446 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3447 if (!UseSU)
3448 return true;
3449 if (UseSU->getInstr()->isPHI())
3450 return true;
3451 unsigned LoopCycle = cycleScheduled(UseSU);
3452 int LoopStage = stageScheduled(UseSU);
3453 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3454}
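// Illustrative reading of the check above, with hypothetical values not
// taken from any real schedule: assume the Phi (DefSU) is scheduled at
// cycle 0, stage 0. If the producer of the loop value (UseSU) lands at
// cycle 1, then LoopCycle (1) > DefCycle (0) and the operand is loop
// carried; likewise if LoopStage <= DefStage. Only a producer placed in
// the same or an earlier cycle and a strictly greater stage lets the Phi
// see the current iteration's value.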
3455
3456/// Return true if the instruction is a definition that is loop carried
3457/// and defines the use on the next iteration.
3458/// v1 = phi(v2, v3)
3459/// (Def) v3 = op v1
3460/// (MO) = v1
3461/// If MO appears before Def, then v1 and v3 may get assigned to the same
3462/// register.
3463 bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
3464                                        MachineInstr *Def,
3465 MachineOperand &MO) const {
3466 if (!MO.isReg())
3467 return false;
3468 if (Def->isPHI())
3469 return false;
3470 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3471 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3472 return false;
3473 if (!isLoopCarried(SSD, *Phi))
3474 return false;
3475 Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3476 for (MachineOperand &DMO : Def->all_defs()) {
3477 if (DMO.getReg() == LoopReg)
3478 return true;
3479 }
3480 return false;
3481}
3482
3483/// Return true if all scheduled predecessors are loop-carried output/order
3484/// dependencies.
3485 bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
3486     SUnit *SU, const SwingSchedulerDDG *DDG) const {
3487 for (const auto &IE : DDG->getInEdges(SU))
3488 if (InstrToCycle.count(IE.getSrc()))
3489 return false;
3490 return true;
3491}
3492
3493/// Determine transitive dependences of unpipelineable instructions
3494 SmallPtrSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
3495     SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3496   SmallPtrSet<SUnit *, 8> DoNotPipeline;
3497 SmallVector<SUnit *, 8> Worklist;
3498
3499 for (auto &SU : SSD->SUnits)
3500 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3501 Worklist.push_back(&SU);
3502
3503 const SwingSchedulerDDG *DDG = SSD->getDDG();
3504 while (!Worklist.empty()) {
3505 auto SU = Worklist.pop_back_val();
3506 if (DoNotPipeline.count(SU))
3507 continue;
3508 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3509 DoNotPipeline.insert(SU);
3510 for (const auto &IE : DDG->getInEdges(SU))
3511 Worklist.push_back(IE.getSrc());
3512
3513     // To preserve previous behavior and prevent regression
3514     // FIXME: Remove if this doesn't have significant impact on performance
3515 for (const auto &OE : DDG->getOutEdges(SU))
3516 if (OE.getDistance() == 1)
3517 Worklist.push_back(OE.getDst());
3518 }
3519 return DoNotPipeline;
3520}
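// A minimal sketch of the closure above, using hypothetical node numbers:
// if SU(3) is reported by shouldIgnoreForPipelining, it is marked, every
// source of one of its in-edges (say SU(1) and SU(2)) is pushed and marked
// in turn, and so on transitively; distance-1 out-edge destinations
// (loop-carried consumers) are swept in as well. The result is the
// dependence fan-in of every unpipelineable instruction.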
3521
3522// Determine all instructions upon which any unpipelineable instruction depends
3523// and ensure that they are in stage 0. If unable to do so, return false.
3524 bool SMSchedule::normalizeNonPipelinedInstructions(
3525     SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3526   SmallPtrSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
3527
3528 int NewLastCycle = INT_MIN;
3529 for (SUnit &SU : SSD->SUnits) {
3530 if (!SU.isInstr())
3531 continue;
3532 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3533 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3534 continue;
3535 }
3536
3537 // Put the non-pipelined instruction as early as possible in the schedule
3538 int NewCycle = getFirstCycle();
3539 for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
3540 if (IE.getDistance() == 0)
3541 NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);
3542
3543 // To preserve previous behavior and prevent regression
3544 // FIXME: Remove if this doesn't have significant impact on performance
3545 for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
3546 if (OE.getDistance() == 1)
3547 NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);
3548
3549 int OldCycle = InstrToCycle[&SU];
3550 if (OldCycle != NewCycle) {
3551 InstrToCycle[&SU] = NewCycle;
3552 auto &OldS = getInstructions(OldCycle);
3553 llvm::erase(OldS, &SU);
3554 getInstructions(NewCycle).emplace_back(&SU);
3555 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3556 << ") is not pipelined; moving from cycle " << OldCycle
3557 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3558 }
3559
3560 // We traverse the SUs in the order of the original basic block. Computing
3561 // NewCycle in this order normally works fine because all dependencies
3562 // (except for loop-carried dependencies) don't violate the original order.
3563 // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
3564     // break it. That is, there may exist an artificial dependency from
3565 // bottom to top. In such a case, NewCycle may become too large to be
3566 // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
3567 // following case:
3568 //
3569 // | Inst0 <-+
3570 // SU order | | artificial dep
3571 // | Inst1 --+
3572 // v
3573 //
3574 // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
3575     // Inst0 must be greater than or equal to N so that Inst0 cannot be
3576 // scheduled at Stage 0. In such cases, we reject this schedule at this
3577 // time.
3578 // FIXME: The reason for this is the existence of artificial dependencies
3579     // that contradict the original SU order. If ignoring artificial
3580 // dependencies does not affect correctness, then it is better to ignore
3581 // them.
3582 if (FirstCycle + InitiationInterval <= NewCycle)
3583 return false;
3584
3585 NewLastCycle = std::max(NewLastCycle, NewCycle);
3586 }
3587 LastCycle = NewLastCycle;
3588 return true;
3589}
3590
3591// Check if the generated schedule is valid. This function checks if
3592// an instruction that uses a physical register is scheduled in a
3593// different stage than the definition. The pipeliner does not handle
3594// physical register values that may cross a basic block boundary.
3595// Furthermore, if a physical def/use pair is assigned to the same
3596// cycle, orderDependence does not guarantee def/use ordering, so that
3597// case should be considered invalid. (The test checks for both
3598// earlier and same-cycle use to be more robust.)
3599 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3600   for (SUnit &SU : SSD->SUnits) {
3601 if (!SU.hasPhysRegDefs)
3602 continue;
3603 int StageDef = stageScheduled(&SU);
3604 int CycleDef = InstrToCycle[&SU];
3605 assert(StageDef != -1 && "Instruction should have been scheduled.");
3606 for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
3607 SUnit *Dst = OE.getDst();
3608 if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
3609 if (OE.getReg().isPhysical()) {
3610 if (stageScheduled(Dst) != StageDef)
3611 return false;
3612 if (InstrToCycle[Dst] <= CycleDef)
3613 return false;
3614 }
3615 }
3616 }
3617 return true;
3618}
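// Worked example of the rejection rules above (hypothetical schedule):
// suppose SU(4) defines a physical register at stage 0, cycle 2. A reader
// scheduled at stage 1 fails the stage test, and a reader placed at cycle
// 2 or earlier fails the cycle test (InstrToCycle[Dst] <= CycleDef), since
// orderDependence gives no def/use ordering guarantee for physical
// registers within a single cycle.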
3619
3620/// A property of the node order in swing-modulo-scheduling is
3621/// that for nodes outside circuits the following holds:
3622/// none of them is scheduled after both a successor and a
3623/// predecessor.
3624/// The method below checks whether the property is met.
3625/// If not, debug information is printed and statistics information updated.
3626/// Note that we do not use an assert statement.
3627/// The reason is that although an invalid node order may prevent
3628/// the pipeliner from finding a pipelined schedule for arbitrary II,
3629/// it does not lead to the generation of incorrect code.
3630void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3631
3632 // a sorted vector that maps each SUnit to its index in the NodeOrder
3633 typedef std::pair<SUnit *, unsigned> UnitIndex;
3634   std::vector<UnitIndex> Indices;
3635
3636 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3637 Indices.push_back(std::make_pair(NodeOrder[i], i));
3638
3639 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3640 return std::get<0>(i1) < std::get<0>(i2);
3641 };
3642
3643 // sort, so that we can perform a binary search
3644 llvm::sort(Indices, CompareKey);
3645
3646 bool Valid = true;
3647 (void)Valid;
3648 // for each SUnit in the NodeOrder, check whether
3649 // it appears after both a successor and a predecessor
3650 // of the SUnit. If this is the case, and the SUnit
3651   // is not part of a circuit, then the NodeOrder is not
3652 // valid.
3653 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3654 SUnit *SU = NodeOrder[i];
3655 unsigned Index = i;
3656
3657 bool PredBefore = false;
3658 bool SuccBefore = false;
3659
3660 SUnit *Succ;
3661 SUnit *Pred;
3662 (void)Succ;
3663 (void)Pred;
3664
3665 for (const auto &IE : DDG->getInEdges(SU)) {
3666 SUnit *PredSU = IE.getSrc();
3667 unsigned PredIndex = std::get<1>(
3668 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3669 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3670 PredBefore = true;
3671 Pred = PredSU;
3672 break;
3673 }
3674 }
3675
3676 for (const auto &OE : DDG->getOutEdges(SU)) {
3677 SUnit *SuccSU = OE.getDst();
3678       // Do not process a boundary node; it was not included in NodeOrder,
3679       // hence not in Indices either, and the call to std::lower_bound()
3680       // below would return Indices.end().
3681 if (SuccSU->isBoundaryNode())
3682 continue;
3683 unsigned SuccIndex = std::get<1>(
3684 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3685 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3686 SuccBefore = true;
3687 Succ = SuccSU;
3688 break;
3689 }
3690 }
3691
3692 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3693 // instructions in circuits are allowed to be scheduled
3694 // after both a successor and predecessor.
3695 bool InCircuit = llvm::any_of(
3696 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3697 if (InCircuit)
3698 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3699 else {
3700 Valid = false;
3701 NumNodeOrderIssues++;
3702 LLVM_DEBUG(dbgs() << "Predecessor ");
3703 }
3704 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3705 << " are scheduled before node " << SU->NodeNum
3706 << "\n");
3707 }
3708 }
3709
3710 LLVM_DEBUG({
3711 if (!Valid)
3712 dbgs() << "Invalid node order found!\n";
3713 });
3714}
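// Example of an order violation (hypothetical NodeOrder): in the order
// [SU(5), SU(7), SU(9)], if non-Phi node SU(9) has predecessor SU(5) and
// successor SU(7), both appearing before it, then SU(9) follows both a
// predecessor and a successor. Unless SU(9) lies in one of the Circuits,
// the check above fires and NumNodeOrderIssues is incremented.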
3715
3716/// Attempt to fix the degenerate cases when the instruction serialization
3717/// causes the register lifetimes to overlap. For example,
3718/// p' = store_pi(p, b)
3719/// = load p, offset
3720/// In this case p and p' overlap, which means that two registers are needed.
3721/// Instead, this function changes the load to use p' and updates the offset.
3722void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3723 Register OverlapReg;
3724 Register NewBaseReg;
3725 for (SUnit *SU : Instrs) {
3726 MachineInstr *MI = SU->getInstr();
3727 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3728 const MachineOperand &MO = MI->getOperand(i);
3729 // Look for an instruction that uses p. The instruction occurs in the
3730 // same cycle but occurs later in the serialized order.
3731 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3732 // Check that the instruction appears in the InstrChanges structure,
3733 // which contains instructions that can have the offset updated.
3734         DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
3735             InstrChanges.find(SU);
3736 if (It != InstrChanges.end()) {
3737 unsigned BasePos, OffsetPos;
3738 // Update the base register and adjust the offset.
3739 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3740 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3741 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3742 int64_t NewOffset =
3743 MI->getOperand(OffsetPos).getImm() - It->second.second;
3744 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3745 SU->setInstr(NewMI);
3746 MISUnitMap[NewMI] = SU;
3747 NewMIs[MI] = NewMI;
3748 }
3749 }
3750 OverlapReg = Register();
3751 NewBaseReg = Register();
3752 break;
3753 }
3754 // Look for an instruction of the form p' = op(p), which uses and defines
3755 // two virtual registers that get allocated to the same physical register.
3756 unsigned TiedUseIdx = 0;
3757 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3758 // OverlapReg is p in the example above.
3759 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3760 // NewBaseReg is p' in the example above.
3761 NewBaseReg = MI->getOperand(i).getReg();
3762 break;
3763 }
3764 }
3765 }
3766}
3767
3768std::deque<SUnit *>
3769 SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
3770                                 const std::deque<SUnit *> &Instrs) const {
3771 std::deque<SUnit *> NewOrderPhi;
3772 for (SUnit *SU : Instrs) {
3773 if (SU->getInstr()->isPHI())
3774 NewOrderPhi.push_back(SU);
3775 }
3776 std::deque<SUnit *> NewOrderI;
3777 for (SUnit *SU : Instrs) {
3778 if (!SU->getInstr()->isPHI())
3779 orderDependence(SSD, SU, NewOrderI);
3780 }
3781 llvm::append_range(NewOrderPhi, NewOrderI);
3782 return NewOrderPhi;
3783}
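// Sketch of the resulting order for one cycle (hypothetical contents):
// given [add, phi1, store, phi2], the Phis are emitted first in their
// original order, and the remaining instructions are placed by
// orderDependence so definitions precede uses, e.g. [phi1, phi2, add,
// store].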
3784
3785/// After the schedule has been formed, call this function to combine
3786/// the instructions from the different stages/cycles. That is, this
3787/// function creates a schedule that represents a single iteration.
3788 void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3789   // Move all instructions to the first stage from later stages.
3790 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3791 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3792 ++stage) {
3793 std::deque<SUnit *> &cycleInstrs =
3794 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3795 for (SUnit *SU : llvm::reverse(cycleInstrs))
3796 ScheduledInstrs[cycle].push_front(SU);
3797 }
3798 }
3799
3800 // Erase all the elements in the later stages. Only one iteration should
3801 // remain in the scheduled list, and it contains all the instructions.
3802 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3803 ScheduledInstrs.erase(cycle);
3804
3805 // Change the registers in instruction as specified in the InstrChanges
3806 // map. We need to use the new registers to create the correct order.
3807 for (const SUnit &SU : SSD->SUnits)
3808 SSD->applyInstrChange(SU.getInstr(), *this);
3809
3810 // Reorder the instructions in each cycle to fix and improve the
3811 // generated code.
3812 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3813 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3814 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3815 SSD->fixupRegisterOverlaps(cycleInstrs);
3816 }
3817
3818   LLVM_DEBUG(dump());
3819}
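// Numeric sketch of the stage collapsing performed above (hypothetical
// schedule): with II = 2, first cycle 0, final cycle 1, and a maximum
// stage count of 1, an instruction at flat cycle 3 = 1 + (1 * II) belongs
// to cycle 1, stage 1, and is prepended to cycle 1's list; the entries
// past the final cycle are then erased so exactly one iteration remains.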
3820
3822 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3823 << " depth " << MaxDepth << " col " << Colocate << "\n";
3824 for (const auto &I : Nodes)
3825 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3826 os << "\n";
3827}
3828
3829#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3830/// Print the schedule information to the given output.
3831 void SMSchedule::print(raw_ostream &os) const {
3832   // Iterate over each cycle.
3833 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3834 // Iterate over each instruction in the cycle.
3835 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3836 for (SUnit *CI : cycleInstrs->second) {
3837 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3838 os << "(" << CI->NodeNum << ") ";
3839 CI->getInstr()->print(os);
3840 os << "\n";
3841 }
3842 }
3843}
3844
3845/// Utility function used for debugging to print the schedule.
3846 LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3847 LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3848
3849void ResourceManager::dumpMRT() const {
3850 LLVM_DEBUG({
3851 if (UseDFA)
3852 return;
3853 std::stringstream SS;
3854 SS << "MRT:\n";
3855 SS << std::setw(4) << "Slot";
3856 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3857 SS << std::setw(3) << I;
3858 SS << std::setw(7) << "#Mops"
3859 << "\n";
3860 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3861 SS << std::setw(4) << Slot;
3862 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3863 SS << std::setw(3) << MRT[Slot][I];
3864 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3865 }
3866 dbgs() << SS.str();
3867 });
3868}
3869#endif
3870
3871 void ResourceManager::initProcResourceVectors(
3872     const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3873 unsigned ProcResourceID = 0;
3874
3875   // We currently limit the number of resource kinds to fewer than 64 so
3876   // that we can use uint64_t for Masks.
3877 assert(SM.getNumProcResourceKinds() < 64 &&
3878 "Too many kinds of resources, unsupported");
3879 // Create a unique bitmask for every processor resource unit.
3880 // Skip resource at index 0, since it always references 'InvalidUnit'.
3881 Masks.resize(SM.getNumProcResourceKinds());
3882 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3883 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3884 if (Desc.SubUnitsIdxBegin)
3885 continue;
3886 Masks[I] = 1ULL << ProcResourceID;
3887 ProcResourceID++;
3888 }
3889 // Create a unique bitmask for every processor resource group.
3890 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3891 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3892 if (!Desc.SubUnitsIdxBegin)
3893 continue;
3894 Masks[I] = 1ULL << ProcResourceID;
3895 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3896 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3897 ProcResourceID++;
3898 }
3899 LLVM_DEBUG({
3900 if (SwpShowResMask) {
3901 dbgs() << "ProcResourceDesc:\n";
3902 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3903 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3904 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3905 ProcResource->Name, I, Masks[I],
3906 ProcResource->NumUnits);
3907 }
3908 dbgs() << " -----------------\n";
3909 }
3910 });
3911}
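// Mask layout sketch (hypothetical machine model): for resource indices
// 1..3 where 1 and 2 are units and 3 is a group containing both, the first
// loop yields Masks[1] = 0b001 and Masks[2] = 0b010, and the second yields
// Masks[3] = 0b100 | Masks[1] | Masks[2] = 0b111, so a group's mask
// overlaps every subunit it can issue on.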
3912
3913 bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
3914   LLVM_DEBUG({
3915 if (SwpDebugResource)
3916 dbgs() << "canReserveResources:\n";
3917 });
3918 if (UseDFA)
3919 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3920 ->canReserveResources(&SU.getInstr()->getDesc());
3921
3922 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3923 if (!SCDesc->isValid()) {
3924 LLVM_DEBUG({
3925 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3926 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3927 });
3928 return true;
3929 }
3930
3931 reserveResources(SCDesc, Cycle);
3932 bool Result = !isOverbooked();
3933 unreserveResources(SCDesc, Cycle);
3934
3935 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3936 return Result;
3937}
3938
3939void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3940 LLVM_DEBUG({
3941 if (SwpDebugResource)
3942 dbgs() << "reserveResources:\n";
3943 });
3944 if (UseDFA)
3945 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3946 ->reserveResources(&SU.getInstr()->getDesc());
3947
3948 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3949 if (!SCDesc->isValid()) {
3950 LLVM_DEBUG({
3951 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3952 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3953 });
3954 return;
3955 }
3956
3957 reserveResources(SCDesc, Cycle);
3958
3959 LLVM_DEBUG({
3960 if (SwpDebugResource) {
3961 dumpMRT();
3962 dbgs() << "reserveResources: done!\n\n";
3963 }
3964 });
3965}
3966
3967void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3968 int Cycle) {
3969 assert(!UseDFA);
3970 for (const MCWriteProcResEntry &PRE : make_range(
3971 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3972 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3973 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3974
3975 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3976 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3977}
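// Modulo-table sketch (hypothetical values): with II = 3, reserving a
// resource held for ReleaseAtCycle = 2 starting at Cycle = 4 increments
// MRT slots positiveModulo(4, 3) = 1 and positiveModulo(5, 3) = 2, so
// usage wraps around the kernel instead of spilling past it.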
3978
3979void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3980 int Cycle) {
3981 assert(!UseDFA);
3982 for (const MCWriteProcResEntry &PRE : make_range(
3983 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3984 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3985 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3986
3987 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3988 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3989}
3990
3991bool ResourceManager::isOverbooked() const {
3992 assert(!UseDFA);
3993 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3994 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3995 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3996 if (MRT[Slot][I] > Desc->NumUnits)
3997 return true;
3998 }
3999 if (NumScheduledMops[Slot] > IssueWidth)
4000 return true;
4001 }
4002 return false;
4003}
4004
4005int ResourceManager::calculateResMIIDFA() const {
4006 assert(UseDFA);
4007
4008 // Sort the instructions by the number of available choices for scheduling,
4009 // least to most. Use the number of critical resources as the tie breaker.
4010 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4011 for (SUnit &SU : DAG->SUnits)
4012 FUS.calcCriticalResources(*SU.getInstr());
4013 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4014 FuncUnitOrder(FUS);
4015
4016 for (SUnit &SU : DAG->SUnits)
4017 FuncUnitOrder.push(SU.getInstr());
4018
4019   SmallVector<std::unique_ptr<DFAPacketizer>> Resources;
4020   Resources.push_back(
4021 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4022
4023 while (!FuncUnitOrder.empty()) {
4024 MachineInstr *MI = FuncUnitOrder.top();
4025 FuncUnitOrder.pop();
4026 if (TII->isZeroCost(MI->getOpcode()))
4027 continue;
4028
4029 // Attempt to reserve the instruction in an existing DFA. At least one
4030 // DFA is needed for each cycle.
4031 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4032 unsigned ReservedCycles = 0;
4033 auto *RI = Resources.begin();
4034 auto *RE = Resources.end();
4035 LLVM_DEBUG({
4036 dbgs() << "Trying to reserve resource for " << NumCycles
4037 << " cycles for \n";
4038 MI->dump();
4039 });
4040 for (unsigned C = 0; C < NumCycles; ++C)
4041 while (RI != RE) {
4042 if ((*RI)->canReserveResources(*MI)) {
4043 (*RI)->reserveResources(*MI);
4044 ++ReservedCycles;
4045 break;
4046 }
4047 RI++;
4048 }
4049 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4050 << ", NumCycles:" << NumCycles << "\n");
4051 // Add new DFAs, if needed, to reserve resources.
4052 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4054 << "NewResource created to reserve resources"
4055 << "\n");
4056 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4057 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4058 NewResource->reserveResources(*MI);
4059 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4060 }
4061 }
4062
4063 int Resmii = Resources.size();
4064 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4065 return Resmii;
4066}
4067
4068 int ResourceManager::calculateResMII() const {
4069   if (UseDFA)
4070 return calculateResMIIDFA();
4071
4072 // Count each resource consumption and divide it by the number of units.
4073 // ResMII is the max value among them.
4074
4075 int NumMops = 0;
4076 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4077 for (SUnit &SU : DAG->SUnits) {
4078 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4079 continue;
4080
4081 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4082 if (!SCDesc->isValid())
4083 continue;
4084
4085 LLVM_DEBUG({
4086 if (SwpDebugResource) {
4087 DAG->dumpNode(SU);
4088 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4089 << " WriteProcRes: ";
4090 }
4091 });
4092 NumMops += SCDesc->NumMicroOps;
4093 for (const MCWriteProcResEntry &PRE :
4094 make_range(STI->getWriteProcResBegin(SCDesc),
4095 STI->getWriteProcResEnd(SCDesc))) {
4096 LLVM_DEBUG({
4097 if (SwpDebugResource) {
4098 const MCProcResourceDesc *Desc =
4099 SM.getProcResource(PRE.ProcResourceIdx);
4100 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4101 }
4102 });
4103 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4104 }
4105 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4106 }
4107
4108 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4109 LLVM_DEBUG({
4110 if (SwpDebugResource)
4111 dbgs() << "#Mops: " << NumMops << ", "
4112 << "IssueWidth: " << IssueWidth << ", "
4113 << "Cycles: " << Result << "\n";
4114 });
4115
4116 LLVM_DEBUG({
4117 if (SwpDebugResource) {
4118 std::stringstream SS;
4119 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4120 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4121 << "\n";
4122 dbgs() << SS.str();
4123 }
4124 });
4125 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4126 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4127 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4128 LLVM_DEBUG({
4129 if (SwpDebugResource) {
4130 std::stringstream SS;
4131 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4132 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4133 << std::setw(10) << Cycles << "\n";
4134 dbgs() << SS.str();
4135 }
4136 });
4137 if (Cycles > Result)
4138 Result = Cycles;
4139 }
4140 return Result;
4141}
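// Numeric sketch of the bounds above (hypothetical counts): with
// NumMops = 7 and IssueWidth = 2, the issue bound is (7 + 2 - 1) / 2 = 4
// cycles; a resource consumed for 9 cycles on NumUnits = 2 needs
// (9 + 2 - 1) / 2 = 5 cycles, so the returned ResMII is 5, the maximum
// over all per-resource bounds.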
4142
4143 void ResourceManager::init(int II) {
4144   InitiationInterval = II;
4145 DFAResources.clear();
4146 DFAResources.resize(II);
4147 for (auto &I : DFAResources)
4148 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4149 MRT.clear();
4150 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4151 NumScheduledMops.clear();
4152 NumScheduledMops.resize(II);
4153}
4154
4155bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4156 if (Pred.isArtificial() || Dst->isBoundaryNode())
4157 return true;
4158   // Currently, a dependence that is an anti-dependence but not loop-carried
4159   // is also ignored. This behavior is preserved to prevent regression.
4160 // FIXME: Remove if this doesn't have significant impact on performance
4161 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4162}
4163
4164SwingSchedulerDDG::SwingSchedulerDDGEdges &
4165SwingSchedulerDDG::getEdges(const SUnit *SU) {
4166 if (SU == EntrySU)
4167 return EntrySUEdges;
4168 if (SU == ExitSU)
4169 return ExitSUEdges;
4170 return EdgesVec[SU->NodeNum];
4171}
4172
4173const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4174SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4175 if (SU == EntrySU)
4176 return EntrySUEdges;
4177 if (SU == ExitSU)
4178 return ExitSUEdges;
4179 return EdgesVec[SU->NodeNum];
4180}
4181
4182void SwingSchedulerDDG::addEdge(const SUnit *SU,
4183 const SwingSchedulerDDGEdge &Edge) {
4184 assert(!Edge.isValidationOnly() &&
4185 "Validation-only edges are not expected here.");
4186 auto &Edges = getEdges(SU);
4187 if (Edge.getSrc() == SU)
4188 Edges.Succs.push_back(Edge);
4189 else
4190 Edges.Preds.push_back(Edge);
4191}
4192
4193void SwingSchedulerDDG::initEdges(SUnit *SU) {
4194 for (const auto &PI : SU->Preds) {
4195 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4196 /*IsValidationOnly=*/false);
4197 addEdge(SU, Edge);
4198 }
4199
4200 for (const auto &SI : SU->Succs) {
4201 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4202 /*IsValidationOnly=*/false);
4203 addEdge(SU, Edge);
4204 }
4205}
4206
4207SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4208 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4209 : EntrySU(EntrySU), ExitSU(ExitSU) {
4210 EdgesVec.resize(SUnits.size());
4211
4212 // Add non-loop-carried edges based on the DAG.
4213 initEdges(EntrySU);
4214 initEdges(ExitSU);
4215 for (auto &SU : SUnits)
4216 initEdges(&SU);
4217
4218 // Add loop-carried edges, which are not represented in the DAG.
4219 for (SUnit &SU : SUnits) {
4220 SUnit *Src = &SU;
4221 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4222 SDep Base(Src, SDep::Barrier);
4223 Base.setLatency(1);
4224 for (SUnit *Dst : *OD) {
4225 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4226 /*IsValidationOnly=*/true);
4227 Edge.setDistance(1);
4228 ValidationOnlyEdges.push_back(Edge);
4229 }
4230 }
4231 }
4232}
4233
4234const SwingSchedulerDDG::EdgesType &
4235 SwingSchedulerDDG::getInEdges(const SUnit *SU) const {
4236   return getEdges(SU).Preds;
4237}
4238
4239const SwingSchedulerDDG::EdgesType &
4240 SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
4241   return getEdges(SU).Succs;
4242}
4243
4244/// Check if \p Schedule doesn't violate the validation-only dependencies.
4245 bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
4246   unsigned II = Schedule.getInitiationInterval();
4247
4248 auto ExpandCycle = [&](SUnit *SU) {
4249 int Stage = Schedule.stageScheduled(SU);
4250 int Cycle = Schedule.cycleScheduled(SU);
4251 return Cycle + (Stage * II);
4252 };
4253
4254 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4255 SUnit *Src = Edge.getSrc();
4256 SUnit *Dst = Edge.getDst();
4257 if (!Src->isInstr() || !Dst->isInstr())
4258 continue;
4259 int CycleSrc = ExpandCycle(Src);
4260 int CycleDst = ExpandCycle(Dst);
4261 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4262 if (CycleSrc > MaxLateStart) {
4263 LLVM_DEBUG({
4264 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4265 << Dst->NodeNum << "\n";
4266 });
4267 return false;
4268 }
4269 }
4270 return true;
4271}
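// Worked check (hypothetical edge): with II = 4, a validation-only edge of
// distance 1 and latency 1 whose destination expands to cycle 2 gives
// MaxLateStart = 2 + 1 * 4 - 1 = 5; a source expanding to cycle 6 starts
// too late, so the schedule is rejected.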
4272
4273void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4274 const TargetInstrInfo *TII) {
4275 for (SUnit &SU : SUnits) {
4276 SUnit *Src = &SU;
4277 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4278 SDep Dep(Src, SDep::Barrier);
4279 Dep.setLatency(1);
4280 for (SUnit *Dst : *OrderDep) {
4281 SUnit *From = Src;
4282 SUnit *To = Dst;
4283 if (From->NodeNum > To->NodeNum)
4284 std::swap(From, To);
4285
4286 // Add a forward edge if the following conditions are met:
4287 //
4288 // - The instruction of the source node (FromMI) may read memory.
4289 // - The instruction of the target node (ToMI) may modify memory, but
4290 // does not read it.
4291 // - Neither instruction is a global barrier.
4292 // - The load appears before the store in the original basic block.
4293 // - There are no barrier or store instructions between the two nodes.
4294 // - The target node is unreachable from the source node in the current
4295 // DAG.
4296 //
4297 // TODO: These conditions are inherited from a previous implementation,
4298 // and some may no longer be necessary. For now, we conservatively
4299 // retain all of them to avoid regressions, but the logic could
4300         //       potentially be simplified.
4301 MachineInstr *FromMI = From->getInstr();
4302 MachineInstr *ToMI = To->getInstr();
4303 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4304 !TII->isGlobalMemoryObject(FromMI) &&
4305 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4306 SDep Pred = Dep;
4307 Pred.setSUnit(From);
4308 To->addPred(Pred);
4309 }
4310 }
4311 }
4312 }
4313}
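// Example of the edge added above (hypothetical nodes): if load SU(2)
// precedes store SU(6) in the block, neither is a global barrier, and the
// store is not already reachable from the load, a latency-1 barrier edge
// SU(2) -> SU(6) is added so the store cannot be scheduled ahead of the
// load it may conflict with in a later iteration.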
4314
4315 void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
4316                             const MachineRegisterInfo *MRI) const {
4317 const auto *Order = getOrderDepOrNull(SU);
4318
4319 if (!Order)
4320 return;
4321
4322 const auto DumpSU = [](const SUnit *SU) {
4323 std::ostringstream OSS;
4324 OSS << "SU(" << SU->NodeNum << ")";
4325 return OSS.str();
4326 };
4327
4328 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4329 << " Order\n";
4330 for (SUnit *Dst : *Order)
4331 dbgs() << " " << DumpSU(Dst) << "\n";
4332}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:273
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD, bool PerformCheapCheck)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
bool erase(const KeyT &Val)
Definition DenseMap.h:311
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:618
MachineInstrBundleIterator< const MachineInstr > const_iterator
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:102
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:59
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
void insert_range(Range &&R)
Definition SetVector.h:175
typename vector_type::const_iterator iterator
Definition SetVector.h:71
void clear()
Completely clear the SetVector.
Definition SetVector.h:266
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:261
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:251
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
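The SmallPtrSet members above support the common visited-set idiom; a minimal illustrative helper:

#include "llvm/ADT/SmallPtrSet.h"

// Returns true only the first time P is seen; insert().second reports
// whether the pointer was newly added.
static bool markVisited(llvm::SmallPtrSet<const void *, 8> &Visited,
                        const void *P) {
  return Visited.insert(P).second;
}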
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition SmallSet.h:133
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the instructions using software pipelining.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes to overlap.
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is potentially loop carried.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing Modulo Scheduling algorithm.
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the region accessed by OtherMI in subsequent iterations.
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instructions.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG edges that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-carried dependencies.
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
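A hedged sketch of the edge-retrieval API above. It assumes, as the getter signatures suggest, that EdgesType is an iterable range of SwingSchedulerDDGEdge; the counting helper is illustrative only.

#include "llvm/CodeGen/MachinePipeliner.h"
using namespace llvm;

// Count the incoming edges of SU that survive the anti-dependence filter.
static unsigned countRealInEdges(const SwingSchedulerDDG &DDG,
                                 const SUnit *SU) {
  unsigned Count = 0;
  for (const SwingSchedulerDDGEdge &E : DDG.getInEdges(SU))
    if (!E.ignoreDependence(/*IgnoreAnti=*/true))
      ++Count;
  return Count;
}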
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management; return false when the target will use ProcResource in InstrSchedModel instead.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
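The subtarget hooks above gate whether and how the pipeliner runs. A brief sketch; the helper name is hypothetical:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;

static bool pipelinerEnabled(const MachineFunction &MF) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  // useDFAforSMS() selects DFA-based resource checking; otherwise resource
  // use is modeled with ProcResource counts from InstrSchedModel.
  bool UsesDFA = ST.useDFAforSMS();
  (void)UsesDFA;
  return ST.enableMachinePipeliner();
}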
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
initializer< Ty > init(const Ty &Val)
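A minimal sketch of the cl::init and cl::desc helpers above defining an option; the flag name "example-swp-verbose" is purely illustrative, not a real pipeliner option.

#include "llvm/Support/CommandLine.h"

// Hypothetical flag, shown only to illustrate the helpers' usage.
static llvm::cl::opt<bool>
    ExampleSwpVerbose("example-swp-verbose", llvm::cl::Hidden,
                      llvm::cl::init(false),
                      llvm::cl::desc("Illustrative boolean flag."));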
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
constexpr double e
Definition MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
void stable_sort(R &&Range)
Definition STLExtras.h:2060
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is a subset of B.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2138
Op::Description Desc
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
Definition STLExtras.h:2130
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1734
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
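Several of the STLExtras range helpers indexed here compose naturally; a small runnable sketch (the function itself is illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

static bool demoRangeHelpers() {
  llvm::SmallVector<int, 8> V = {4, 1, 3};
  llvm::sort(V); // V is now 1 3 4
  bool HasOdd = llvm::any_of(V, [](int X) { return X & 1; });
  llvm::SmallVector<int, 8> R;
  llvm::append_range(R, llvm::reverse(V)); // R is 4 3 1
  return HasOdd && llvm::is_contained(R, 3); // true
}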
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1624
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
unsigned MCRegUnit
Register units are used to compute register aliasing.
Definition MCRegister.h:30
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:118
@ Other
Any other memory.
Definition ModRef.h:68
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1996
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
cl::opt< bool > SwpEnableCopyToPhi
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1899
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions.
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
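llvm::popcount is constexpr, so a set-bit count can be checked at compile time:

#include "llvm/ADT/bit.h"

// 0b1011 has three set bits.
static_assert(llvm::popcount(0b1011u) == 3, "three bits set");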
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the instruction.
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition Metadata.h:761
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
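A hedged sketch of the MCSchedModel queries above; by LLVM convention, processor resource index 0 is reserved, so the loop starts at 1.

#include "llvm/MC/MCSchedule.h"
using namespace llvm;

// Sum the unit counts of every processor resource kind in the model.
static unsigned totalProcResourceUnits(const MCSchedModel &SchedModel) {
  if (!SchedModel.hasInstrSchedModel())
    return 0;
  unsigned Units = 0;
  for (unsigned I = 1, E = SchedModel.getNumProcResourceKinds(); I < E; ++I)
    Units += SchedModel.getProcResource(I)->NumUnits;
  return Units;
}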
Identify one of the processor resource kinds consumed by a particular scheduling class for the specified number of cycles.
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instantiate a scheduler.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.