//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and to check
// where an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
//===----------------------------------------------------------------------===//
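
// For illustration only (a hand-written sketch, not code produced by this
// pass): with initiation interval II = 1 and a 3-stage schedule of statements
// A, B, and C from each iteration, the transformed loop is shaped like
// \code
//   A(0);                            // prolog
//   A(1); B(0);                      // prolog
//   for (i = 0; i + 2 < n; ++i)
//     { A(i + 2); B(i + 1); C(i); }  // kernel: 3 iterations in flight
//   B(n - 1); C(n - 2);              // epilog
//   C(n - 1);                        // epilog
// \endcode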

#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <deque>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "pipeliner"

STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");

/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument to limit the maximum initiation interval (MII)
/// considered for pipelining.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."), cl::Hidden,
                              cl::init(27));

/// A command line argument to force the pipeliner to use the specified
/// initiation interval.
static cl::opt<int> SwpForceII("pipeliner-force-ii",
                               cl::desc("Force pipeliner to use specified II."),
                               cl::Hidden, cl::init(-1));

/// A command line argument to limit the number of stages in the pipeline.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated schedule."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

#ifndef NDEBUG
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
#endif

static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                     cl::ReallyHidden,
                                     cl::desc("Ignore RecMII"));

static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
                                    cl::init(false));
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
                                      cl::init(false));

static cl::opt<bool> EmitTestAnnotations(
    "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
    cl::desc("Instead of emitting the pipelined code, annotate instructions "
             "with the generated schedule for feeding into the "
             "-modulo-schedule-test pass"));

static cl::opt<bool> ExperimentalCodeGen(
    "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
    cl::desc(
        "Use the experimental peeling code generator for software pipelining"));

static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
                                     cl::desc("Range to search for II"),
                                     cl::Hidden, cl::init(10));

static cl::opt<bool>
    LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
                     cl::desc("Limit register pressure of scheduled loop"));

static cl::opt<int>
    RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
                      cl::init(5),
                      cl::desc("Margin representing the unused percentage of "
                               "the register pressure limit"));

static cl::opt<bool>
    MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
               cl::desc("Use the MVE code generator for software pipelining"));

/// A command line argument to limit the number of store instructions in the
/// target basic block.
static cl::opt<unsigned> SwpMaxNumStores(
    "pipeliner-max-num-stores",
    cl::desc("Maximum number of stores allowed in the target loop."),
    cl::Hidden, cl::init(200));

// A command line option to enable the CopyToPhi DAG mutation.
cl::opt<bool>
    llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
                             cl::init(true),
                             cl::desc("Enable CopyToPhi DAG Mutation"));

/// A command line argument to force the pipeliner to use the specified issue
/// width.
cl::opt<int> llvm::SwpForceIssueWidth(
    "pipeliner-force-issue-width",
    cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
    cl::init(-1));

/// A command line argument to set the window scheduling option.
static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
    "window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On),
    cl::desc("Set how to use window scheduling algorithm."),
    cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
                          "Turn off window algorithm."),
               clEnumValN(WindowSchedulingFlag::WS_On, "on",
                          "Use window algorithm after SMS algorithm fails."),
               clEnumValN(WindowSchedulingFlag::WS_Force, "force",
                          "Use window algorithm instead of SMS algorithm.")));

unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
int MachinePipeliner::NumTries = 0;
#endif
char &llvm::MachinePipelinerID = MachinePipeliner::ID;

INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
                      "Modulo Software Pipelining", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                    "Modulo Software Pipelining", false, false)

namespace {

/// This class holds an SUnit corresponding to a memory operation and other
/// information related to the instruction.
struct SUnitWithMemInfo {
  SUnit *SU;
  SmallVector<const Value *, 2> UnderlyingObjs;

  /// The value of a memory operand.
  const Value *MemOpValue = nullptr;

  /// The offset of a memory operand.
  int64_t MemOpOffset = 0;

  AAMDNodes AATags;

  /// True if all the underlying objects are identified.
  bool IsAllIdentified = false;

  SUnitWithMemInfo(SUnit *SU);

  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;

  bool isUnknown() const { return MemOpValue == nullptr; }

private:
  bool getUnderlyingObjects();
};

/// Add loop-carried chain dependencies. This class handles the same type of
/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
/// account dependencies across iterations.
class LoopCarriedOrderDepsTracker {
  // Type of instruction that is relevant to order-dependencies
  enum class InstrTag {
    Barrier = 0,      ///< A barrier event instruction.
    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
                      ///< not a barrier event.
    FPExceptions = 2, ///< An instruction that does not match above, but may
                      ///< raise floating-point exceptions.
  };

  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
    TaggedSUnit(SUnit *SU, InstrTag Tag)
        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}

    InstrTag getTag() const { return InstrTag(getInt()); }
  };

  /// Holds loads and stores with memory related information.
  struct LoadStoreChunk {
    SmallVector<SUnitWithMemInfo, 4> Loads;
    SmallVector<SUnitWithMemInfo, 4> Stores;

    void append(SUnit *SU);
  };

  SwingSchedulerDAG *DAG;
  BatchAAResults *BAA;
  std::vector<SUnit> &SUnits;

  /// The size of SUnits, for convenience.
  const unsigned N;

  /// Loop-carried Edges.
  std::vector<BitVector> LoopCarried;

  /// Instructions related to chain dependencies. They are one of the
  /// following:
  ///
  ///  1. Barrier event.
  ///  2. Load, but neither a barrier event, invariant load, nor may load trap
  ///     value.
  ///  3. Store, but not a barrier event.
  ///  4. None of them, but may raise floating-point exceptions.
  ///
  /// This is used when analyzing loop-carried dependencies that access global
  /// barrier instructions.
  std::vector<TaggedSUnit> TaggedSUnits;

  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

public:
  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA,
                              const TargetInstrInfo *TII,
                              const TargetRegisterInfo *TRI);

  /// The main function to compute loop-carried order-dependencies.
  void computeDependencies();

  const BitVector &getLoopCarried(unsigned Idx) const {
    return LoopCarried[Idx];
  }

private:
  /// Tags \p SU if the instruction may affect the order-dependencies.
  std::optional<InstrTag> getInstrTag(SUnit *SU) const;

  void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
                                          const LoadStoreChunk &To);

  /// Add a loop-carried order dependency between \p Src and \p Dst if we
  /// cannot prove they are independent. When \p PerformCheapCheck is true, a
  /// lightweight dependency test (referred to as "cheap check" below) is
  /// performed at first. Note that the cheap check is retained to maintain the
  /// existing behavior and not expected to be used anymore.
  ///
  /// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
  void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
                                 const SUnitWithMemInfo &Dst,
                                 bool PerformCheapCheck = false);

  void computeDependenciesAux();
};

} // end anonymous namespace

/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
  if (skipFunction(mf.getFunction()))
    return false;

  if (!EnableSWP)
    return false;

  if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
      !EnableSWPOptSize.getPosition())
    return false;

  if (!mf.getSubtarget().enableMachinePipeliner())
    return false;

  // Cannot pipeline loops without instruction itineraries if we are using
  // DFA for the pipeliner.
  if (mf.getSubtarget().useDFAforSMS() &&
      (!mf.getSubtarget().getInstrItineraryData() ||
       mf.getSubtarget().getInstrItineraryData()->isEmpty()))
    return false;

  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
  TII = MF->getSubtarget().getInstrInfo();
  RegClassInfo.runOnMachineFunction(*MF);

  for (const auto &L : *MLI)
    scheduleLoop(*L);

  return false;
}

/// Attempt to perform the SMS algorithm on the specified loop. This function
/// is the main entry point for the algorithm. The function identifies
/// candidate loops, calculates the minimum initiation interval, and attempts
/// to schedule the loop.
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
  bool Changed = false;
  for (const auto &InnerLoop : L)
    Changed |= scheduleLoop(*InnerLoop);

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = SwpLoopLimit;
  if (Limit >= 0) {
    if (NumTries >= SwpLoopLimit)
      return Changed;
    NumTries++;
  }
#endif

  setPragmaPipelineOptions(L);
  if (!canPipelineLoop(L)) {
    LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
    ORE->emit([&]() {
      return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
                                             L.getStartLoc(), L.getHeader())
             << "Failed to pipeline loop";
    });

    LI.LoopPipelinerInfo.reset();
    return Changed;
  }

  ++NumTrytoPipeline;
  if (useSwingModuloScheduler())
    Changed = swingModuloScheduler(L);

  if (useWindowScheduler(Changed))
    Changed = runWindowScheduler(L);

  LI.LoopPipelinerInfo.reset();
  return Changed;
}

void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    MDNode *MD = dyn_cast<MDNode>(MDO);

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two "
             "operands.");
      II_setByPragma =
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 &&
             "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}
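
// For reference, a sketch of the loop metadata this function inspects
// (standard !llvm.loop metadata; the value numbers are made up):
// \code
//   br i1 %exitcond, label %exit, label %for.body, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 4}
// \endcode
// With !1 present, II_setByPragma becomes 4. A node of the form
// !{!"llvm.loop.pipeline.disable", i1 true} sets disabledByPragma instead.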

/// Depth-first search to detect cycles among PHI dependencies.
/// Returns true if a cycle is detected within the PHI-only subgraph.
static bool hasPHICycleDFS(
    unsigned Reg, const DenseMap<unsigned, SmallVector<unsigned, 2>> &PhiDeps,
    SmallSet<unsigned, 8> &Visited, SmallSet<unsigned, 8> &RecStack) {

  // If Reg is not a PHI-def it cannot contribute to a PHI cycle.
  auto It = PhiDeps.find(Reg);
  if (It == PhiDeps.end())
    return false;

  if (RecStack.count(Reg))
    return true; // backedge.
  if (Visited.count(Reg))
    return false;

  Visited.insert(Reg);
  RecStack.insert(Reg);

  for (unsigned Dep : It->second) {
    if (hasPHICycleDFS(Dep, PhiDeps, Visited, RecStack))
      return true;
  }

  RecStack.erase(Reg);
  return false;
}

static bool hasPHICycle(const MachineBasicBlock *LoopHeader,
                        const MachineRegisterInfo &MRI) {
  DenseMap<unsigned, SmallVector<unsigned, 2>> PhiDeps;

  // Collect PHI nodes and their dependencies.
  for (const MachineInstr &MI : LoopHeader->phis()) {
    unsigned DefReg = MI.getOperand(0).getReg();
    auto Ins = PhiDeps.try_emplace(DefReg).first;

    // PHI operands are (Reg, MBB) pairs starting at index 1.
    for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
      Ins->second.push_back(MI.getOperand(I).getReg());
  }

  // DFS to detect cycles among PHI nodes.
  SmallSet<unsigned, 8> Visited, RecStack;

  // Start DFS from each PHI-def.
  for (const auto &KV : PhiDeps) {
    unsigned Reg = KV.first;
    if (hasPHICycleDFS(Reg, PhiDeps, Visited, RecStack))
      return true;
  }

  return false;
}
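
// A minimal example of what the DFS above rejects (hypothetical virtual
// registers in the loop header): %a and %b each take the other's value from
// the previous iteration, so the PHI-only subgraph contains the cycle
// %a -> %b -> %a.
// \code
//   %a = PHI %x, %preheader, %b, %header
//   %b = PHI %y, %preheader, %a, %header
// \endcode
// The common self-recurrence "%i = PHI %init, %preheader, %i.next, %header"
// is not rejected: %i.next is not a PHI-def, so the walk stops there.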

/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  if (L.getNumBlocks() != 1) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Not a single basic block: "
             << ore::NV("NumBlocks", L.getNumBlocks());
    });
    return false;
  }

  if (hasPHICycle(L.getHeader(), MF->getRegInfo())) {
    LLVM_DEBUG(dbgs() << "Cannot pipeline loop due to PHI cycle\n");
    return false;
  }

  if (disabledByPragma) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Disabled by Pragma.";
    });
    return false;
  }

  // Check if the branch can't be understood because we can't do pipelining
  // if that's the case.
  LI.TBB = nullptr;
  LI.FBB = nullptr;
  LI.BrCond.clear();
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
    NumFailBranch++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The branch can't be understood";
    });
    return false;
  }

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
  if (!LI.LoopPipelinerInfo) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
    NumFailLoop++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The loop structure is not supported";
    });
    return false;
  }

  if (!L.getLoopPreheader()) {
    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
    NumFailPreheader++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "No loop preheader found";
    });
    return false;
  }

  unsigned NumStores = 0;
  for (MachineInstr &MI : *L.getHeader())
    if (MI.mayStore())
      ++NumStores;
  if (NumStores > SwpMaxNumStores) {
    LLVM_DEBUG(dbgs() << "Too many stores\n");
    NumFailTooManyStores++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Too many store instructions in the loop: "
             << ore::NV("NumStores", NumStores) << " > "
             << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
    });
    return false;
  }

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
  return true;
}

void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots =
      *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i + 1).getMBB();
      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                      .addReg(RegOp.getReg(), getRegState(RegOp),
                              RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}

/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to schedule the loop.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  SwingSchedulerDAG SMS(
      *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
                                   E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}

void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  AU.addRequired<LiveIntervalsWrapperPass>();
  AU.addRequired<MachineOptimizationRemarkEmitterPass>();
  AU.addRequired<TargetPassConfig>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
  MachineSchedContext Context;
  Context.MF = MF;
  Context.MLI = MLI;
  Context.MDT = MDT;
  Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  Context.RegClassInfo->runOnMachineFunction(*MF);
  WindowScheduler WS(&Context, L);
  return WS.run();
}

bool MachinePipeliner::useSwingModuloScheduler() {
  // SwingModuloScheduler does not work when WindowScheduler is forced.
  return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
}

bool MachinePipeliner::useWindowScheduler(bool Changed) {
  // WindowScheduler does not work in the following cases:
  // 1. when it is off.
  // 2. when SwingModuloScheduler has already succeeded.
  // 3. when pragma II is enabled.
  if (II_setByPragma) {
    LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
                         "llvm.loop.pipeline.initiationinterval is set.\n");
    return false;
  }

  return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
         (WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
}

void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
  if (SwpForceII > 0)
    MII = SwpForceII;
  else if (II_setByPragma > 0)
    MII = II_setByPragma;
  else
    MII = std::max(ResMII, RecMII);
}

void SwingSchedulerDAG::setMAX_II() {
  if (SwpForceII > 0)
    MAX_II = SwpForceII;
  else if (II_setByPragma > 0)
    MAX_II = II_setByPragma;
  else
    MAX_II = MII + SwpIISearchRange;
}

/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
void SwingSchedulerDAG::schedule() {
  buildSchedGraph(AA);
  const LoopCarriedEdges LCE = addLoopCarriedDependences();
  updatePhiDependences();
  Topo.InitDAGTopologicalSorting();
  changeDependences();
  postProcessDAG();
  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
  LLVM_DEBUG({
    dump();
    dbgs() << "===== Loop Carried Edges Begin =====\n";
    for (SUnit &SU : SUnits)
      LCE.dump(&SU, TRI, &MRI);
    dbgs() << "===== Loop Carried Edges End =====\n";
  });

  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;

  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  fuseRecs(NodeSets);

  // This flag is used for testing and can cause correctness problems.
  if (SwpIgnoreRecMII)
    RecMII = 0;

  setMII(ResMII, RecMII);
  setMAX_II();

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.
  if (MII == 0) {
    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
    NumFailZeroMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Invalid Minimal Initiation Interval: 0";
    });
    return;
  }

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                      << ", we don't pipeline large loops\n");
    NumFailLargeMaxMII++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Minimal Initiation Interval too large: "
             << ore::NV("MII", (int)MII) << " > "
             << ore::NV("SwpMaxMii", SwpMaxMii) << ". "
             << "Refer to -pipeliner-max-mii.";
    });
    return;
  }

  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  Rec NodeSet ";
      I.dump();
    }
  });

  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  NodeSet ";
      I.dump();
    }
  });

  computeNodeOrder(NodeSets);

  // check for node order issues
  checkValidNodeOrder(Circuits);

  SMSchedule Schedule(Pass.MF, this);
  Scheduled = schedulePipeline(Schedule);

  if (!Scheduled) {
    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
    NumFailNoSchedule++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Unable to find schedule";
    });
    return;
  }

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate pipeline if there are no overlapped iterations.
  if (numStages == 0) {
    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
    NumFailZeroStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "No need to pipeline - no overlapped iterations in schedule.";
    });
    return;
  }
  // Check that the maximum stage count is less than the user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                      << " : too many stages, abort\n");
    NumFailLargeMaxStage++;
    Pass.ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(
                 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Too many stages in schedule: "
             << ore::NV("numStages", (int)numStages) << " > "
             << ore::NV("SwpMaxStages", SwpMaxStages)
             << ". Refer to -pipeliner-max-stages.";
    });
    return;
  }

  Pass.ORE->emit([&]() {
    return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
                                     Loop.getHeader())
           << "Pipelined successfully!";
  });

  // Generate the schedule as a ModuloSchedule.
  DenseMap<MachineInstr *, int> Cycles, Stages;
  std::vector<MachineInstr *> OrderedInsts;
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
      OrderedInsts.push_back(SU->getInstr());
      Cycles[SU->getInstr()] = Cycle;
      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
    }
  }
  DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
  for (auto &KV : NewMIs) {
    Cycles[KV.first] = Cycles[KV.second];
    Stages[KV.first] = Stages[KV.second];
    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
  }

  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                    std::move(Stages));
  if (EmitTestAnnotations) {
    assert(NewInstrChanges.empty() &&
           "Cannot serialize a schedule with InstrChanges!");
    ModuloScheduleTestAnnotater MSTI(MF, MS);
    MSTI.annotate();
    return;
  }
  // The experimental code generator can't work if there are InstrChanges.
  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
    MSE.expand();
  } else if (MVECodeGen && NewInstrChanges.empty() &&
             LoopPipelinerInfo->isMVEExpanderSupported() &&
             ModuloScheduleExpanderMVE::canApply(Loop)) {
    ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
    MSE.expand();
  } else {
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
    MSE.expand();
    MSE.cleanup();
  }
  ++NumPipelined;
}

/// Clean up after the software pipeliner runs.
void SwingSchedulerDAG::finishBlock() {
  for (auto &KV : NewMIs)
    MF.deleteMachineInstr(KV.second);
  NewMIs.clear();

  // Call the superclass.
  ScheduleDAGInstrs::finishBlock();
}

/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
                       Register &InitVal, Register &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = Register();
  LoopVal = Register();
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal && LoopVal && "Unexpected Phi structure.");
}

/// Return the Phi register value that comes from the loop block.
static Register getLoopPhiReg(const MachineInstr &Phi,
                              const MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
      return Phi.getOperand(i).getReg();
  return Register();
}
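
// For example (hypothetical operands), given the loop-header PHI
// \code
//   %v = PHI %init, %preheader, %next, %header
// \endcode
// getPhiRegs() returns InitVal = %init and LoopVal = %next, and
// getLoopPhiReg(Phi, %header) returns %next.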

/// Return true if SUb can be reached from SUa following the chain edges.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  SmallPtrSet<const SUnit *, 8> Visited;
  SmallVector<const SUnit *, 8> Worklist;
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}

SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
  if (!getUnderlyingObjects())
    return;
  // All underlying objects were found; assume they are identified until a
  // non-identified one is seen.
  IsAllIdentified = true;
  for (const Value *Obj : UnderlyingObjs)
    if (!isIdentifiedObject(Obj)) {
      IsAllIdentified = false;
      break;
    }
}

bool SUnitWithMemInfo::isTriviallyDisjoint(
    const SUnitWithMemInfo &Other) const {
  // If all underlying objects are identified objects and there is no overlap
  // between them, then these two instructions are disjoint.
  if (!IsAllIdentified || !Other.IsAllIdentified)
    return false;
  for (const Value *Obj : UnderlyingObjs)
    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
      return false;
  return true;
}

/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->hasOneMemOperand())
    return false;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return false;
  MemOpValue = MM->getValue();
  MemOpOffset = MM->getOffset();
  llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);

  // TODO: A no alias scope may be valid only in a single iteration. In this
  // case we need to peel it off like LoopAccessAnalysis does.
  AATags = MM->getAAInfo();
  return true;
}

/// Returns true if there is a loop-carried order dependency from \p Src to
/// \p Dst.
static bool
hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
                     BatchAAResults &BAA, const TargetInstrInfo *TII,
                     const TargetRegisterInfo *TRI,
                     const SwingSchedulerDAG *SSD, bool PerformCheapCheck) {
  if (Src.isTriviallyDisjoint(Dst))
    return false;
  if (isSuccOrder(Src.SU, Dst.SU))
    return false;

  MachineInstr &SrcMI = *Src.SU->getInstr();
  MachineInstr &DstMI = *Dst.SU->getInstr();
  if (PerformCheapCheck) {
    // First, perform the cheaper check that compares the base register.
    // If they are the same and the load offset is less than the store
    // offset, then mark the dependence as loop carried potentially.
    //
    // TODO: This check will be removed.
    const MachineOperand *BaseOp1, *BaseOp2;
    int64_t Offset1, Offset2;
    bool Offset1IsScalable, Offset2IsScalable;
    if (TII->getMemOperandWithOffset(SrcMI, BaseOp1, Offset1,
                                     Offset1IsScalable, TRI) &&
        TII->getMemOperandWithOffset(DstMI, BaseOp2, Offset2,
                                     Offset2IsScalable, TRI)) {
      if (BaseOp1->isIdenticalTo(*BaseOp2) &&
          Offset1IsScalable == Offset2IsScalable &&
          (int)Offset1 < (int)Offset2) {
        assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
               "What happened to the chain edge?");
        return true;
      }
    }
  }

  if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
    return false;

  // Second, the more expensive check that uses alias analysis on the
  // base registers. If they alias, and the load offset is less than
  // the store offset, then mark the dependence as loop carried.
  if (Src.isUnknown() || Dst.isUnknown())
    return true;
  if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
    return true;

  if (BAA.isNoAlias(
          MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
          MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
    return false;

  // AliasAnalysis sometimes gives up on following the underlying
  // object. In such a case, separate checks for underlying objects may
  // prove that there are no aliases between two accesses.
  for (const Value *SrcObj : Src.UnderlyingObjs)
    for (const Value *DstObj : Dst.UnderlyingObjs)
      if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
                         MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
        return true;

  return false;
}
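
// An illustrative case (hypothetical C loop) where such a dependence must be
// added even though the accesses never overlap within a single iteration:
// \code
//   for (i = 0; i < n; ++i) {
//     t = a[i + 1]; // load
//     a[i] = t + 1; // store
//   }
// \endcode
// The store to a[i] writes the location that the load read in iteration
// i - 1, so the store of one iteration must not be hoisted above the load of
// the previous one; a loop-carried order edge from the load to the store
// models this.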

void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
  const MachineInstr *MI = SU->getInstr();
  if (!MI->mayLoadOrStore())
    return;
  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
}

LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
    SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
    const TargetRegisterInfo *TRI)
    : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
      LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}

void LoopCarriedOrderDepsTracker::computeDependencies() {
  // Traverse all instructions and extract only what we are targeting.
  for (auto &SU : SUnits) {
    auto Tagged = getInstrTag(&SU);

    // This instruction has no loop-carried order-dependencies.
    if (!Tagged)
      continue;
    TaggedSUnits.emplace_back(&SU, *Tagged);
  }

  computeDependenciesAux();
}

std::optional<LoopCarriedOrderDepsTracker::InstrTag>
LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();
  if (TII->isGlobalMemoryObject(MI))
    return InstrTag::Barrier;

  if (MI->mayStore() ||
      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
    return InstrTag::LoadOrStore;

  if (MI->mayRaiseFPException())
    return InstrTag::FPExceptions;

  return std::nullopt;
}

void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
    const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
    bool PerformCheapCheck) {
  // Avoid self-dependencies.
  if (Src.SU == Dst.SU)
    return;

  if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG, PerformCheapCheck))
    LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
}

void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
    const LoadStoreChunk &From, const LoadStoreChunk &To) {
  // Add load-to-store dependencies (WAR).
  for (const SUnitWithMemInfo &Src : From.Loads)
    for (const SUnitWithMemInfo &Dst : To.Stores)
      // Perform a cheap check first if this is a forward dependency.
      addDependenciesBetweenSUs(Src, Dst, Src.SU->NodeNum < Dst.SU->NodeNum);

  // Add store-to-load dependencies (RAW).
  for (const SUnitWithMemInfo &Src : From.Stores)
    for (const SUnitWithMemInfo &Dst : To.Loads)
      addDependenciesBetweenSUs(Src, Dst);

  // Add store-to-store dependencies (WAW).
  for (const SUnitWithMemInfo &Src : From.Stores)
    for (const SUnitWithMemInfo &Dst : To.Stores)
      addDependenciesBetweenSUs(Src, Dst);
}

void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
  SmallVector<LoadStoreChunk, 2> Chunks(1);
  for (const auto &TSU : TaggedSUnits) {
    InstrTag Tag = TSU.getTag();
    SUnit *SU = TSU.getPointer();
    switch (Tag) {
    case InstrTag::Barrier:
      Chunks.emplace_back();
      break;
    case InstrTag::LoadOrStore:
      Chunks.back().append(SU);
      break;
    case InstrTag::FPExceptions:
      // TODO: Handle this properly.
      break;
    }
  }

  // Add dependencies between memory operations. If there are one or more
  // barrier events between two memory instructions, we don't add a
  // loop-carried dependence for them.
  for (const LoadStoreChunk &Chunk : Chunks)
    addLoopCarriedDepenenciesForChunks(Chunk, Chunk);

  // TODO: If there are multiple barrier instructions, also add dependencies
  // from the last barrier instruction (or load/store below it) to the first
  // barrier instruction (or load/store above it).
}
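
// Sketch of the chunking above (hypothetical order of the loop body; the
// "barrier" stands for any instruction tagged InstrTag::Barrier):
// \code
//   load  A    ; chunk 0
//   store B    ; chunk 0
//   barrier    ; starts a new chunk
//   load  C    ; chunk 1
//   store D    ; chunk 1
// \endcode
// Loop-carried order dependencies are computed within {A, B} and within
// {C, D} only; pairs separated by the barrier are already serialized through
// it, so no extra loop-carried edges are added for them.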

/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
/// TODO: Also compute output-dependencies.
LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
  LoopCarriedEdges LCE;

  // Add loop-carried order-dependencies.
  LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
  LCODTracker.computeDependencies();
  for (unsigned I = 0; I != SUnits.size(); I++)
    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);

  LCE.modifySUnits(SUnits, TII);
  return LCE;
}

/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Set to true if the instruction has an operand defined by a Phi.
    Register HasPhiUse;
    Register HasPhiDef;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
        for (MachineRegisterInfo::use_instr_iterator
                 UI = MRI.use_instr_begin(Reg),
                 UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
                                     &SchedModel);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (const SDep &D : RemoveDeps)
      I.removePred(D);
  }
}
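
// For instance (hypothetical single-block loop), given
// \code
//   %v   = PHI %init, %preheader, %add, %header
//   %add = ADD %v, 1
// \endcode
// this function adds a true data edge from the PHI to the ADD (latency 0)
// for the use of %v, plus an anti edge (latency 1) standing for the
// loop-carried write of %add that the PHI consumes in the next iteration.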

/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0;
    Register NewBase;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    SmallVector<SDep, 4> Deps;
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
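
// A sketch of the rewrite this enables (hypothetical pseudo-MIR of a
// post-increment address chain; offsets in bytes):
// \code
//   %p1 = PHI %p0, %preheader, %p2, %loop
//   %v  = LOAD %p1, 0
//   %p2 = ADD  %p1, 8
// \endcode
// If the target's canUseLastOffsetValue() accepts the load, it can instead
// use %p2 (the previous iteration's value) with offset -8, removing the
// intra-iteration dependence on %p1's definition and potentially lowering
// the recurrence MII.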

/// Create an instruction stream that represents a single iteration and stage
/// of each instruction. This function differs from
/// SMSchedule::finalizeSchedule in that it doesn't have any side effect on
/// SwingSchedulerDAG. That is, this function is an approximation of
/// SMSchedule::finalizeSchedule with all non-const operations removed.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,
                                  DenseMap<MachineInstr *, unsigned> &Stages) {
  DenseMap<int, std::deque<SUnit *>> Instrs;

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}

namespace {

// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};

/// Calculate the maximum register pressure of the scheduled instruction
/// stream.
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it's
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;

  DenseMap<MachineInstr *, RegisterOperands> ROMap;

  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    // FIXME: The static_cast is a bug compensating bugs in the callers.
    VirtRegOrUnit VRegOrUnit =
        Reg.isVirtual() ? VirtRegOrUnit(Reg)
                        : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
    for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    // FIXME: The static_cast is a bug compensating bugs in the callers.
    VirtRegOrUnit VRegOrUnit =
        Reg.isVirtual() ? VirtRegOrUnit(Reg)
                        : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
    auto PSetIter = MRI.getPressureSets(VRegOrUnit);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is a reserved register, for example, the stack pointer.
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered alive throughout the loop. For example, the variable `c` in the
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    DenseSet<Register> Used;
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        // FIXME: The static_cast is a bug.
        Register Reg =
            Use.VRegOrUnit.isVirtualReg()
                ? Use.VRegOrUnit.asVirtualReg()
                : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
        // Ignore a variable that appears only on one side of a phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set.
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }

  // There are two patterns of last-use.
  // - by an instruction of the current iteration
  // - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, the following two groups of instructions are executed
  // simultaneously:
  // - next iteration's phi instructions in i-th stage
  // - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We only consider virtual registers that are defined and used in this
    // loop. The following virtual registers will be ignored:
    // - live-in ones
    // - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
          // FIXME: The static_cast is a bug.
          Register Reg = Use.VRegOrUnit.isVirtualReg()
                             ? Use.VRegOrUnit.asVirtualReg()
                             : Register(static_cast<unsigned>(
                                   Use.VRegOrUnit.asMCRegUnit()));
          UpdateTargetRegs(Reg);
        }
      }
    }

    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    DenseMap<Register, MachineInstr *> LastUseMI;
    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        // FIXME: The static_cast is a bug.
        Register Reg =
            Use.VRegOrUnit.isVirtualReg()
                ? Use.VRegOrUnit.asVirtualReg()
                : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
        if (!TargetRegs.contains(Reg))
          continue;
        auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
        if (!Inserted) {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          if (InstrScore(Orig) < InstrScore(New))
            Ite->second = New;
        }
      }
    }

    Instr2LastUsesTy LastUses;
    for (auto [Reg, MI] : LastUseMI)
      LastUses[MI].insert(Reg);
    return LastUses;
  }
1724
1725 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1726 // iterations and check the register pressure at the point where all stages
1727 // overlapping.
1728 //
1729 // An example of unrolled loop where #Stage is 4..
1730 // Iter i+0 i+1 i+2 i+3
1731 // ------------------------
1732 // Stage 0
1733 // Stage 1 0
1734 // Stage 2 1 0
1735 // Stage 3 2 1 0 <- All stages overlap
1736 //
1737 std::vector<unsigned>
1738 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1739 Instr2StageTy &Stages,
1740 const unsigned StageCount) const {
1741 using RegSetTy = SmallDenseSet<Register, 16>;
1742
1743 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1744 // manage the liveness of the registers independently by iterations.
1745 SmallVector<RegSetTy> LiveRegSets(StageCount);
1746
1747 auto CurSetPressure = InitSetPressure;
1748 auto MaxSetPressure = InitSetPressure;
1749 auto LastUses = computeLastUses(OrderedInsts, Stages);
1750
1751 LLVM_DEBUG({
1752 dbgs() << "Ordered instructions:\n";
1753 for (MachineInstr *MI : OrderedInsts) {
1754 dbgs() << "Stage " << Stages[MI] << ": ";
1755 MI->dump();
1756 }
1757 });
1758
1759 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1760 VirtRegOrUnit VRegOrUnit) {
1761 // FIXME: The static_cast is a bug.
1762 Register Reg =
1763 VRegOrUnit.isVirtualReg()
1764 ? VRegOrUnit.asVirtualReg()
1765 : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
1766 if (!Reg.isValid() || isReservedRegister(Reg))
1767 return;
1768
1769 bool Inserted = RegSet.insert(Reg).second;
1770 if (!Inserted)
1771 return;
1772
1773 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1774 increaseRegisterPressure(CurSetPressure, Reg);
1775 LLVM_DEBUG(dumpPSet(Reg));
1776 };
1777
1778 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1779 Register Reg) {
1780 if (!Reg.isValid() || isReservedRegister(Reg))
1781 return;
1782
1783 // live-in register
1784 if (!RegSet.contains(Reg))
1785 return;
1786
1787 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1788 RegSet.erase(Reg);
1789 decreaseRegisterPressure(CurSetPressure, Reg);
1790 LLVM_DEBUG(dumpPSet(Reg));
1791 };
1792
1793 for (unsigned I = 0; I < StageCount; I++) {
1794 for (MachineInstr *MI : OrderedInsts) {
1795 const auto Stage = Stages[MI];
1796 if (I < Stage)
1797 continue;
1798
1799 const unsigned Iter = I - Stage;
1800
1801 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1802 InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);
1803
1804 for (auto LastUse : LastUses[MI]) {
1805 if (MI->isPHI()) {
1806 if (Iter != 0)
1807 EraseReg(LiveRegSets[Iter - 1], LastUse);
1808 } else {
1809 EraseReg(LiveRegSets[Iter], LastUse);
1810 }
1811 }
1812
1813 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1814 MaxSetPressure[PSet] =
1815 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1816
1817 LLVM_DEBUG({
1818 dbgs() << "CurSetPressure=";
1819 dumpRegisterPressures(CurSetPressure);
1820 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1821 MI->dump();
1822 });
1823 }
1824 }
1825
1826 return MaxSetPressure;
1827 }
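  // Worked sketch (assumed StageCount == 3): the outer loop visits I = 0..2,
  // and an instruction in stage S is simulated for iteration Iter = I - S. At
  // I == 2 the stage-0 instruction of iteration 2, the stage-1 instruction of
  // iteration 1, and the stage-2 instruction of iteration 0 are all processed
  // in the same pass, which models the kernel state where all stages overlap
  // and the register pressure peaks.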
1828
1829public:
1830 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1831 const MachineFunction &MF)
1832 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1833 TRI(MF.getSubtarget().getRegisterInfo()),
1834 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1835 PressureSetLimit(PSetNum, 0) {}
1836
1837   // Compute the data used in the register pressure calculation that is
1838   // independent of the loop schedule.
1839 void init(const RegisterClassInfo &RCI) {
1840 for (MachineInstr &MI : *OrigMBB) {
1841 if (MI.isDebugInstr())
1842 continue;
1843 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1844 }
1845
1846 computeLiveIn();
1847 computePressureSetLimit(RCI);
1848 }
1849
1850   // Calculate the maximum register pressures of the loop and check if they
1851   // exceed the limit.
1852   bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1853               const unsigned MaxStage) const {
1854     assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
1855            "the percentage of the margin must be between 0 and 100");
1856
1857 OrderedInstsTy OrderedInsts;
1858 Instr2StageTy Stages;
1859 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1860 const auto MaxSetPressure =
1861 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1862
1863 LLVM_DEBUG({
1864 dbgs() << "Dump MaxSetPressure:\n";
1865 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1866 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1867 }
1868 dbgs() << '\n';
1869 });
1870
1871 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1872 unsigned Limit = PressureSetLimit[PSet];
1873 unsigned Margin = Limit * RegPressureMargin / 100;
1874 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1875 << " Margin=" << Margin << "\n");
1876 if (Limit < MaxSetPressure[PSet] + Margin) {
1877 LLVM_DEBUG(
1878 dbgs()
1879 << "Rejected the schedule because of too high register pressure\n");
1880 return true;
1881 }
1882 }
1883 return false;
1884 }
1885};
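// Numeric sketch of the rejection test in detect() (made-up values): with
// Limit == 32 and RegPressureMargin == 10, Margin == 32 * 10 / 100 == 3, so a
// schedule with MaxSetPressure[PSet] == 30 is rejected because 32 < 30 + 3,
// even though 30 alone would fit under the limit.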
1886
1887} // end anonymous namespace
1888
1889/// Calculate the resource constrained minimum initiation interval for the
1890/// specified loop. We use the DFA to model the resources needed for
1891/// each instruction, and we ignore dependences. A different DFA is created
1892/// for each cycle that is required. When adding a new instruction, we attempt
1893/// to add it to each existing DFA, until a legal space is found. If the
1894/// instruction cannot be reserved in an existing DFA, we create a new one.
1895unsigned SwingSchedulerDAG::calculateResMII() {
1896 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1897 ResourceManager RM(&MF.getSubtarget(), this);
1898 return RM.calculateResMII();
1899}
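// Resource-bound sketch (hypothetical machine): if the loop body contains five
// instructions that all require the single load/store unit, no schedule can
// issue them in fewer than five cycles per iteration, so ResMII >= 5
// regardless of any dependences.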
1900
1901/// Calculate the recurrence-constrained minimum initiation interval.
1902/// Iterate over each circuit. Compute the delay(c) and distance(c)
1903/// for each circuit. The II needs to satisfy the inequality
1904/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1905/// II that satisfies the inequality, and the RecMII is the maximum
1906/// of those values.
1907unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1908 unsigned RecMII = 0;
1909
1910 for (NodeSet &Nodes : NodeSets) {
1911 if (Nodes.empty())
1912 continue;
1913
1914 unsigned Delay = Nodes.getLatency();
1915 unsigned Distance = 1;
1916
1917 // ii = ceil(delay / distance)
1918 unsigned CurMII = (Delay + Distance - 1) / Distance;
1919 Nodes.setRecMII(CurMII);
1920 if (CurMII > RecMII)
1921 RecMII = CurMII;
1922 }
1923
1924 return RecMII;
1925}
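// Example (illustrative): a recurrence c with delay(c) == 7 and
// distance(c) == 2 requires 7 - II * 2 <= 0, i.e. II >= ceil(7 / 2) == 4.
// Note that the loop above fixes Distance to 1, so CurMII reduces to the node
// set's total latency.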
1926
1927/// Create the adjacency structure of the nodes in the graph.
1928void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1929 SwingSchedulerDAG *DAG) {
1930 BitVector Added(SUnits.size());
1931 DenseMap<int, int> OutputDeps;
1932 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1933 Added.reset();
1934 // Add any successor to the adjacency matrix and exclude duplicates.
1935 for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
1936 // Only create a back-edge on the first and last nodes of a dependence
1937 // chain. This records any chains and adds them later.
1938 if (OE.isOutputDep()) {
1939 int N = OE.getDst()->NodeNum;
1940 int BackEdge = i;
1941 auto Dep = OutputDeps.find(BackEdge);
1942 if (Dep != OutputDeps.end()) {
1943 BackEdge = Dep->second;
1944 OutputDeps.erase(Dep);
1945 }
1946 OutputDeps[N] = BackEdge;
1947 }
1948       // Do not process a boundary node or an artificial dependence.
1949 if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
1950 continue;
1951
1952       // This code is retained to preserve previous behavior and prevent
1953       // regression. This condition means that anti-dependencies within an
1954       // iteration are ignored when searching for circuits. Strictly speaking,
1955       // it would be natural to consider such dependences as well.
1956 // FIXME: Remove this code if it doesn't have significant impact on
1957 // performance.
1958 if (OE.isAntiDep())
1959 continue;
1960
1961 int N = OE.getDst()->NodeNum;
1962 if (!Added.test(N)) {
1963 AdjK[i].push_back(N);
1964 Added.set(N);
1965 }
1966 }
1967 // A chain edge between a store and a load is treated as a back-edge in the
1968 // adjacency matrix.
1969 for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
1970 SUnit *Src = IE.getSrc();
1971 SUnit *Dst = IE.getDst();
1972 if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
1973 continue;
1974 if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1975 int N = Src->NodeNum;
1976 if (!Added.test(N)) {
1977 AdjK[i].push_back(N);
1978 Added.set(N);
1979 }
1980 }
1981 }
1982 }
1983 // Add back-edges in the adjacency matrix for the output dependences.
1984 for (auto &OD : OutputDeps)
1985 if (!Added.test(OD.second)) {
1986 AdjK[OD.first].push_back(OD.second);
1987 Added.set(OD.second);
1988 }
1989}
1990
1991/// Identify an elementary circuit in the dependence graph starting at the
1992/// specified node.
1993bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1994 const SwingSchedulerDAG *DAG,
1995 bool HasBackedge) {
1996 SUnit *SV = &SUnits[V];
1997 bool F = false;
1998 Stack.insert(SV);
1999 Blocked.set(V);
2000
2001 for (auto W : AdjK[V]) {
2002 if (NumPaths > MaxPaths)
2003 break;
2004 if (W < S)
2005 continue;
2006 if (W == S) {
2007 if (!HasBackedge)
2008 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
2009 F = true;
2010 ++NumPaths;
2011 break;
2012 }
2013 if (!Blocked.test(W)) {
2014 if (circuit(W, S, NodeSets, DAG,
2015 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
2016 F = true;
2017 }
2018 }
2019
2020 if (F)
2021 unblock(V);
2022 else {
2023 for (auto W : AdjK[V]) {
2024 if (W < S)
2025 continue;
2026 B[W].insert(SV);
2027 }
2028 }
2029 Stack.pop_back();
2030 return F;
2031}
2032
2033/// Unblock a node in the circuit finding algorithm.
2034void SwingSchedulerDAG::Circuits::unblock(int U) {
2035 Blocked.reset(U);
2036 SmallPtrSet<SUnit *, 4> &BU = B[U];
2037 while (!BU.empty()) {
2038 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
2039 assert(SI != BU.end() && "Invalid B set.");
2040 SUnit *W = *SI;
2041 BU.erase(W);
2042 if (Blocked.test(W->NodeNum))
2043 unblock(W->NodeNum);
2044 }
2045}
2046
2047/// Identify all the elementary circuits in the dependence graph using
2048/// Johnson's circuit algorithm.
2049void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
2050 Circuits Cir(SUnits, Topo);
2051 // Create the adjacency structure.
2052 Cir.createAdjacencyStructure(this);
2053 for (int I = 0, E = SUnits.size(); I != E; ++I) {
2054 Cir.reset();
2055 Cir.circuit(I, I, NodeSets, this);
2056 }
2057}
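// Illustrative trace (hypothetical three-node graph): with edges 0->1, 1->2,
// and 2->0, the call circuit(0, 0, ...) pushes 0, 1, and 2 onto the stack,
// then reaches W == S == 0 and records {0, 1, 2} as one elementary circuit.
// Starting points 1 and 2 do not report the same cycle again because the
// W < S test skips nodes that were already used as roots.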
2058
2059 // Create artificial dependencies between the source of a COPY/REG_SEQUENCE
2060 // that is loop-carried to the USE in the next iteration. This helps the
2061 // pipeliner avoid additional copies that would otherwise be needed across
2062 // iterations. An artificial dependence edge is added from USE to SOURCE of
2062 // the COPY/REG_SEQUENCE.
2063
2064// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
2065// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
2066// PHI-------True-Dep------> USEOfPhi
2067
2068// The mutation creates
2069// USEOfPHI -------Artificial-Dep---> SRCOfCopy
2070
2071 // Overall, this ensures that USEOfPHI is scheduled before SRCOfCopy (since
2072 // USE is a predecessor of SRCOfCopy), which implies that COPY/REG_SEQUENCE is
2073 // scheduled late enough to avoid additional copies across iterations. The
2074 // resulting scheduling order would be
2075 //    USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
2076
2077void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
2078 for (SUnit &SU : DAG->SUnits) {
2079 // Find the COPY/REG_SEQUENCE instruction.
2080 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
2081 continue;
2082
2083     // Record the loop carried PHIs.
2084     SmallVector<SUnit *, 4> PHISUs;
2085     // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
2086     SmallVector<SUnit *, 4> SrcSUs;
2087 
2088 for (auto &Dep : SU.Preds) {
2089 SUnit *TmpSU = Dep.getSUnit();
2090 MachineInstr *TmpMI = TmpSU->getInstr();
2091 SDep::Kind DepKind = Dep.getKind();
2092 // Save the loop carried PHI.
2093 if (DepKind == SDep::Anti && TmpMI->isPHI())
2094 PHISUs.push_back(TmpSU);
2095 // Save the source of COPY/REG_SEQUENCE.
2096       // If the source has no predecessors, we will end up creating cycles.
2097 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
2098 SrcSUs.push_back(TmpSU);
2099 }
2100
2101 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
2102 continue;
2103
2104 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2105     // SUnit to the container.
2106     SmallVector<SUnit *, 8> UseSUs;
2107     // Do not use iterator based loop here as we are updating the container.
2108 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
2109 for (auto &Dep : PHISUs[Index]->Succs) {
2110 if (Dep.getKind() != SDep::Data)
2111 continue;
2112
2113 SUnit *TmpSU = Dep.getSUnit();
2114 MachineInstr *TmpMI = TmpSU->getInstr();
2115 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
2116 PHISUs.push_back(TmpSU);
2117 continue;
2118 }
2119 UseSUs.push_back(TmpSU);
2120 }
2121 }
2122
2123 if (UseSUs.size() == 0)
2124 continue;
2125
2126 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
2127 // Add the artificial dependencies if it does not form a cycle.
2128 for (auto *I : UseSUs) {
2129 for (auto *Src : SrcSUs) {
2130 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
2131 Src->addPred(SDep(I, SDep::Artificial));
2132 SDAG->Topo.AddPred(Src, I);
2133 }
2134 }
2135 }
2136 }
2137}
2138
2139/// Compute several functions needed to order the nodes for scheduling.
2140/// ASAP - Earliest time to schedule a node.
2141/// ALAP - Latest time to schedule a node.
2142/// MOV - Mobility function, difference between ALAP and ASAP.
2143/// D - Depth of each node.
2144/// H - Height of each node.
2145void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2146 ScheduleInfo.resize(SUnits.size());
2147
2148 LLVM_DEBUG({
2149 for (int I : Topo) {
2150 const SUnit &SU = SUnits[I];
2151 dumpNode(SU);
2152 }
2153 });
2154
2155 int maxASAP = 0;
2156 // Compute ASAP and ZeroLatencyDepth.
2157 for (int I : Topo) {
2158 int asap = 0;
2159 int zeroLatencyDepth = 0;
2160 SUnit *SU = &SUnits[I];
2161 for (const auto &IE : DDG->getInEdges(SU)) {
2162 SUnit *Pred = IE.getSrc();
2163 if (IE.getLatency() == 0)
2164 zeroLatencyDepth =
2165 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
2166 if (IE.ignoreDependence(true))
2167 continue;
2168 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
2169 IE.getDistance() * MII));
2170 }
2171 maxASAP = std::max(maxASAP, asap);
2172 ScheduleInfo[I].ASAP = asap;
2173 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
2174 }
2175
2176 // Compute ALAP, ZeroLatencyHeight, and MOV.
2177 for (int I : llvm::reverse(Topo)) {
2178 int alap = maxASAP;
2179 int zeroLatencyHeight = 0;
2180 SUnit *SU = &SUnits[I];
2181 for (const auto &OE : DDG->getOutEdges(SU)) {
2182 SUnit *Succ = OE.getDst();
2183 if (Succ->isBoundaryNode())
2184 continue;
2185 if (OE.getLatency() == 0)
2186 zeroLatencyHeight =
2187 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
2188 if (OE.ignoreDependence(true))
2189 continue;
2190 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
2191 OE.getDistance() * MII));
2192 }
2193
2194 ScheduleInfo[I].ALAP = alap;
2195 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
2196 }
2197
2198 // After computing the node functions, compute the summary for each node set.
2199 for (NodeSet &I : NodeSets)
2200 I.computeNodeSetInfo(this);
2201
2202 LLVM_DEBUG({
2203 for (unsigned i = 0; i < SUnits.size(); i++) {
2204 dbgs() << "\tNode " << i << ":\n";
2205 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2206 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2207 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2208 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2209 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2210 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2211 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2212 }
2213 });
2214}
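// Worked sketch (illustrative two-node chain A -> B with latency 2, MII == 3):
// ASAP(A) == 0 and ASAP(B) == 2, so maxASAP == 2. Walking backwards,
// ALAP(B) == 2 and ALAP(A) == min(2, 2 - 2 + 0) == 0. Both nodes end up with
// MOV == ALAP - ASAP == 0, meaning neither has any scheduling slack.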
2215
2216/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2217/// as the predecessors of the elements of NodeOrder that are not also in
2218/// NodeOrder.
2219 static bool pred_L(SetVector<SUnit *> &NodeOrder,
2220                    SmallSetVector<SUnit *, 8> &Preds, SwingSchedulerDDG *DDG,
2221                    const NodeSet *S = nullptr) {
2222 Preds.clear();
2223
2224 for (SUnit *SU : NodeOrder) {
2225 for (const auto &IE : DDG->getInEdges(SU)) {
2226 SUnit *PredSU = IE.getSrc();
2227 if (S && S->count(PredSU) == 0)
2228 continue;
2229 if (IE.ignoreDependence(true))
2230 continue;
2231 if (NodeOrder.count(PredSU) == 0)
2232 Preds.insert(PredSU);
2233 }
2234
2235 // FIXME: The following loop-carried dependencies may also need to be
2236 // considered.
2237 // - Physical register dependencies (true-dependence and WAW).
2238 // - Memory dependencies.
2239 for (const auto &OE : DDG->getOutEdges(SU)) {
2240 SUnit *SuccSU = OE.getDst();
2241 if (!OE.isAntiDep())
2242 continue;
2243 if (S && S->count(SuccSU) == 0)
2244 continue;
2245 if (NodeOrder.count(SuccSU) == 0)
2246 Preds.insert(SuccSU);
2247 }
2248 }
2249 return !Preds.empty();
2250}
2251
2252/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2253/// as the successors of the elements of NodeOrder that are not also in
2254/// NodeOrder.
2255 static bool succ_L(SetVector<SUnit *> &NodeOrder,
2256                    SmallSetVector<SUnit *, 8> &Succs, SwingSchedulerDDG *DDG,
2257                    const NodeSet *S = nullptr) {
2258 Succs.clear();
2259
2260 for (SUnit *SU : NodeOrder) {
2261 for (const auto &OE : DDG->getOutEdges(SU)) {
2262 SUnit *SuccSU = OE.getDst();
2263 if (S && S->count(SuccSU) == 0)
2264 continue;
2265 if (OE.ignoreDependence(false))
2266 continue;
2267 if (NodeOrder.count(SuccSU) == 0)
2268 Succs.insert(SuccSU);
2269 }
2270
2271 // FIXME: The following loop-carried dependencies may also need to be
2272 // considered.
2273       // - Physical register dependencies (true-dependence and WAW).
2274 // - Memory dependencies.
2275 for (const auto &IE : DDG->getInEdges(SU)) {
2276 SUnit *PredSU = IE.getSrc();
2277 if (!IE.isAntiDep())
2278 continue;
2279 if (S && S->count(PredSU) == 0)
2280 continue;
2281 if (NodeOrder.count(PredSU) == 0)
2282 Succs.insert(PredSU);
2283 }
2284 }
2285 return !Succs.empty();
2286}
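// Example (illustrative): if NodeOrder == {A, B} and the graph has the edges
// P -> A and B -> S with P and S outside the order, then pred_L returns {P}
// and succ_L returns {S}. Anti-dependences are traversed in the reverse
// direction, so an anti-dependence edge A -> X would also place X in Pred_L.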
2287
2288/// Return true if there is a path from the specified node to any of the nodes
2289/// in DestNodes. Keep track and return the nodes in any path.
2290static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2291 SetVector<SUnit *> &DestNodes,
2292 SetVector<SUnit *> &Exclude,
2293 SmallPtrSet<SUnit *, 8> &Visited,
2294 SwingSchedulerDDG *DDG) {
2295 if (Cur->isBoundaryNode())
2296 return false;
2297 if (Exclude.contains(Cur))
2298 return false;
2299 if (DestNodes.contains(Cur))
2300 return true;
2301 if (!Visited.insert(Cur).second)
2302 return Path.contains(Cur);
2303 bool FoundPath = false;
2304 for (const auto &OE : DDG->getOutEdges(Cur))
2305 if (!OE.ignoreDependence(false))
2306 FoundPath |=
2307 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2308 for (const auto &IE : DDG->getInEdges(Cur))
2309 if (IE.isAntiDep() && IE.getDistance() == 0)
2310 FoundPath |=
2311 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2312 if (FoundPath)
2313 Path.insert(Cur);
2314 return FoundPath;
2315}
2316
2317/// Compute the live-out registers for the instructions in a node-set.
2318/// The live-out registers are those that are defined in the node-set,
2319 /// but not used, except for the use operands of Phis.
2320 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
2321                             NodeSet &NS) {
2322   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2323   MachineRegisterInfo &MRI = MF.getRegInfo();
2324   SmallVector<VRegMaskOrUnit, 8> LiveOutRegs;
2325   SmallSet<VirtRegOrUnit, 4> Uses;
2326 for (SUnit *SU : NS) {
2327 const MachineInstr *MI = SU->getInstr();
2328 if (MI->isPHI())
2329 continue;
2330 for (const MachineOperand &MO : MI->all_uses()) {
2331 Register Reg = MO.getReg();
2332 if (Reg.isVirtual())
2333 Uses.insert(VirtRegOrUnit(Reg));
2334 else if (MRI.isAllocatable(Reg))
2335 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2336 Uses.insert(VirtRegOrUnit(Unit));
2337 }
2338 }
2339 for (SUnit *SU : NS)
2340 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2341 if (!MO.isDead()) {
2342 Register Reg = MO.getReg();
2343 if (Reg.isVirtual()) {
2344 if (!Uses.count(VirtRegOrUnit(Reg)))
2345             LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
2346                                      LaneBitmask::getNone());
2347         } else if (MRI.isAllocatable(Reg)) {
2348 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2349 if (!Uses.count(VirtRegOrUnit(Unit)))
2350               LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
2351                                        LaneBitmask::getNone());
2352         }
2353 }
2354 RPTracker.addLiveRegs(LiveOutRegs);
2355}
2356
2357/// A heuristic to filter nodes in recurrent node-sets if the register
2358/// pressure of a set is too high.
2359void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2360 for (auto &NS : NodeSets) {
2361 // Skip small node-sets since they won't cause register pressure problems.
2362 if (NS.size() <= 2)
2363 continue;
2364 IntervalPressure RecRegPressure;
2365 RegPressureTracker RecRPTracker(RecRegPressure);
2366 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2367 computeLiveOuts(MF, RecRPTracker, NS);
2368 RecRPTracker.closeBottom();
2369
2370 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2371 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2372 return A->NodeNum > B->NodeNum;
2373 });
2374
2375 for (auto &SU : SUnits) {
2376 // Since we're computing the register pressure for a subset of the
2377 // instructions in a block, we need to set the tracker for each
2378 // instruction in the node-set. The tracker is set to the instruction
2379 // just after the one we're interested in.
2380       MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
2381       RecRPTracker.setPos(std::next(CurInstI));
2382
2383 RegPressureDelta RPDelta;
2384 ArrayRef<PressureChange> CriticalPSets;
2385 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2386 CriticalPSets,
2387 RecRegPressure.MaxSetPressure);
2388 if (RPDelta.Excess.isValid()) {
2389 LLVM_DEBUG(
2390 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2391 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2392 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2393 NS.setExceedPressure(SU);
2394 break;
2395 }
2396 RecRPTracker.recede();
2397 }
2398 }
2399}
2400
2401/// A heuristic to colocate node sets that have the same set of
2402/// successors.
2403void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2404 unsigned Colocate = 0;
2405 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2406 NodeSet &N1 = NodeSets[i];
2407 SmallSetVector<SUnit *, 8> S1;
2408 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2409 continue;
2410 for (int j = i + 1; j < e; ++j) {
2411 NodeSet &N2 = NodeSets[j];
2412 if (N1.compareRecMII(N2) != 0)
2413 continue;
2414 SmallSetVector<SUnit *, 8> S2;
2415 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2416 continue;
2417 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2418 N1.setColocate(++Colocate);
2419 N2.setColocate(Colocate);
2420 break;
2421 }
2422 }
2423 }
2424}
2425
2426/// Check if the existing node-sets are profitable. If not, then ignore the
2427/// recurrent node-sets, and attempt to schedule all nodes together. This is
2428/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2429/// then it's best to try to schedule all instructions together instead of
2430/// starting with the recurrent node-sets.
2431void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2432 // Look for loops with a large MII.
2433 if (MII < 17)
2434 return;
2435 // Check if the node-set contains only a simple add recurrence.
2436 for (auto &NS : NodeSets) {
2437 if (NS.getRecMII() > 2)
2438 return;
2439 if (NS.getMaxDepth() > MII)
2440 return;
2441 }
2442 NodeSets.clear();
2443 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2444}
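// Numeric sketch (made-up values): with MII == 20 and recurrence node-sets
// whose RecMII is at most 2 and whose maximum depth stays below 20, the
// recurrences cannot be the scheduling bottleneck, so the node-sets are
// cleared and all instructions are grouped and ordered together instead.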
2445
2446/// Add the nodes that do not belong to a recurrence set into groups
2447/// based upon connected components.
2448void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2449 SetVector<SUnit *> NodesAdded;
2450 SmallPtrSet<SUnit *, 8> Visited;
2451 // Add the nodes that are on a path between the previous node sets and
2452 // the current node set.
2453 for (NodeSet &I : NodeSets) {
2454 SmallSetVector<SUnit *, 8> N;
2455 // Add the nodes from the current node set to the previous node set.
2456 if (succ_L(I, N, DDG.get())) {
2457 SetVector<SUnit *> Path;
2458 for (SUnit *NI : N) {
2459 Visited.clear();
2460 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2461 }
2462 if (!Path.empty())
2463 I.insert(Path.begin(), Path.end());
2464 }
2465 // Add the nodes from the previous node set to the current node set.
2466 N.clear();
2467 if (succ_L(NodesAdded, N, DDG.get())) {
2468 SetVector<SUnit *> Path;
2469 for (SUnit *NI : N) {
2470 Visited.clear();
2471 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2472 }
2473 if (!Path.empty())
2474 I.insert(Path.begin(), Path.end());
2475 }
2476 NodesAdded.insert_range(I);
2477 }
2478
2479 // Create a new node set with the connected nodes of any successor of a node
2480 // in a recurrent set.
2481 NodeSet NewSet;
2482 SmallSetVector<SUnit *, 8> N;
2483 if (succ_L(NodesAdded, N, DDG.get()))
2484 for (SUnit *I : N)
2485 addConnectedNodes(I, NewSet, NodesAdded);
2486 if (!NewSet.empty())
2487 NodeSets.push_back(NewSet);
2488
2489 // Create a new node set with the connected nodes of any predecessor of a node
2490 // in a recurrent set.
2491 NewSet.clear();
2492 if (pred_L(NodesAdded, N, DDG.get()))
2493 for (SUnit *I : N)
2494 addConnectedNodes(I, NewSet, NodesAdded);
2495 if (!NewSet.empty())
2496 NodeSets.push_back(NewSet);
2497
2498   // Create new node sets with the connected nodes of any remaining node
2499   // that has no predecessor.
2500 for (SUnit &SU : SUnits) {
2501 if (NodesAdded.count(&SU) == 0) {
2502 NewSet.clear();
2503 addConnectedNodes(&SU, NewSet, NodesAdded);
2504 if (!NewSet.empty())
2505 NodeSets.push_back(NewSet);
2506 }
2507 }
2508}
2509
2510/// Add the node to the set, and add all of its connected nodes to the set.
2511void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2512 SetVector<SUnit *> &NodesAdded) {
2513 NewSet.insert(SU);
2514 NodesAdded.insert(SU);
2515 for (auto &OE : DDG->getOutEdges(SU)) {
2516 SUnit *Successor = OE.getDst();
2517 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2518 NodesAdded.count(Successor) == 0)
2519 addConnectedNodes(Successor, NewSet, NodesAdded);
2520 }
2521 for (auto &IE : DDG->getInEdges(SU)) {
2522 SUnit *Predecessor = IE.getSrc();
2523 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2524 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2525 }
2526}
2527
2528/// Return true if Set1 contains elements in Set2. The elements in common
2529/// are returned in a different container.
2530static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2531                         SmallSetVector<SUnit *, 8> &Result) {
2532   Result.clear();
2533 for (SUnit *SU : Set1) {
2534 if (Set2.count(SU) != 0)
2535 Result.insert(SU);
2536 }
2537 return !Result.empty();
2538}
2539
2540/// Merge the recurrence node sets that have the same initial node.
2541void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2542 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2543 ++I) {
2544 NodeSet &NI = *I;
2545 for (NodeSetType::iterator J = I + 1; J != E;) {
2546 NodeSet &NJ = *J;
2547 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2548 if (NJ.compareRecMII(NI) > 0)
2549 NI.setRecMII(NJ.getRecMII());
2550 for (SUnit *SU : *J)
2551 I->insert(SU);
2552 NodeSets.erase(J);
2553 E = NodeSets.end();
2554 } else {
2555 ++J;
2556 }
2557 }
2558 }
2559}
2560
2561/// Remove nodes that have been scheduled in previous NodeSets.
2562void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2563 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2564 ++I)
2565 for (NodeSetType::iterator J = I + 1; J != E;) {
2566 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2567
2568 if (J->empty()) {
2569 NodeSets.erase(J);
2570 E = NodeSets.end();
2571 } else {
2572 ++J;
2573 }
2574 }
2575}
2576
2577/// Compute an ordered list of the dependence graph nodes, which
2578/// indicates the order that the nodes will be scheduled. This is a
2579/// two-level algorithm. First, a partial order is created, which
2580/// consists of a list of sets ordered from highest to lowest priority.
2581void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2582 SmallSetVector<SUnit *, 8> R;
2583 NodeOrder.clear();
2584
2585 for (auto &Nodes : NodeSets) {
2586 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2587 OrderKind Order;
2588 SmallSetVector<SUnit *, 8> N;
2589 if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
2590 R.insert_range(N);
2591 Order = BottomUp;
2592 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2593 } else if (succ_L(NodeOrder, N, DDG.get()) &&
2594 llvm::set_is_subset(N, Nodes)) {
2595 R.insert_range(N);
2596 Order = TopDown;
2597 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2598 } else if (isIntersect(N, Nodes, R)) {
2599 // If some of the successors are in the existing node-set, then use the
2600 // top-down ordering.
2601 Order = TopDown;
2602 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2603 } else if (NodeSets.size() == 1) {
2604 for (const auto &N : Nodes)
2605 if (N->Succs.size() == 0)
2606 R.insert(N);
2607 Order = BottomUp;
2608 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2609 } else {
2610 // Find the node with the highest ASAP.
2611 SUnit *maxASAP = nullptr;
2612 for (SUnit *SU : Nodes) {
2613 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2614 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2615 maxASAP = SU;
2616 }
2617 R.insert(maxASAP);
2618 Order = BottomUp;
2619 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2620 }
2621
2622 while (!R.empty()) {
2623 if (Order == TopDown) {
2624 // Choose the node with the maximum height. If more than one, choose
2625         // the node with the maximum ZeroLatencyHeight. If still more than one,
2626 // choose the node with the lowest MOV.
2627 while (!R.empty()) {
2628 SUnit *maxHeight = nullptr;
2629 for (SUnit *I : R) {
2630 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2631 maxHeight = I;
2632 else if (getHeight(I) == getHeight(maxHeight) &&
2633 getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
2634 maxHeight = I;
2635 else if (getHeight(I) == getHeight(maxHeight) &&
2636 getZeroLatencyHeight(I) ==
2637 getZeroLatencyHeight(maxHeight) &&
2638 getMOV(I) < getMOV(maxHeight))
2639 maxHeight = I;
2640 }
2641 NodeOrder.insert(maxHeight);
2642 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2643 R.remove(maxHeight);
2644 for (const auto &OE : DDG->getOutEdges(maxHeight)) {
2645 SUnit *SU = OE.getDst();
2646 if (Nodes.count(SU) == 0)
2647 continue;
2648 if (NodeOrder.contains(SU))
2649 continue;
2650 if (OE.ignoreDependence(false))
2651 continue;
2652 R.insert(SU);
2653 }
2654
2655 // FIXME: The following loop-carried dependencies may also need to be
2656 // considered.
2657           // - Physical register dependencies (true-dependence and WAW).
2658 // - Memory dependencies.
2659 for (const auto &IE : DDG->getInEdges(maxHeight)) {
2660 SUnit *SU = IE.getSrc();
2661 if (!IE.isAntiDep())
2662 continue;
2663 if (Nodes.count(SU) == 0)
2664 continue;
2665 if (NodeOrder.contains(SU))
2666 continue;
2667 R.insert(SU);
2668 }
2669 }
2670 Order = BottomUp;
2671 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2672 SmallSetVector<SUnit *, 8> N;
2673 if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
2674 R.insert_range(N);
2675 } else {
2676 // Choose the node with the maximum depth. If more than one, choose
2677 // the node with the maximum ZeroLatencyDepth. If still more than one,
2678 // choose the node with the lowest MOV.
2679 while (!R.empty()) {
2680 SUnit *maxDepth = nullptr;
2681 for (SUnit *I : R) {
2682 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2683 maxDepth = I;
2684 else if (getDepth(I) == getDepth(maxDepth) &&
2685 getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
2686 maxDepth = I;
2687 else if (getDepth(I) == getDepth(maxDepth) &&
2688 getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
2689 getMOV(I) < getMOV(maxDepth))
2690 maxDepth = I;
2691 }
2692 NodeOrder.insert(maxDepth);
2693 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2694 R.remove(maxDepth);
2695 if (Nodes.isExceedSU(maxDepth)) {
2696 Order = TopDown;
2697 R.clear();
2698 R.insert(Nodes.getNode(0));
2699 break;
2700 }
2701 for (const auto &IE : DDG->getInEdges(maxDepth)) {
2702 SUnit *SU = IE.getSrc();
2703 if (Nodes.count(SU) == 0)
2704 continue;
2705 if (NodeOrder.contains(SU))
2706 continue;
2707 R.insert(SU);
2708 }
2709
2710 // FIXME: The following loop-carried dependencies may also need to be
2711 // considered.
2712           // - Physical register dependencies (true-dependence and WAW).
2713 // - Memory dependencies.
2714 for (const auto &OE : DDG->getOutEdges(maxDepth)) {
2715 SUnit *SU = OE.getDst();
2716 if (!OE.isAntiDep())
2717 continue;
2718 if (Nodes.count(SU) == 0)
2719 continue;
2720 if (NodeOrder.contains(SU))
2721 continue;
2722 R.insert(SU);
2723 }
2724 }
2725 Order = TopDown;
2726 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2727 SmallSetVector<SUnit *, 8> N;
2728 if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
2729 R.insert_range(N);
2730 }
2731 }
2732 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2733 }
2734
2735 LLVM_DEBUG({
2736 dbgs() << "Node order: ";
2737 for (SUnit *I : NodeOrder)
2738 dbgs() << " " << I->NodeNum << " ";
2739 dbgs() << "\n";
2740 });
2741}
2742
2743/// Process the nodes in the computed order and create the pipelined schedule
2744/// of the instructions, if possible. Return true if a schedule is found.
2745bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2746
2747   if (NodeOrder.empty()) {
2748     LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n");
2749 return false;
2750 }
2751
2752 bool scheduleFound = false;
2753 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2754 if (LimitRegPressure) {
2755 HRPDetector =
2756 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2757 HRPDetector->init(RegClassInfo);
2758 }
2759 // Keep increasing II until a valid schedule is found.
2760 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2761 Schedule.reset();
2762 Schedule.setInitiationInterval(II);
2763 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2764
2765     SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2766     SetVector<SUnit *>::iterator NE = NodeOrder.end();
2767     do {
2768 SUnit *SU = *NI;
2769
2770 // Compute the schedule time for the instruction, which is based
2771 // upon the scheduled time for any predecessors/successors.
2772 int EarlyStart = INT_MIN;
2773 int LateStart = INT_MAX;
2774 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2775 LLVM_DEBUG({
2776 dbgs() << "\n";
2777 dbgs() << "Inst (" << SU->NodeNum << ") ";
2778 SU->getInstr()->dump();
2779 dbgs() << "\n";
2780 });
2781 LLVM_DEBUG(
2782 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2783
2784 if (EarlyStart > LateStart)
2785 scheduleFound = false;
2786 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2787 scheduleFound =
2788 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2789 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2790 scheduleFound =
2791 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2792 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2793 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2794 // When scheduling a Phi it is better to start at the late cycle and
2795 // go backwards. The default order may insert the Phi too far away
2796 // from its first dependence.
2797 // Also, do backward search when all scheduled predecessors are
2798 // loop-carried output/order dependencies. Empirically, there are also
2799 // cases where scheduling becomes possible with backward search.
2800 if (SU->getInstr()->isPHI() ||
2801 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2802 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2803 else
2804 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2805 } else {
2806 int FirstCycle = Schedule.getFirstCycle();
2807 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2808 FirstCycle + getASAP(SU) + II - 1, II);
2809 }
2810
2811 // Even if we find a schedule, make sure the schedule doesn't exceed the
2812 // allowable number of stages. We keep trying if this happens.
2813 if (scheduleFound)
2814 if (SwpMaxStages > -1 &&
2815 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2816 scheduleFound = false;
2817
2818 LLVM_DEBUG({
2819 if (!scheduleFound)
2820 dbgs() << "\tCan't schedule\n";
2821 });
2822 } while (++NI != NE && scheduleFound);
2823
2824 // If a schedule is found, validate it against the validation-only
2825 // dependencies.
2826 if (scheduleFound)
2827 scheduleFound = DDG->isValidSchedule(Schedule);
2828
2829 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2830 if (scheduleFound)
2831 scheduleFound =
2832 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2833
2834 // If a schedule is found, check if it is a valid schedule too.
2835 if (scheduleFound)
2836 scheduleFound = Schedule.isValidSchedule(this);
2837
2838 // If a schedule was found and the option is enabled, check if the schedule
2839 // might generate additional register spills/fills.
2840 if (scheduleFound && LimitRegPressure)
2841 scheduleFound =
2842 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2843 }
2844
2845 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2846 << " (II=" << Schedule.getInitiationInterval()
2847 << ")\n");
2848
2849 if (scheduleFound) {
2850 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2851 if (!scheduleFound)
2852 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2853 }
2854
2855 if (scheduleFound) {
2856 Schedule.finalizeSchedule(this);
2857 Pass.ORE->emit([&]() {
2858 return MachineOptimizationRemarkAnalysis(
2859 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2860 << "Schedule found with Initiation Interval: "
2861 << ore::NV("II", Schedule.getInitiationInterval())
2862 << ", MaxStageCount: "
2863 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2864 });
2865 } else
2866 Schedule.reset();
2867
2868 return scheduleFound && Schedule.getMaxStageCount() > 0;
2869}
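// Sketch of the scheduling window logic above (illustrative numbers): with
// II == 4, EarlyStart == 2, and LateStart == 9, LateStart is first clamped to
// 2 + 4 - 1 == 5. A non-Phi node then searches forward over cycles 2..5,
// while a Phi searches backwards from 5 down to 2 so that it lands close to
// its first dependence.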
2870
2871 static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI) {
2872   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2873 Register Result;
2874 for (const MachineOperand &Use : MI.all_uses()) {
2875 Register Reg = Use.getReg();
2876 if (!Reg.isVirtual())
2877 return Register();
2878 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2879 continue;
2880 if (Result)
2881 return Register();
2882 Result = Reg;
2883 }
2884 return Result;
2885}
2886
2887/// When Op is a value that is incremented recursively in a loop and there is a
2888/// unique instruction that increments it, returns true and sets Value.
2889 static bool findLoopIncrementValue(const MachineOperand &Op, int &Value) {
2890   if (!Op.isReg() || !Op.getReg().isVirtual())
2891 return false;
2892
2893 Register OrgReg = Op.getReg();
2894 Register CurReg = OrgReg;
2895 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2896 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2897
2898 const TargetInstrInfo *TII =
2899 LoopBB->getParent()->getSubtarget().getInstrInfo();
2900 const TargetRegisterInfo *TRI =
2901 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2902
2903 MachineInstr *Phi = nullptr;
2904 MachineInstr *Increment = nullptr;
2905
2906 // Traverse definitions until it reaches Op or an instruction that does not
2907 // satisfy the condition.
2908 // Acceptable example:
2909 // bb.0:
2910 // %0 = PHI %3, %bb.0, ...
2911 // %2 = ADD %0, Value
2912 // ... = LOAD %2(Op)
2913 // %3 = COPY %2
2914 while (true) {
2915 if (!CurReg.isValid() || !CurReg.isVirtual())
2916 return false;
2917 MachineInstr *Def = MRI.getVRegDef(CurReg);
2918 if (Def->getParent() != LoopBB)
2919 return false;
2920
2921 if (Def->isCopy()) {
2922 // Ignore copy instructions unless they contain subregisters
2923 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2924 return false;
2925 CurReg = Def->getOperand(1).getReg();
2926 } else if (Def->isPHI()) {
2927 // There must be just one Phi
2928 if (Phi)
2929 return false;
2930 Phi = Def;
2931 CurReg = getLoopPhiReg(*Def, LoopBB);
2932 } else if (TII->getIncrementValue(*Def, Value)) {
2933 // Potentially a unique increment
2934 if (Increment)
2935 // Multiple increments exist
2936 return false;
2937
2938 const MachineOperand *BaseOp;
2939 int64_t Offset;
2940 bool OffsetIsScalable;
2941 if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2942 TRI)) {
2943 // Pre/post increment instruction
2944 CurReg = BaseOp->getReg();
2945 } else {
2946 // If only one of the operands is defined within the loop, it is assumed
2947 // to be an incremented value.
2948 CurReg = findUniqueOperandDefinedInLoop(*Def);
2949 if (!CurReg.isValid())
2950 return false;
2951 }
2952 Increment = Def;
2953 } else {
2954 return false;
2955 }
2956 if (CurReg == OrgReg)
2957 break;
2958 }
2959
2960 if (!Phi || !Increment)
2961 return false;
2962
2963 return true;
2964}
2965
2966/// Return true if we can compute the amount the instruction changes
2967/// during each iteration. Set Delta to the amount of the change.
2968bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2969 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2970 const MachineOperand *BaseOp;
2971 int64_t Offset;
2972 bool OffsetIsScalable;
2973 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2974 return false;
2975
2976 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2977 if (OffsetIsScalable)
2978 return false;
2979
2980 if (!BaseOp->isReg())
2981 return false;
2982
2983 return findLoopIncrementValue(*BaseOp, Delta);
2984}
2985
2986/// Check if we can change the instruction to use an offset value from the
2987/// previous iteration. If so, return true and set the base and offset values
2988/// so that we can rewrite the load, if necessary.
2989/// v1 = Phi(v0, v3)
2990/// v2 = load v1, 0
2991/// v3 = post_store v1, 4, x
2992/// This function enables the load to be rewritten as v2 = load v3, 4.
2993bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2994 unsigned &BasePos,
2995 unsigned &OffsetPos,
2996 Register &NewBase,
2997 int64_t &Offset) {
2998 // Get the load instruction.
2999 if (TII->isPostIncrement(*MI))
3000 return false;
3001 unsigned BasePosLd, OffsetPosLd;
3002 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
3003 return false;
3004 Register BaseReg = MI->getOperand(BasePosLd).getReg();
3005
3006 // Look for the Phi instruction.
3007 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
3008 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3009 if (!Phi || !Phi->isPHI())
3010 return false;
3011 // Get the register defined in the loop block.
3012 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3013 if (!PrevReg)
3014 return false;
3015
3016 // Check for the post-increment load/store instruction.
3017 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3018 if (!PrevDef || PrevDef == MI)
3019 return false;
3020
3021 if (!TII->isPostIncrement(*PrevDef))
3022 return false;
3023
3024 unsigned BasePos1 = 0, OffsetPos1 = 0;
3025 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3026 return false;
3027
3028 // Make sure that the instructions do not access the same memory location in
3029 // the next iteration.
3030 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3031 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3032 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3033 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3034 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3035 MF.deleteMachineInstr(NewMI);
3036 if (!Disjoint)
3037 return false;
3038
3039 // Set the return value once we determine that we return true.
3040 BasePos = BasePosLd;
3041 OffsetPos = OffsetPosLd;
3042 NewBase = PrevReg;
3043 Offset = StoreOffset;
3044 return true;
3045}
3046
3047 /// Apply changes to the instruction if needed. The changes are needed
3048 /// to improve the scheduling and depend upon the final schedule.
3049 void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
3050                                          SMSchedule &Schedule) {
3051 SUnit *SU = getSUnit(MI);
3052   std::map<SUnit *, std::pair<Register, int64_t>>::iterator It =
3053       InstrChanges.find(SU);
3054 if (It != InstrChanges.end()) {
3055 std::pair<Register, int64_t> RegAndOffset = It->second;
3056 unsigned BasePos, OffsetPos;
3057 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3058 return;
3059 Register BaseReg = MI->getOperand(BasePos).getReg();
3060 MachineInstr *LoopDef = findDefInLoop(BaseReg);
3061 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
3062 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
3063 int BaseStageNum = Schedule.stageScheduled(SU);
3064 int BaseCycleNum = Schedule.cycleScheduled(SU);
3065 if (BaseStageNum < DefStageNum) {
3066 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3067 int OffsetDiff = DefStageNum - BaseStageNum;
3068 if (DefCycleNum < BaseCycleNum) {
3069 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
3070 if (OffsetDiff > 0)
3071 --OffsetDiff;
3072 }
3073 int64_t NewOffset =
3074 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
3075 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3076 SU->setInstr(NewMI);
3077 MISUnitMap[NewMI] = SU;
3078 NewMIs[MI] = NewMI;
3079 }
3080 }
3081}
3082
3083/// Return the instruction in the loop that defines the register.
3084/// If the definition is a Phi, then follow the Phi operand to
3085/// the instruction in the loop.
3086 MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
3087   SmallPtrSet<MachineInstr *, 8> Visited;
3088 MachineInstr *Def = MRI.getVRegDef(Reg);
3089 while (Def->isPHI()) {
3090 if (!Visited.insert(Def).second)
3091 break;
3092 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3093 if (Def->getOperand(i + 1).getMBB() == BB) {
3094 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3095 break;
3096 }
3097 }
3098 return Def;
3099}
3100
3101/// Return false if there is no overlap between the region accessed by BaseMI in
3102/// an iteration and the region accessed by OtherMI in subsequent iterations.
3104 const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
3105 int DeltaB, DeltaO, Delta;
3106 if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
3107 DeltaB != DeltaO)
3108 return true;
3109 Delta = DeltaB;
3110
3111 const MachineOperand *BaseOpB, *BaseOpO;
3112 int64_t OffsetB, OffsetO;
3113 bool OffsetBIsScalable, OffsetOIsScalable;
3114 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3115 if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
3116 OffsetBIsScalable, TRI) ||
3117 !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
3118 OffsetOIsScalable, TRI))
3119 return true;
3120
3121 if (OffsetBIsScalable || OffsetOIsScalable)
3122 return true;
3123
3124 if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
3125     // Handle cases with different base operands but the same initial values.
3126     // This typically occurs when pre/post-increment is used.
3127
3128 if (!BaseOpB->isReg() || !BaseOpO->isReg())
3129 return true;
3130 Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
3131 if (!RegB.isVirtual() || !RegO.isVirtual())
3132 return true;
3133
3134 MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
3135 MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
3136 if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
3137 return true;
3138
3139 Register InitValB;
3140 Register LoopValB;
3141 Register InitValO;
3142 Register LoopValO;
3143 getPhiRegs(*DefB, BB, InitValB, LoopValB);
3144 getPhiRegs(*DefO, BB, InitValO, LoopValO);
3145 MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
3146 MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
3147
3148 if (!InitDefB->isIdenticalTo(*InitDefO))
3149 return true;
3150 }
3151
3152 LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3153 LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3154
3155 // This is the main test, which checks the offset values and the loop
3156 // increment value to determine if the accesses may be loop carried.
3157 if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
3158 return true;
3159
3160 LLVM_DEBUG({
3161 dbgs() << "Overlap check:\n";
3162 dbgs() << " BaseMI: ";
3163 BaseMI->dump();
3164 dbgs() << " Base + " << OffsetB << " + I * " << Delta
3165 << ", Len: " << AccessSizeB.getValue() << "\n";
3166 dbgs() << " OtherMI: ";
3167 OtherMI->dump();
3168 dbgs() << " Base + " << OffsetO << " + I * " << Delta
3169 << ", Len: " << AccessSizeO.getValue() << "\n";
3170 });
3171
3172 // Excessive overlap may be detected in strided patterns.
3173 // For example, the memory addresses of the store and the load in
3174 // for (i=0; i<n; i+=2) a[i+1] = a[i];
3175 // are assumed to overlap.
3176 if (Delta < 0) {
3177 int64_t BaseMinAddr = OffsetB;
3178     int64_t OtherNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
3179     if (BaseMinAddr > OtherNextIterMaxAddr) {
3180 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3181 return false;
3182 }
3183 } else {
3184 int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
3185 int64_t OtherNextIterMinAddr = OffsetO + Delta;
3186 if (BaseMaxAddr < OtherNextIterMinAddr) {
3187 LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3188 return false;
3189 }
3190 }
3191 LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3192 return true;
3193}
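// Worked example (illustrative): a 4-byte access at Base + 0 (BaseMI) and a
// 4-byte access at Base + 4 (OtherMI) with Delta == 8 give BaseMaxAddr == 3
// and OtherNextIterMinAddr == 4 + 8 == 12. Since 3 < 12, the regions can
// never overlap in a later iteration and the function returns false.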
3194
3195 /// Return true for an order or output dependence that is potentially loop
3196 /// carried. A dependence is loop carried if the destination defines a value
3197/// that may be used or defined by the source in a subsequent iteration.
3198 bool SwingSchedulerDAG::isLoopCarriedDep(
3199     const SwingSchedulerDDGEdge &Edge) const {
3200 if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
3201 Edge.getDst()->isBoundaryNode())
3202 return false;
3203
3204   if (!SwpPruneLoopCarried)
3205     return true;
3206
3207 if (Edge.isOutputDep())
3208 return true;
3209
3210 MachineInstr *SI = Edge.getSrc()->getInstr();
3211 MachineInstr *DI = Edge.getDst()->getInstr();
3212 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3213
3214 // Assume ordered loads and stores may have a loop carried dependence.
3215 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
3216 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
3217 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
3218 return true;
3219
3220 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
3221 return false;
3222
3223 // The conservative assumption is that a dependence between memory operations
3224 // may be loop carried. The following code checks when it can be proved that
3225 // there is no loop carried dependence.
3226 return mayOverlapInLaterIter(DI, SI);
3227}
3228
3229void SwingSchedulerDAG::postProcessDAG() {
3230 for (auto &M : Mutations)
3231 M->apply(this);
3232}
3233
3234/// Try to schedule the node at the specified StartCycle and continue
3235 /// until the node is scheduled or the EndCycle is reached. This function
3236/// returns true if the node is scheduled. This routine may search either
3237/// forward or backward for a place to insert the instruction based upon
3238/// the relative values of StartCycle and EndCycle.
3239bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3240 bool forward = true;
3241 LLVM_DEBUG({
3242 dbgs() << "Trying to insert node between " << StartCycle << " and "
3243 << EndCycle << " II: " << II << "\n";
3244 });
3245 if (StartCycle > EndCycle)
3246 forward = false;
3247
3248 // The terminating condition depends on the direction.
3249 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3250 for (int curCycle = StartCycle; curCycle != termCycle;
3251 forward ? ++curCycle : --curCycle) {
3252
3253 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3254 ProcItinResources.canReserveResources(*SU, curCycle)) {
3255 LLVM_DEBUG({
3256 dbgs() << "\tinsert at cycle " << curCycle << " ";
3257 SU->getInstr()->dump();
3258 });
3259
3260 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3261 ProcItinResources.reserveResources(*SU, curCycle);
3262 ScheduledInstrs[curCycle].push_back(SU);
3263 InstrToCycle.insert(std::make_pair(SU, curCycle));
3264 if (curCycle > LastCycle)
3265 LastCycle = curCycle;
3266 if (curCycle < FirstCycle)
3267 FirstCycle = curCycle;
3268 return true;
3269 }
3270 LLVM_DEBUG({
3271 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3272 SU->getInstr()->dump();
3273 });
3274 }
3275 return false;
3276}
3277
3278// Return the cycle of the earliest scheduled instruction in the chain.
3279 int SMSchedule::earliestCycleInChain(const SwingSchedulerDDGEdge &Dep,
3280                                      const SwingSchedulerDDG *DDG) {
3281   SmallPtrSet<SUnit *, 8> Visited;
3282   SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
3283   Worklist.push_back(Dep);
3284 int EarlyCycle = INT_MAX;
3285 while (!Worklist.empty()) {
3286 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
3287 SUnit *PrevSU = Cur.getSrc();
3288 if (Visited.count(PrevSU))
3289 continue;
3290 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
3291 if (it == InstrToCycle.end())
3292 continue;
3293 EarlyCycle = std::min(EarlyCycle, it->second);
3294 for (const auto &IE : DDG->getInEdges(PrevSU))
3295 if (IE.isOrderDep() || IE.isOutputDep())
3296 Worklist.push_back(IE);
3297 Visited.insert(PrevSU);
3298 }
3299 return EarlyCycle;
3300}
3301
3302// Return the cycle of the latest scheduled instruction in the chain.
3303 int SMSchedule::latestCycleInChain(const SwingSchedulerDDGEdge &Dep,
3304                                    const SwingSchedulerDDG *DDG) {
3305   SmallPtrSet<SUnit *, 8> Visited;
3306   SmallVector<SwingSchedulerDDGEdge, 8> Worklist;
3307   Worklist.push_back(Dep);
3308 int LateCycle = INT_MIN;
3309 while (!Worklist.empty()) {
3310 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
3311 SUnit *SuccSU = Cur.getDst();
3312 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
3313 continue;
3314 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
3315 if (it == InstrToCycle.end())
3316 continue;
3317 LateCycle = std::max(LateCycle, it->second);
3318 for (const auto &OE : DDG->getOutEdges(SuccSU))
3319 if (OE.isOrderDep() || OE.isOutputDep())
3320 Worklist.push_back(OE);
3321 Visited.insert(SuccSU);
3322 }
3323 return LateCycle;
3324}
3325
3326 /// If an instruction has a use that spans multiple iterations, then
3327 /// return the Phi that defines it. These instructions are characterized by
3328 /// having a back-edge to a Phi, which contains a reference to another Phi.
3329 static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
3330   for (auto &P : SU->Preds)
3331 if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
3332 for (auto &S : P.getSUnit()->Succs)
3333 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3334 return P.getSUnit();
3335 return nullptr;
3336}
3337
3338/// Compute the scheduling start slot for the instruction. The start slot
3339/// depends on any predecessor or successor nodes scheduled already.
3340void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3341 int II, SwingSchedulerDAG *DAG) {
3342 const SwingSchedulerDDG *DDG = DAG->getDDG();
3343
3344 // Iterate over each instruction that has been scheduled already. The start
3345 // slot computation depends on whether the previously scheduled instruction
3346 // is a predecessor or successor of the specified instruction.
3347 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3348 for (SUnit *I : getInstructions(cycle)) {
3349 for (const auto &IE : DDG->getInEdges(SU)) {
3350 if (IE.getSrc() == I) {
3351 // FIXME: Add reverse edge to `DDG` instead of calling
3352 // `isLoopCarriedDep`
3353 if (DAG->isLoopCarriedDep(IE)) {
3354 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3355 *MinLateStart = std::min(*MinLateStart, End);
3356 }
3357 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3358 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3359 }
3360 }
3361
3362 for (const auto &OE : DDG->getOutEdges(SU)) {
3363 if (OE.getDst() == I) {
3364 // FIXME: Add reverse edge to `DDG` instead of calling
3365 // `isLoopCarriedDep`
3366 if (DAG->isLoopCarriedDep(OE)) {
3367 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3368 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3369 }
3370 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3371 *MinLateStart = std::min(*MinLateStart, LateStart);
3372 }
3373 }
3374
3375 SUnit *BE = multipleIterations(I, DAG);
3376 for (const auto &Dep : SU->Preds) {
3377       // For an instruction that requires multiple iterations, make sure that
3378 // the dependent instruction is not scheduled past the definition.
3379 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3380 !SU->isPred(I))
3381 *MinLateStart = std::min(*MinLateStart, cycle);
3382 }
3383 }
3384 }
3385}
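// Example (illustrative): if a predecessor scheduled at cycle 6 feeds SU with
// latency 2 and distance 1 at II == 4, then EarlyStart == 6 + 2 - 1 * 4 == 4.
// A successor already scheduled at cycle 10 reached with latency 1 and
// distance 0 yields LateStart == 10 - 1 + 0 == 9, so SU must be placed within
// cycles 4..9.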
3386
3387/// Order the instructions within a cycle so that the definitions occur
3388 /// before the uses. The instruction is added to the front of the list if it
3389 /// must be ordered before a use; otherwise it is added to the end.
3390 void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
3391                                  std::deque<SUnit *> &Insts) const {
3392 MachineInstr *MI = SU->getInstr();
3393 bool OrderBeforeUse = false;
3394 bool OrderAfterDef = false;
3395 bool OrderBeforeDef = false;
3396 unsigned MoveDef = 0;
3397 unsigned MoveUse = 0;
3398 int StageInst1 = stageScheduled(SU);
3399 const SwingSchedulerDDG *DDG = SSD->getDDG();
3400
3401 unsigned Pos = 0;
3402 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3403 ++I, ++Pos) {
3404 for (MachineOperand &MO : MI->operands()) {
3405 if (!MO.isReg() || !MO.getReg().isVirtual())
3406 continue;
3407
3408 Register Reg = MO.getReg();
3409 unsigned BasePos, OffsetPos;
3410 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
3411 if (MI->getOperand(BasePos).getReg() == Reg)
3412 if (Register NewReg = SSD->getInstrBaseReg(SU))
3413 Reg = NewReg;
3414 bool Reads, Writes;
3415 std::tie(Reads, Writes) =
3416 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
3417 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
3418 OrderBeforeUse = true;
3419 if (MoveUse == 0)
3420 MoveUse = Pos;
3421 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
3422 // Add the instruction after the scheduled instruction.
3423 OrderAfterDef = true;
3424 MoveDef = Pos;
3425 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
3426 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
3427 OrderBeforeUse = true;
3428 if (MoveUse == 0)
3429 MoveUse = Pos;
3430 } else {
3431 OrderAfterDef = true;
3432 MoveDef = Pos;
3433 }
3434 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
3435 OrderBeforeUse = true;
3436 if (MoveUse == 0)
3437 MoveUse = Pos;
3438 if (MoveUse != 0) {
3439 OrderAfterDef = true;
3440 MoveDef = Pos - 1;
3441 }
3442 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
3443 // Add the instruction before the scheduled instruction.
3444 OrderBeforeUse = true;
3445 if (MoveUse == 0)
3446 MoveUse = Pos;
3447 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
3448 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
3449 if (MoveUse == 0) {
3450 OrderBeforeDef = true;
3451 MoveUse = Pos;
3452 }
3453 }
3454 }
3455 // Check for order dependences between instructions. Make sure the source
3456 // is ordered before the destination.
3457 for (auto &OE : DDG->getOutEdges(SU)) {
3458 if (OE.getDst() != *I)
3459 continue;
3460 if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
3461 OrderBeforeUse = true;
3462 if (Pos < MoveUse)
3463 MoveUse = Pos;
3464 }
3465 // We did not handle HW dependences in the previous loop, and we
3466 // normally set Latency = 0 for Anti/Output deps, so nodes in the same
3467 // cycle may be Anti/Output dependent on HW regs.
3468 else if ((OE.isAntiDep() || OE.isOutputDep()) &&
3469 stageScheduled(*I) == StageInst1) {
3470 OrderBeforeUse = true;
3471 if ((MoveUse == 0) || (Pos < MoveUse))
3472 MoveUse = Pos;
3473 }
3474 }
3475 for (auto &IE : DDG->getInEdges(SU)) {
3476 if (IE.getSrc() != *I)
3477 continue;
3478 if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
3479 stageScheduled(*I) == StageInst1) {
3480 OrderAfterDef = true;
3481 MoveDef = Pos;
3482 }
3483 }
3484 }
3485
3486 // A circular dependence.
3487 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3488 OrderBeforeUse = false;
3489
3490 // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
3491 // to a loop-carried dependence.
3492 if (OrderBeforeDef)
3493 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3494
3495 // The uncommon case when the instruction order needs to be updated because
3496 // there is both a use and def.
3497 if (OrderBeforeUse && OrderAfterDef) {
3498 SUnit *UseSU = Insts.at(MoveUse);
3499 SUnit *DefSU = Insts.at(MoveDef);
3500 if (MoveUse > MoveDef) {
3501 Insts.erase(Insts.begin() + MoveUse);
3502 Insts.erase(Insts.begin() + MoveDef);
3503 } else {
3504 Insts.erase(Insts.begin() + MoveDef);
3505 Insts.erase(Insts.begin() + MoveUse);
3506 }
3507 orderDependence(SSD, UseSU, Insts);
3508 orderDependence(SSD, SU, Insts);
3509 orderDependence(SSD, DefSU, Insts);
3510 return;
3511 }
3512 // Put the new instruction first if there is a use in the list. Otherwise,
3513 // put it at the end of the list.
3514 if (OrderBeforeUse)
3515 Insts.push_front(SU);
3516 else
3517 Insts.push_back(SU);
3518}
3519
3520/// Return true if the scheduled Phi has a loop carried operand.
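/// For example, for %v = phi(%init, %loop) (register names illustrative),
/// the Phi is loop carried if the instruction defining %loop is scheduled in
/// a later cycle than the Phi, or in a stage less than or equal to the Phi's
/// stage.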
3521bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
3522 MachineInstr &Phi) const {
3523 if (!Phi.isPHI())
3524 return false;
3525 assert(Phi.isPHI() && "Expecting a Phi.");
3526 SUnit *DefSU = SSD->getSUnit(&Phi);
3527 unsigned DefCycle = cycleScheduled(DefSU);
3528 int DefStage = stageScheduled(DefSU);
3529
3530 Register InitVal;
3531 Register LoopVal;
3532 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3533 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3534 if (!UseSU)
3535 return true;
3536 if (UseSU->getInstr()->isPHI())
3537 return true;
3538 unsigned LoopCycle = cycleScheduled(UseSU);
3539 int LoopStage = stageScheduled(UseSU);
3540 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3541}
3542
3543/// Return true if the instruction is a definition that is loop carried
3544/// and defines the use on the next iteration.
3545/// v1 = phi(v2, v3)
3546/// (Def) v3 = op v1
3547/// (MO) = v1
3548/// If MO appears before Def, then v1 and v3 may get assigned to the same
3549/// register.
3550bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
3551 MachineInstr *Def,
3552 MachineOperand &MO) const {
3553 if (!MO.isReg())
3554 return false;
3555 if (Def->isPHI())
3556 return false;
3557 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3558 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3559 return false;
3560 if (!isLoopCarried(SSD, *Phi))
3561 return false;
3562 Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3563 for (MachineOperand &DMO : Def->all_defs()) {
3564 if (DMO.getReg() == LoopReg)
3565 return true;
3566 }
3567 return false;
3568}
3569
3570/// Return true if all scheduled predecessors are loop-carried output/order
3571/// dependencies.
3572bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
3573 SUnit *SU, const SwingSchedulerDDG *DDG) const {
3574 for (const auto &IE : DDG->getInEdges(SU))
3575 if (InstrToCycle.count(IE.getSrc()))
3576 return false;
3577 return true;
3578}
3579
3580/// Determine transitive dependences of unpipelineable instructions
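/// For example, if the target reports SU(5) via shouldIgnoreForPipelining
/// and SU(3) is a predecessor of SU(5), both end up in the returned set
/// (node numbers are illustrative).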
3581SmallPtrSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
3582 SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3583 SmallPtrSet<SUnit *, 8> DoNotPipeline;
3584 SmallVector<SUnit *, 8> Worklist;
3585
3586 for (auto &SU : SSD->SUnits)
3587 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3588 Worklist.push_back(&SU);
3589
3590 const SwingSchedulerDDG *DDG = SSD->getDDG();
3591 while (!Worklist.empty()) {
3592 auto SU = Worklist.pop_back_val();
3593 if (DoNotPipeline.count(SU))
3594 continue;
3595 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3596 DoNotPipeline.insert(SU);
3597 for (const auto &IE : DDG->getInEdges(SU))
3598 Worklist.push_back(IE.getSrc());
3599
3600 // To preserve previous behavior and prevent regression
3601 // FIXME: Remove if this doesn't have significant impact on performance
3602 for (const auto &OE : DDG->getOutEdges(SU))
3603 if (OE.getDistance() == 1)
3604 Worklist.push_back(OE.getDst());
3605 }
3606 return DoNotPipeline;
3607}
3608
3609// Determine all instructions upon which any unpipelineable instruction depends
3610// and ensure that they are in stage 0. If unable to do so, return false.
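// For example, with FirstCycle = 0 and II = 5, each such instruction must
// land in cycles 0..4 (stage 0); if a dependence forces one of them to
// cycle 5 or later, the schedule is rejected.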
3611bool SMSchedule::normalizeNonPipelinedInstructions(
3612 SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
3613 SmallPtrSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
3614
3615 int NewLastCycle = INT_MIN;
3616 for (SUnit &SU : SSD->SUnits) {
3617 if (!SU.isInstr())
3618 continue;
3619 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3620 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3621 continue;
3622 }
3623
3624 // Put the non-pipelined instruction as early as possible in the schedule
3625 int NewCycle = getFirstCycle();
3626 for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
3627 if (IE.getDistance() == 0)
3628 NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);
3629
3630 // To preserve previous behavior and prevent regression
3631 // FIXME: Remove if this doesn't have significant impact on performance
3632 for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
3633 if (OE.getDistance() == 1)
3634 NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);
3635
3636 int OldCycle = InstrToCycle[&SU];
3637 if (OldCycle != NewCycle) {
3638 InstrToCycle[&SU] = NewCycle;
3639 auto &OldS = getInstructions(OldCycle);
3640 llvm::erase(OldS, &SU);
3641 getInstructions(NewCycle).emplace_back(&SU);
3642 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3643 << ") is not pipelined; moving from cycle " << OldCycle
3644 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3645 }
3646
3647 // We traverse the SUs in the order of the original basic block. Computing
3648 // NewCycle in this order normally works fine because all dependencies
3649 // (except for loop-carried dependencies) don't violate the original order.
3650 // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
3651 // break it. That is, there may exist an artificial dependency from
3652 // bottom to top. In such a case, NewCycle may become too large to be
3653 // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
3654 // following case:
3655 //
3656 // | Inst0 <-+
3657 // SU order | | artificial dep
3658 // | Inst1 --+
3659 // v
3660 //
3661 // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
3662 // Inst0 must be greater than or equal to N so that Inst0 is not
3663 // scheduled at Stage 0. In such cases, we reject this schedule at this
3664 // time.
3665 // FIXME: The reason for this is the existence of artificial dependencies
3666 // that contradict the original SU order. If ignoring artificial
3667 // dependencies does not affect correctness, then it is better to ignore
3668 // them.
3669 if (FirstCycle + InitiationInterval <= NewCycle)
3670 return false;
3671
3672 NewLastCycle = std::max(NewLastCycle, NewCycle);
3673 }
3674 LastCycle = NewLastCycle;
3675 return true;
3676}
3677
3678// Check if the generated schedule is valid. This function checks if
3679// an instruction that uses a physical register is scheduled in a
3680// different stage than the definition. The pipeliner does not handle
3681// physical register values that may cross a basic block boundary.
3682// Furthermore, if a physical def/use pair is assigned to the same
3683// cycle, orderDependence does not guarantee def/use ordering, so that
3684// case should be considered invalid. (The test checks for both
3685// earlier and same-cycle use to be more robust.)
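// For example, if a physical register is defined in stage 0 at cycle 3, any
// use of it must also be scheduled in stage 0 and in cycle 4 or later for
// the schedule to be accepted.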
3686bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3687 for (SUnit &SU : SSD->SUnits) {
3688 if (!SU.hasPhysRegDefs)
3689 continue;
3690 int StageDef = stageScheduled(&SU);
3691 int CycleDef = InstrToCycle[&SU];
3692 assert(StageDef != -1 && "Instruction should have been scheduled.");
3693 for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
3694 SUnit *Dst = OE.getDst();
3695 if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
3696 if (OE.getReg().isPhysical()) {
3697 if (stageScheduled(Dst) != StageDef)
3698 return false;
3699 if (InstrToCycle[Dst] <= CycleDef)
3700 return false;
3701 }
3702 }
3703 }
3704 return true;
3705}
3706
3707/// A property of the node order in swing-modulo-scheduling is
3708/// that for nodes outside circuits the following holds:
3709/// none of them is scheduled after both a successor and a
3710/// predecessor.
3711/// The method below checks whether the property is met.
3712/// If not, debug information is printed and statistics information updated.
3713/// Note that we do not use an assert statement.
3714/// The reason is that although an invalid node order may prevent
3715/// the pipeliner from finding a pipelined schedule for arbitrary II,
3716/// it does not lead to the generation of incorrect code.
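/// For example, if SU(7) is outside every circuit but both its predecessor
/// SU(2) and its successor SU(9) appear before it in NodeOrder, a node-order
/// issue is reported (node numbers are illustrative).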
3717void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3718
3719 // a sorted vector that maps each SUnit to its index in the NodeOrder
3720 typedef std::pair<SUnit *, unsigned> UnitIndex;
3721 std::vector<UnitIndex> Indices;
3722
3723 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3724 Indices.push_back(std::make_pair(NodeOrder[i], i));
3725
3726 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3727 return std::get<0>(i1) < std::get<0>(i2);
3728 };
3729
3730 // sort, so that we can perform a binary search
3731 llvm::sort(Indices, CompareKey);
3732
3733 bool Valid = true;
3734 (void)Valid;
3735 // for each SUnit in the NodeOrder, check whether
3736 // it appears after both a successor and a predecessor
3737 // of the SUnit. If this is the case, and the SUnit
3738 // is not part of circuit, then the NodeOrder is not
3739 // valid.
3740 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3741 SUnit *SU = NodeOrder[i];
3742 unsigned Index = i;
3743
3744 bool PredBefore = false;
3745 bool SuccBefore = false;
3746
3747 SUnit *Succ;
3748 SUnit *Pred;
3749 (void)Succ;
3750 (void)Pred;
3751
3752 for (const auto &IE : DDG->getInEdges(SU)) {
3753 SUnit *PredSU = IE.getSrc();
3754 unsigned PredIndex = std::get<1>(
3755 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3756 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3757 PredBefore = true;
3758 Pred = PredSU;
3759 break;
3760 }
3761 }
3762
3763 for (const auto &OE : DDG->getOutEdges(SU)) {
3764 SUnit *SuccSU = OE.getDst();
3765 // Do not process a boundary node, it was not included in NodeOrder,
3766 // hence not in Indices either, call to std::lower_bound() below will
3767 // return Indices.end().
3768 if (SuccSU->isBoundaryNode())
3769 continue;
3770 unsigned SuccIndex = std::get<1>(
3771 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3772 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3773 SuccBefore = true;
3774 Succ = SuccSU;
3775 break;
3776 }
3777 }
3778
3779 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3780 // instructions in circuits are allowed to be scheduled
3781 // after both a successor and predecessor.
3782 bool InCircuit = llvm::any_of(
3783 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3784 if (InCircuit)
3785 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3786 else {
3787 Valid = false;
3788 NumNodeOrderIssues++;
3789 LLVM_DEBUG(dbgs() << "Predecessor ");
3790 }
3791 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3792 << " are scheduled before node " << SU->NodeNum
3793 << "\n");
3794 }
3795 }
3796
3797 LLVM_DEBUG({
3798 if (!Valid)
3799 dbgs() << "Invalid node order found!\n";
3800 });
3801}
3802
3803/// Attempt to fix the degenerate cases when the instruction serialization
3804/// causes the register lifetimes to overlap. For example,
3805/// p' = store_pi(p, b)
3806/// = load p, offset
3807/// In this case p and p' overlap, which means that two registers are needed.
3808/// Instead, this function changes the load to use p' and updates the offset.
3809void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3810 Register OverlapReg;
3811 Register NewBaseReg;
3812 for (SUnit *SU : Instrs) {
3813 MachineInstr *MI = SU->getInstr();
3814 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3815 const MachineOperand &MO = MI->getOperand(i);
3816 // Look for an instruction that uses p. The instruction occurs in the
3817 // same cycle but occurs later in the serialized order.
3818 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3819 // Check that the instruction appears in the InstrChanges structure,
3820 // which contains instructions that can have the offset updated.
3821 DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
3822 InstrChanges.find(SU);
3823 if (It != InstrChanges.end()) {
3824 unsigned BasePos, OffsetPos;
3825 // Update the base register and adjust the offset.
3826 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3827 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3828 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3829 int64_t NewOffset =
3830 MI->getOperand(OffsetPos).getImm() - It->second.second;
3831 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3832 SU->setInstr(NewMI);
3833 MISUnitMap[NewMI] = SU;
3834 NewMIs[MI] = NewMI;
3835 }
3836 }
3837 OverlapReg = Register();
3838 NewBaseReg = Register();
3839 break;
3840 }
3841 // Look for an instruction of the form p' = op(p), which uses and defines
3842 // two virtual registers that get allocated to the same physical register.
3843 unsigned TiedUseIdx = 0;
3844 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3845 // OverlapReg is p in the example above.
3846 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3847 // NewBaseReg is p' in the example above.
3848 NewBaseReg = MI->getOperand(i).getReg();
3849 break;
3850 }
3851 }
3852 }
3853}
3854
3855std::deque<SUnit *>
3856SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
3857 const std::deque<SUnit *> &Instrs) const {
3858 std::deque<SUnit *> NewOrderPhi;
3859 for (SUnit *SU : Instrs) {
3860 if (SU->getInstr()->isPHI())
3861 NewOrderPhi.push_back(SU);
3862 }
3863 std::deque<SUnit *> NewOrderI;
3864 for (SUnit *SU : Instrs) {
3865 if (!SU->getInstr()->isPHI())
3866 orderDependence(SSD, SU, NewOrderI);
3867 }
3868 llvm::append_range(NewOrderPhi, NewOrderI);
3869 return NewOrderPhi;
3870}
3871
3872/// After the schedule has been formed, call this function to combine
3873/// the instructions from the different stages/cycles. That is, this
3874/// function creates a schedule that represents a single iteration.
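/// For example, with II = 2, an instruction that sits at flat cycle 3
/// (stage 1 of cycle 1, since 1 + 1 * 2 = 3) is folded back into cycle 1 of
/// the single-iteration schedule.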
3875void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3876 // Move all instructions to the first stage from later stages.
3877 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3878 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3879 ++stage) {
3880 std::deque<SUnit *> &cycleInstrs =
3881 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3882 for (SUnit *SU : llvm::reverse(cycleInstrs))
3883 ScheduledInstrs[cycle].push_front(SU);
3884 }
3885 }
3886
3887 // Erase all the elements in the later stages. Only one iteration should
3888 // remain in the scheduled list, and it contains all the instructions.
3889 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3890 ScheduledInstrs.erase(cycle);
3891
3892 // Change the registers in instruction as specified in the InstrChanges
3893 // map. We need to use the new registers to create the correct order.
3894 for (const SUnit &SU : SSD->SUnits)
3895 SSD->applyInstrChange(SU.getInstr(), *this);
3896
3897 // Reorder the instructions in each cycle to fix and improve the
3898 // generated code.
3899 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3900 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3901 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3902 SSD->fixupRegisterOverlaps(cycleInstrs);
3903 }
3904
3905 LLVM_DEBUG(dump(););
3906}
3907
3908void NodeSet::print(raw_ostream &os) const {
3909 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3910 << " depth " << MaxDepth << " col " << Colocate << "\n";
3911 for (const auto &I : Nodes)
3912 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3913 os << "\n";
3914}
3915
3916#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3917/// Print the schedule information to the given output.
3918void SMSchedule::print(raw_ostream &os) const {
3919 // Iterate over each cycle.
3920 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3921 // Iterate over each instruction in the cycle.
3922 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3923 for (SUnit *CI : cycleInstrs->second) {
3924 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3925 os << "(" << CI->NodeNum << ") ";
3926 CI->getInstr()->print(os);
3927 os << "\n";
3928 }
3929 }
3930}
3931
3932/// Utility function used for debugging to print the schedule.
3933LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3934LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3935
3936void ResourceManager::dumpMRT() const {
3937 LLVM_DEBUG({
3938 if (UseDFA)
3939 return;
3940 std::stringstream SS;
3941 SS << "MRT:\n";
3942 SS << std::setw(4) << "Slot";
3943 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3944 SS << std::setw(3) << I;
3945 SS << std::setw(7) << "#Mops"
3946 << "\n";
3947 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3948 SS << std::setw(4) << Slot;
3949 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3950 SS << std::setw(3) << MRT[Slot][I];
3951 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3952 }
3953 dbgs() << SS.str();
3954 });
3955}
3956#endif
3957
3958void ResourceManager::initProcResourceVectors(
3959 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3960 unsigned ProcResourceID = 0;
3961
3962 // We currently limit the resource kinds to fewer than 64 so that we can
3963 // use uint64_t for Masks.
3964 assert(SM.getNumProcResourceKinds() < 64 &&
3965 "Too many kinds of resources, unsupported");
3966 // Create a unique bitmask for every processor resource unit.
3967 // Skip resource at index 0, since it always references 'InvalidUnit'.
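  // For example, in a hypothetical model where units P0 and P1 receive masks
  // 0b001 and 0b010, a group containing both receives its own bit plus the
  // member bits, i.e. 0b100 | 0b001 | 0b010 = 0b111.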
3968 Masks.resize(SM.getNumProcResourceKinds());
3969 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3970 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3971 if (Desc.SubUnitsIdxBegin)
3972 continue;
3973 Masks[I] = 1ULL << ProcResourceID;
3974 ProcResourceID++;
3975 }
3976 // Create a unique bitmask for every processor resource group.
3977 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3978 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3979 if (!Desc.SubUnitsIdxBegin)
3980 continue;
3981 Masks[I] = 1ULL << ProcResourceID;
3982 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3983 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3984 ProcResourceID++;
3985 }
3986 LLVM_DEBUG({
3987 if (SwpShowResMask) {
3988 dbgs() << "ProcResourceDesc:\n";
3989 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3990 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3991 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3992 ProcResource->Name, I, Masks[I],
3993 ProcResource->NumUnits);
3994 }
3995 dbgs() << " -----------------\n";
3996 }
3997 });
3998}
3999
4000bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
4001 LLVM_DEBUG({
4002 if (SwpDebugResource)
4003 dbgs() << "canReserveResources:\n";
4004 });
4005 if (UseDFA)
4006 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4007 ->canReserveResources(&SU.getInstr()->getDesc());
4008
4009 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4010 if (!SCDesc->isValid()) {
4011 LLVM_DEBUG({
4012 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4013 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4014 });
4015 return true;
4016 }
4017
4018 reserveResources(SCDesc, Cycle);
4019 bool Result = !isOverbooked();
4020 unreserveResources(SCDesc, Cycle);
4021
4022 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
4023 return Result;
4024}
4025
4026void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
4027 LLVM_DEBUG({
4028 if (SwpDebugResource)
4029 dbgs() << "reserveResources:\n";
4030 });
4031 if (UseDFA)
4032 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4033 ->reserveResources(&SU.getInstr()->getDesc());
4034
4035 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4036 if (!SCDesc->isValid()) {
4037 LLVM_DEBUG({
4038 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4039 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4040 });
4041 return;
4042 }
4043
4044 reserveResources(SCDesc, Cycle);
4045
4046 LLVM_DEBUG({
4047 if (SwpDebugResource) {
4048 dumpMRT();
4049 dbgs() << "reserveResources: done!\n\n";
4050 }
4051 });
4052}
4053
4054void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
4055 int Cycle) {
4056 assert(!UseDFA);
4057 for (const MCWriteProcResEntry &PRE : make_range(
4058 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4059 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4060 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4061
4062 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4063 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
4064}
4065
4066void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
4067 int Cycle) {
4068 assert(!UseDFA);
4069 for (const MCWriteProcResEntry &PRE : make_range(
4070 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4071 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4072 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4073
4074 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4075 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
4076}
4077
4078bool ResourceManager::isOverbooked() const {
4079 assert(!UseDFA);
4080 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
4081 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4082 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4083 if (MRT[Slot][I] > Desc->NumUnits)
4084 return true;
4085 }
4086 if (NumScheduledMops[Slot] > IssueWidth)
4087 return true;
4088 }
4089 return false;
4090}
4091
4092int ResourceManager::calculateResMIIDFA() const {
4093 assert(UseDFA);
4094
4095 // Sort the instructions by the number of available choices for scheduling,
4096 // least to most. Use the number of critical resources as the tie breaker.
4097 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4098 for (SUnit &SU : DAG->SUnits)
4099 FUS.calcCriticalResources(*SU.getInstr());
4100 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4101 FuncUnitOrder(FUS);
4102
4103 for (SUnit &SU : DAG->SUnits)
4104 FuncUnitOrder.push(SU.getInstr());
4105
4106 SmallVector<std::unique_ptr<DFAPacketizer>> Resources;
4107 Resources.push_back(
4108 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4109
4110 while (!FuncUnitOrder.empty()) {
4111 MachineInstr *MI = FuncUnitOrder.top();
4112 FuncUnitOrder.pop();
4113 if (TII->isZeroCost(MI->getOpcode()))
4114 continue;
4115
4116 // Attempt to reserve the instruction in an existing DFA. At least one
4117 // DFA is needed for each cycle.
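  // For example, if three single-cycle instructions all require the same
  // functional unit, three DFAs end up in Resources and the resulting ResMII
  // is 3 (the instruction mix is illustrative).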
4118 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4119 unsigned ReservedCycles = 0;
4120 auto *RI = Resources.begin();
4121 auto *RE = Resources.end();
4122 LLVM_DEBUG({
4123 dbgs() << "Trying to reserve resource for " << NumCycles
4124 << " cycles for \n";
4125 MI->dump();
4126 });
4127 for (unsigned C = 0; C < NumCycles; ++C)
4128 while (RI != RE) {
4129 if ((*RI)->canReserveResources(*MI)) {
4130 (*RI)->reserveResources(*MI);
4131 ++ReservedCycles;
4132 break;
4133 }
4134 RI++;
4135 }
4136 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4137 << ", NumCycles:" << NumCycles << "\n");
4138 // Add new DFAs, if needed, to reserve resources.
4139 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4141 << "NewResource created to reserve resources"
4142 << "\n");
4143 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4144 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4145 NewResource->reserveResources(*MI);
4146 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4147 }
4148 }
4149
4150 int Resmii = Resources.size();
4151 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4152 return Resmii;
4153}
4154
4155int ResourceManager::calculateResMII() const {
4156 if (UseDFA)
4157 return calculateResMIIDFA();
4158
4159 // Count each resource consumption and divide it by the number of units.
4160 // ResMII is the max value among them.
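  // For example (illustrative numbers), 10 micro-ops at IssueWidth 4 give
  // ceil(10 / 4) = 3 cycles, and a resource consumed for 6 cycles that has
  // 2 units gives ceil(6 / 2) = 3, so ResMII = 3.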
4161
4162 int NumMops = 0;
4163 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4164 for (SUnit &SU : DAG->SUnits) {
4165 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4166 continue;
4167
4168 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4169 if (!SCDesc->isValid())
4170 continue;
4171
4172 LLVM_DEBUG({
4173 if (SwpDebugResource) {
4174 DAG->dumpNode(SU);
4175 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4176 << " WriteProcRes: ";
4177 }
4178 });
4179 NumMops += SCDesc->NumMicroOps;
4180 for (const MCWriteProcResEntry &PRE :
4181 make_range(STI->getWriteProcResBegin(SCDesc),
4182 STI->getWriteProcResEnd(SCDesc))) {
4183 LLVM_DEBUG({
4184 if (SwpDebugResource) {
4185 const MCProcResourceDesc *Desc =
4186 SM.getProcResource(PRE.ProcResourceIdx);
4187 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4188 }
4189 });
4190 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4191 }
4192 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4193 }
4194
4195 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4196 LLVM_DEBUG({
4197 if (SwpDebugResource)
4198 dbgs() << "#Mops: " << NumMops << ", "
4199 << "IssueWidth: " << IssueWidth << ", "
4200 << "Cycles: " << Result << "\n";
4201 });
4202
4203 LLVM_DEBUG({
4204 if (SwpDebugResource) {
4205 std::stringstream SS;
4206 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4207 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4208 << "\n";
4209 dbgs() << SS.str();
4210 }
4211 });
4212 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4213 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4214 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4215 LLVM_DEBUG({
4216 if (SwpDebugResource) {
4217 std::stringstream SS;
4218 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4219 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4220 << std::setw(10) << Cycles << "\n";
4221 dbgs() << SS.str();
4222 }
4223 });
4224 if (Cycles > Result)
4225 Result = Cycles;
4226 }
4227 return Result;
4228}
4229
4230void ResourceManager::init(int II) {
4231 InitiationInterval = II;
4232 DFAResources.clear();
4233 DFAResources.resize(II);
4234 for (auto &I : DFAResources)
4235 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4236 MRT.clear();
4237 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4238 NumScheduledMops.clear();
4239 NumScheduledMops.resize(II);
4240}
4241
4242bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4243 if (Pred.isArtificial() || Dst->isBoundaryNode())
4244 return true;
4245 // Currently, a dependence that is an anti-dependence but not loop-carried
4246 // is also ignored. This behavior is preserved to prevent regression.
4247 // FIXME: Remove if this doesn't have significant impact on performance
4248 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4249}
4250
4251SwingSchedulerDDG::SwingSchedulerDDGEdges &
4252SwingSchedulerDDG::getEdges(const SUnit *SU) {
4253 if (SU == EntrySU)
4254 return EntrySUEdges;
4255 if (SU == ExitSU)
4256 return ExitSUEdges;
4257 return EdgesVec[SU->NodeNum];
4258}
4259
4260const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4261SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4262 if (SU == EntrySU)
4263 return EntrySUEdges;
4264 if (SU == ExitSU)
4265 return ExitSUEdges;
4266 return EdgesVec[SU->NodeNum];
4267}
4268
4269void SwingSchedulerDDG::addEdge(const SUnit *SU,
4270 const SwingSchedulerDDGEdge &Edge) {
4271 assert(!Edge.isValidationOnly() &&
4272 "Validation-only edges are not expected here.");
4273 auto &Edges = getEdges(SU);
4274 if (Edge.getSrc() == SU)
4275 Edges.Succs.push_back(Edge);
4276 else
4277 Edges.Preds.push_back(Edge);
4278}
4279
4280void SwingSchedulerDDG::initEdges(SUnit *SU) {
4281 for (const auto &PI : SU->Preds) {
4282 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4283 /*IsValidationOnly=*/false);
4284 addEdge(SU, Edge);
4285 }
4286
4287 for (const auto &SI : SU->Succs) {
4288 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4289 /*IsValidationOnly=*/false);
4290 addEdge(SU, Edge);
4291 }
4292}
4293
4294SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4295 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4296 : EntrySU(EntrySU), ExitSU(ExitSU) {
4297 EdgesVec.resize(SUnits.size());
4298
4299 // Add non-loop-carried edges based on the DAG.
4300 initEdges(EntrySU);
4301 initEdges(ExitSU);
4302 for (auto &SU : SUnits)
4303 initEdges(&SU);
4304
4305 // Add loop-carried edges, which are not represented in the DAG.
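  // Each loop-carried order dependence becomes a validation-only edge with
  // latency 1 and distance 1, e.g. a memory access in iteration i that must
  // stay ordered before a conflicting access in iteration i + 1.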
4306 for (SUnit &SU : SUnits) {
4307 SUnit *Src = &SU;
4308 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4309 SDep Base(Src, SDep::Barrier);
4310 Base.setLatency(1);
4311 for (SUnit *Dst : *OD) {
4312 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4313 /*IsValidationOnly=*/true);
4314 Edge.setDistance(1);
4315 ValidationOnlyEdges.push_back(Edge);
4316 }
4317 }
4318 }
4319}
4320
4321const SwingSchedulerDDG::EdgesType &
4322SwingSchedulerDDG::getInEdges(const SUnit *SU) const {
4323 return getEdges(SU).Preds;
4324}
4325
4326const SwingSchedulerDDG::EdgesType &
4327SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
4328 return getEdges(SU).Succs;
4329}
4330
4331/// Check if \p Schedule doesn't violate the validation-only dependencies.
4332bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
4333 unsigned II = Schedule.getInitiationInterval();
4334
4335 auto ExpandCycle = [&](SUnit *SU) {
4336 int Stage = Schedule.stageScheduled(SU);
4337 int Cycle = Schedule.cycleScheduled(SU);
4338 return Cycle + (Stage * II);
4339 };
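  // ExpandCycle maps a scheduled node back to its absolute cycle in the flat
  // schedule; e.g. stage 2 at cycle 1 with II = 3 expands to 1 + 2 * 3 = 7.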
4340
4341 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4342 SUnit *Src = Edge.getSrc();
4343 SUnit *Dst = Edge.getDst();
4344 if (!Src->isInstr() || !Dst->isInstr())
4345 continue;
4346 int CycleSrc = ExpandCycle(Src);
4347 int CycleDst = ExpandCycle(Dst);
4348 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4349 if (CycleSrc > MaxLateStart) {
4350 LLVM_DEBUG({
4351 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4352 << Dst->NodeNum << "\n";
4353 });
4354 return false;
4355 }
4356 }
4357 return true;
4358}
4359
4360void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4361 const TargetInstrInfo *TII) {
4362 for (SUnit &SU : SUnits) {
4363 SUnit *Src = &SU;
4364 if (auto *OrderDep = getOrderDepOrNull(Src)) {
4365 SDep Dep(Src, SDep::Barrier);
4366 Dep.setLatency(1);
4367 for (SUnit *Dst : *OrderDep) {
4368 SUnit *From = Src;
4369 SUnit *To = Dst;
4370 if (From->NodeNum > To->NodeNum)
4371 std::swap(From, To);
4372
4373 // Add a forward edge if the following conditions are met:
4374 //
4375 // - The instruction of the source node (FromMI) may read memory.
4376 // - The instruction of the target node (ToMI) may modify memory, but
4377 // does not read it.
4378 // - Neither instruction is a global barrier.
4379 // - The load appears before the store in the original basic block.
4380 // - There are no barrier or store instructions between the two nodes.
4381 // - The target node is unreachable from the source node in the current
4382 // DAG.
4383 //
4384 // TODO: These conditions are inherited from a previous implementation,
4385 // and some may no longer be necessary. For now, we conservatively
4386 // retain all of them to avoid regressions, but the logic could
4387 // potentially be simplified.
4388 MachineInstr *FromMI = From->getInstr();
4389 MachineInstr *ToMI = To->getInstr();
4390 if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4391 !TII->isGlobalMemoryObject(FromMI) &&
4392 !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
4393 SDep Pred = Dep;
4394 Pred.setSUnit(From);
4395 To->addPred(Pred);
4396 }
4397 }
4398 }
4399 }
4400}
4401
4402void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
4403 const MachineRegisterInfo *MRI) const {
4404 const auto *Order = getOrderDepOrNull(SU);
4405
4406 if (!Order)
4407 return;
4408
4409 const auto DumpSU = [](const SUnit *SU) {
4410 std::ostringstream OSS;
4411 OSS << "SU(" << SU->NodeNum << ")";
4412 return OSS.str();
4413 };
4414
4415 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4416 << " Order\n";
4417 for (SUnit *Dst : *Order)
4418 dbgs() << " " << DumpSU(Dst) << "\n";
4419}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:593
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool hasPHICycleDFS(unsigned Reg, const DenseMap< unsigned, SmallVector< unsigned, 2 > > &PhiDeps, SmallSet< unsigned, 8 > &Visited, SmallSet< unsigned, 8 > &RecStack)
Depth-first search to detect cycles among PHI dependencies.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader, const MachineRegisterInfo &MRI)
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD, bool PerformCheapCheck)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
bool erase(const KeyT &Val)
Definition DenseMap.h:322
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:233
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1442
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1448
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:618
MachineInstrBundleIterator< const MachineInstr > const_iterator
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next iteration.
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different stages/cycles.
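As a rough illustration of how the SMSchedule pieces compose: a node is placed somewhere in a window of cycles, and the cycle and stage it received can then be queried. A sketch, assuming the initiation interval has already been set on Schedule:

#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

static void placeAndReport(SMSchedule &Schedule, SUnit *SU, int Early,
                           int Late) {
  int II = Schedule.getInitiationInterval();
  if (Schedule.insert(SU, Early, Late, II)) {
    dbgs() << "cycle " << Schedule.cycleScheduled(SU)  // absolute cycle
           << " stage " << Schedule.stageScheduled(SU) // roughly cycle / II
           << "\n";
  }
}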
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
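A sketch of rewiring with the SUnit predecessor APIs: demote every WAR edge from Pred into an artificial ordering edge. Removing while iterating Preds is unsafe, so candidates are collected first. Illustrative only; the pass's own edge rewrites are more involved.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/ScheduleDAG.h"
using namespace llvm;

static void relaxAntiDeps(SUnit &SU, SUnit &Pred) {
  SmallVector<SDep, 4> AntiDeps;
  for (const SDep &D : SU.Preds)
    if (D.getSUnit() == &Pred && D.getKind() == SDep::Anti)
      AntiDeps.push_back(D);
  for (const SDep &D : AntiDeps) {
    SU.removePred(D);                          // drop the register WAR edge
    SU.addPred(SDep(&Pred, SDep::Artificial)); // keep the relative order
  }
}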
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to an SUnit.
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
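A sketch of the region-building flow these members describe: after buildSchedGraph() every instruction in the region has an SUnit, reachable through getSUnit(). The extra tracker arguments default to null, so only alias information is consulted here.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

static void summarizeNode(ScheduleDAGInstrs &DAG, AAResults *AA,
                          MachineInstr *MI) {
  DAG.buildSchedGraph(AA); // wrap the region's instructions in SUnits
  if (SUnit *SU = DAG.getSUnit(MI))
    dbgs() << "node " << SU->NodeNum << ": " << SU->Preds.size()
           << " preds, " << SU->Succs.size() << " succs\n";
}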
A vector that has set insertion semantics.
Definition SetVector.h:58
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:101
void insert_range(Range &&R)
Definition SetVector.h:174
typename vector_type::const_iterator iterator
Definition SetVector.h:70
void clear()
Completely clear the SetVector.
Definition SetVector.h:265
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:260
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:149
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:250
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:337
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
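Both containers stay allocation-free while they hold few elements, which matters in the scheduler's hot loops; SmallPtrSet is the pointer-key specialization. A sketch with plain ints and int pointers:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include <cassert>
using namespace llvm;

static void smallSetsDemo() {
  int A = 0, B = 1;
  SmallPtrSet<int *, 8> Visited;
  assert(Visited.insert(&A).second); // first insertion succeeds
  Visited.insert(&B);
  assert(Visited.count(&A) == 1 && Visited.contains(&B));

  SmallSet<int, 4> Cycles;
  Cycles.insert(0);
  Cycles.insert(4);
  assert(Cycles.erase(0) && Cycles.count(4) == 1);
}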
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the instructions using the SMS algorithm.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes to overlap.
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing Modulo Scheduling algorithm.
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the region accessed by OtherMI in subsequent iterations.
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instructions.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-carried dependencies.
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
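A hedged sketch of consuming the DDG, assuming EdgesType is an iterable range of SwingSchedulerDDGEdge (as its use in this file suggests): walk the incoming edges of SU, skipping the ones the cost functions ignore.

#include "llvm/CodeGen/MachinePipeliner.h"
using namespace llvm;

static unsigned countRealPreds(const SwingSchedulerDDG &DDG,
                               const SUnit *SU) {
  unsigned N = 0;
  for (const SwingSchedulerDDGEdge &E : DDG.getInEdges(SU))
    if (!E.ignoreDependence(/*IgnoreAnti=*/true))
      ++N; // E.getSrc() is the predecessor node
  return N;
}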
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedModel instead.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
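A small sketch of how a pass consults these subtarget hooks before attempting to pipeline (ST is assumed to be the function's subtarget; shouldTryPipeline is a hypothetical helper name):

#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;

static bool shouldTryPipeline(const TargetSubtargetInfo &ST) {
  if (!ST.enableMachinePipeliner())
    return false;                  // target opted out of software pipelining
  bool UseDFA = ST.useDFAforSMS(); // DFA vs. InstrSchedModel resource checks
  (void)UseDFA;
  return true;
}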
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Wrapper class representing a virtual register or register unit.
Definition Register.h:181
constexpr bool isVirtualReg() const
Definition Register.h:197
constexpr MCRegUnit asMCRegUnit() const
Definition Register.h:201
constexpr Register asVirtualReg() const
Definition Register.h:206
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ Valid
The data is already valid.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:667
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2128
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1994
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
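The range helpers above (any_of, lower_bound, is_contained, and friends) remove begin()/end() noise from call sites. A small self-contained sketch:

#include "llvm/ADT/STLExtras.h"
#include <cassert>
#include <vector>
using namespace llvm;

static void rangeHelpersDemo() {
  std::vector<int> Cycles = {0, 2, 5, 7};              // kept sorted
  assert(any_of(Cycles, [](int C) { return C > 6; })); // 7 > 6
  assert(is_contained(Cycles, 5));
  auto It = lower_bound(Cycles, 4); // first element not less than 4
  assert(It != Cycles.end() && *It == 5);
}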
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions.
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
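A hedged sketch of the classic disambiguation step these two helpers enable: two addresses cannot alias if every underlying object on both sides is a distinct identified object (an alloca, a global, a noalias argument, and so on). This mirrors, loosely, the reasoning used for loop memory operands; it is not the pass's exact check.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;

static bool provablyDistinct(const Value *A, const Value *B) {
  SmallVector<const Value *, 2> ObjsA, ObjsB;
  getUnderlyingObjects(A, ObjsA); // also walks phi/select
  getUnderlyingObjects(B, ObjsB);
  for (const Value *OA : ObjsA)
    for (const Value *OB : ObjsB)
      if (OA == OB || !isIdentifiedObject(OA) || !isIdentifiedObject(OB))
        return false; // shared or unidentified object: may alias
  return true;
}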
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
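A hedged sketch of the interval arithmetic underlying such a disjointness check: two accesses to the same object with known byte offsets and access widths are disjoint when their byte ranges do not intersect. The width parameters here are assumed known; the real check also has to cope with unknown sizes.

#include <cstdint>

static bool bytesDisjoint(int64_t OffA, int64_t WidthA, int64_t OffB,
                          int64_t WidthB) {
  // [OffA, OffA+WidthA) and [OffB, OffB+WidthB) do not overlap.
  return OffA + WidthA <= OffB || OffB + WidthB <= OffA;
}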
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition Metadata.h:761
uint64_t FuncUnits
Bitmask representing a set of functional units.
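Since a functional-unit set is a plain bitmask, counting how many alternative units an operation may issue on is a popcount. A sketch:

#include "llvm/ADT/bit.h"
#include <cstdint>

static unsigned numCandidateUnits(uint64_t FU) {
  return llvm::popcount(FU); // e.g. 0b1010 -> 2 (units 1 and 3)
}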
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
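A sketch of walking a machine model's processor resources with the accessors above; kind 0 is the reserved invalid entry, so real resources start at index 1.

#include "llvm/MC/MCSchedule.h"
using namespace llvm;

static unsigned totalUnits(const MCSchedModel &SM) {
  if (!SM.hasInstrSchedModel())
    return 0; // no per-instruction scheduling information available
  unsigned Units = 0;
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I != E; ++I)
    Units += SM.getProcResource(I)->NumUnits; // parallelism per resource
  return Units;
}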
Identify one of the processor resource kinds consumed by a particular scheduling class for the specified number of cycles.
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instantiate a scheduler.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.