// NOTE(review): the three lines below are doxygen page-navigation artifacts
// captured by the HTML extraction, not part of the original source file.
// LLVM 23.0.0git
// MachinePipeliner.cpp
// Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
37#include "llvm/ADT/STLExtras.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
72#include "llvm/Config/llvm-config.h"
73#include "llvm/IR/Attributes.h"
74#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
79#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
84#include <algorithm>
85#include <cassert>
86#include <climits>
87#include <cstdint>
88#include <deque>
89#include <functional>
90#include <iomanip>
91#include <iterator>
92#include <map>
93#include <memory>
94#include <sstream>
95#include <tuple>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
// Debug-type tag used by LLVM_DEBUG/STATISTIC output for this pass.
101#define DEBUG_TYPE "pipeliner"
102
// Pass-wide statistics: one success counter, plus one counter per distinct
// bail-out reason so failures can be attributed from -stats output.
103STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
104STATISTIC(NumPipelined, "Number of loops software pipelined");
105STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
106STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
107STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
108STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
109STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
110STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
111STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
112STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
113STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
114STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
115
116/// A command line option to turn software pipelining on or off.
117static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
118 cl::desc("Enable Software Pipelining"));
119
120/// A command line option to enable SWP at -Os.
121static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
122 cl::desc("Enable SWP at Os."), cl::Hidden,
123 cl::init(false));
124
125/// A command line argument to limit minimum initial interval for pipelining.
126static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
127 cl::desc("Size limit for the MII."),
128 cl::Hidden, cl::init(27));
129
130/// A command line argument to force pipeliner to use specified initial
131/// interval.
132static cl::opt<int> SwpForceII("pipeliner-force-ii",
133 cl::desc("Force pipeliner to use specified II."),
134 cl::Hidden, cl::init(-1));
135
136/// A command line argument to limit the number of stages in the pipeline.
137static cl::opt<int>
138 SwpMaxStages("pipeliner-max-stages",
139 cl::desc("Maximum stages allowed in the generated scheduled."),
140 cl::Hidden, cl::init(3));
141
142/// A command line option to disable the pruning of chain dependences due to
143/// an unrelated Phi.
144static cl::opt<bool>
145 SwpPruneDeps("pipeliner-prune-deps",
146 cl::desc("Prune dependences between unrelated Phi nodes."),
147 cl::Hidden, cl::init(true));
148
149/// A command line option to disable the pruning of loop carried order
150/// dependences.
151static cl::opt<bool>
152 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
153 cl::desc("Prune loop carried order dependences."),
154 cl::Hidden, cl::init(true));
155
// Debug-only cap on how many loops the pass will even attempt (-pipeliner-max).
156#ifndef NDEBUG
157static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
158#endif
159
// NOTE(review): doxygen line 161 (middle of this declaration) is missing from
// this extract.
160static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
162 cl::desc("Ignore RecMII"));
163
164static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
165 cl::init(false));
166static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
167 cl::init(false));
168
// NOTE(review): the declaration header (doxygen line 169, variable name) is
// missing from this extract; the option string is
// "pipeliner-annotate-for-testing".
170 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
171 cl::desc("Instead of emitting the pipelined code, annotate instructions "
172 "with the generated schedule for feeding into the "
173 "-modulo-schedule-test pass"));
174
// NOTE(review): declaration header (doxygen line 175) missing; option string
// is "pipeliner-experimental-cg".
176 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
177 cl::desc(
178 "Use the experimental peeling code generator for software pipelining"));
179
180static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
181 cl::desc("Range to search for II"),
182 cl::Hidden, cl::init(10));
183
184static cl::opt<bool>
185 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
186 cl::desc("Limit register pressure of scheduled loop"));
187
188static cl::opt<int>
189 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
190 cl::init(5),
191 cl::desc("Margin representing the unused percentage of "
192 "the register pressure limit"));
193
194static cl::opt<bool>
195 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
196 cl::desc("Use the MVE code generator for software pipelining"));
197
/// A command line argument to limit the number of store instructions in the
/// target basic block.
// NOTE(review): declaration header (doxygen line 200) missing; this is
// presumably SwpMaxNumStores, which canPipelineLoop compares against below —
// confirm. The desc string also carries an "allwed" typo; it is a runtime
// string, so it is left untouched here.
198/// A command line argument to limit the number of store instructions in the
199/// target basic block.
201 "pipeliner-max-num-stores",
202 cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
203 cl::init(200));
204
205// A command line option to enable the CopyToPhi DAG mutation.
// NOTE(review): declaration header (doxygen line 206) missing for the
// llvm::SwpEnableCopyToPhi option below.
207 llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
208 cl::init(true),
209 cl::desc("Enable CopyToPhi DAG Mutation"));
210
211/// A command line argument to force pipeliner to use specified issue
212/// width.
// NOTE(review): declaration header (doxygen line 213) missing; option string
// is "pipeliner-force-issue-width".
214 "pipeliner-force-issue-width",
215 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
216 cl::init(-1));
217
218/// A command line argument to set the window scheduling option.
// NOTE(review): doxygen lines 219-220 and the clEnumValN(...) value lines
// (222, 224, 226) are missing; only the descriptions of the three window
// scheduling modes survive below.
221 cl::desc("Set how to use window scheduling algorithm."),
223 "Turn off window algorithm."),
225 "Use window algorithm after SMS algorithm fails."),
227 "Use window algorithm instead of SMS algorithm.")));
228
// Static pass/class data and pass registration boilerplate.
229unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
230char MachinePipeliner::ID = 0;
231#ifndef NDEBUG
// NOTE(review): doxygen line 232 (the NumTries counter definition used by
// scheduleLoop below) is missing from this extract.
233#endif
235
// NOTE(review): the INITIALIZE_PASS_BEGIN/DEPENDENCY/END macro lines (doxygen
// 236, 238-242) are missing; only the textual pass-name arguments survive.
237 "Modulo Software Pipelining", false, false)
243 "Modulo Software Pipelining", false, false)
244
245namespace {
246
// NOTE(review): several declaration lines are missing from this extract —
// the `struct SUnitWithMemInfo {` header (doxygen 249-251), the SU and
// UnderlyingObjs members (259-260, 264), the private helper (271), the
// `class LoopCarriedOrderDepsTracker {` header (277), the Loads/Stores
// members of LoadStoreChunk (296-297), the DAG member (302), and the
// constructor's first line (329). Confirm against the upstream file.
247/// This class holds an SUnit corresponding to a memory operation and other
248/// information related to the instruction.
252
253 /// The value of a memory operand.
254 const Value *MemOpValue = nullptr;
255
256 /// The offset of a memory operand.
257 int64_t MemOpOffset = 0;
258
260
261 /// True if all the underlying objects are identified.
262 bool IsAllIdentified = false;
263
265
266 bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
267
// True when no base value was recorded for the memory operand; callers must
// then assume the access may alias anything.
268 bool isUnknown() const { return MemOpValue == nullptr; }
269
270private:
272};
273
274/// Add loop-carried chain dependencies. This class handles the same type of
275/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
276/// account dependencies across iterations.
278 // Type of instruction that is relevant to order-dependencies
279 enum class InstrTag {
280 Barrier = 0, ///< A barrier event instruction.
281 LoadOrStore = 1, ///< An instruction that may load or store memory, but is
282 ///< not a barrier event.
283 FPExceptions = 2, ///< An instruction that does not match above, but may
284 ///< raise floating-point exceptions.
285 };
286
// An SUnit pointer with its InstrTag packed into the low pointer bits.
287 struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
288 TaggedSUnit(SUnit *SU, InstrTag Tag)
289 : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
290
291 InstrTag getTag() const { return InstrTag(getInt()); }
292 };
293
294 /// Holds loads and stores with memory related information.
295 struct LoadStoreChunk {
298
299 void append(SUnit *SU);
300 };
301
303 BatchAAResults *BAA;
304 std::vector<SUnit> &SUnits;
305
306 /// The size of SUnits, for convenience.
307 const unsigned N;
308
309 /// Loop-carried Edges.
310 std::vector<BitVector> LoopCarried;
311
312 /// Instructions related to chain dependencies. They are one of the
313 /// following:
314 ///
315 /// 1. Barrier event.
316 /// 2. Load, but neither a barrier event, invariant load, nor may load trap
317 /// value.
318 /// 3. Store, but not a barrier event.
319 /// 4. None of them, but may raise floating-point exceptions.
320 ///
321 /// This is used when analyzing loop-carried dependencies that access global
322 /// barrier instructions.
323 std::vector<TaggedSUnit> TaggedSUnits;
324
325 const TargetInstrInfo *TII = nullptr;
326 const TargetRegisterInfo *TRI = nullptr;
327
328public:
330 const TargetInstrInfo *TII,
331 const TargetRegisterInfo *TRI);
332
333 /// The main function to compute loop-carried order-dependencies.
334 void computeDependencies();
335
336 const BitVector &getLoopCarried(unsigned Idx) const {
337 return LoopCarried[Idx];
338 }
339
340private:
341 /// Tags to \p SU if the instruction may affect the order-dependencies.
342 std::optional<InstrTag> getInstrTag(SUnit *SU) const;
343
344 void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
345 const LoadStoreChunk &To);
346
347 /// Add a loop-carried order dependency between \p Src and \p Dst if we
348 /// cannot prove they are independent.
349 void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
350 const SUnitWithMemInfo &Dst);
351
352 void computeDependenciesAux();
353};
354
355} // end anonymous namespace
356
357/// The "main" function for implementing Swing Modulo Scheduling.
// NOTE(review): the function signature (doxygen line 358, presumably
// `bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf)`) and
// lines 369, 375-376, and 380-382 are missing from this extract — confirm
// against the upstream file.
359 if (skipFunction(mf.getFunction()))
360 return false;
361
// Global enable switch (-enable-pipeliner).
362 if (!EnableSWP)
363 return false;
364
// Skip size-optimized functions unless -enable-pipeliner-opt-size was given
// explicitly on the command line (getPosition() != 0 only when it appeared).
365 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
366 !EnableSWPOptSize.getPosition())
367 return false;
368
// NOTE(review): the condition guarding this early return (doxygen line 369)
// is missing from this extract.
370 return false;
371
372 // Cannot pipeline loops without instruction itineraries if we are using
373 // DFA for the pipeliner.
374 if (mf.getSubtarget().useDFAforSMS() &&
// NOTE(review): the rest of the DFA/itinerary condition (doxygen 375-376)
// is missing from this extract.
377 return false;
378
// Cache per-function state used by the per-loop scheduling entry points.
// NOTE(review): doxygen lines 380-382 (remaining analysis/member
// initializations) are missing here.
379 MF = &mf;
383 TII = MF->getSubtarget().getInstrInfo();
384 RegClassInfo.runOnMachineFunction(*MF);
385
// Visit every top-level loop; scheduleLoop recurses into inner loops.
386 for (const auto &L : *MLI)
387 scheduleLoop(*L);
388
389 return false;
390}
391
392/// Attempt to perform the SMS algorithm on the specified loop. This function is
393/// the main entry point for the algorithm. The function identifies candidate
394/// loops, calculates the minimum initiation interval, and attempts to schedule
395/// the loop.
396bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
397 bool Changed = false;
398 for (const auto &InnerLoop : L)
399 Changed |= scheduleLoop(*InnerLoop);
400
401#ifndef NDEBUG
402 // Stop trying after reaching the limit (if any).
403 int Limit = SwpLoopLimit;
404 if (Limit >= 0) {
405 if (NumTries >= SwpLoopLimit)
406 return Changed;
407 NumTries++;
408 }
409#endif
410
411 setPragmaPipelineOptions(L);
412 if (!canPipelineLoop(L)) {
413 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
414 ORE->emit([&]() {
415 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
416 L.getStartLoc(), L.getHeader())
417 << "Failed to pipeline loop";
418 });
419
420 LI.LoopPipelinerInfo.reset();
421 return Changed;
422 }
423
424 ++NumTrytoPipeline;
425 if (useSwingModuloScheduler())
426 Changed = swingModuloScheduler(L);
427
428 if (useWindowScheduler(Changed))
429 Changed = runWindowScheduler(L);
430
431 LI.LoopPipelinerInfo.reset();
432 return Changed;
433}
434
435void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
436 // Reset the pragma for the next loop in iteration.
437 disabledByPragma = false;
438 II_setByPragma = 0;
439
// Walk from the machine loop back to the IR terminator to find !llvm.loop
// metadata; bail out quietly at any missing link.
440 MachineBasicBlock *LBLK = L.getTopBlock();
441
442 if (LBLK == nullptr)
443 return;
444
445 const BasicBlock *BBLK = LBLK->getBasicBlock();
446 if (BBLK == nullptr)
447 return;
448
449 const Instruction *TI = BBLK->getTerminator();
450 if (TI == nullptr)
451 return;
452
453 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
454 if (LoopID == nullptr)
455 return;
456
// Loop metadata is self-referential: operand 0 is the node itself.
457 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
458 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
459
460 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
461 MDNode *MD = dyn_cast<MDNode>(MDO);
462
463 if (MD == nullptr)
464 continue;
465
466 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
467
468 if (S == nullptr)
469 continue;
470
471 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
472 assert(MD->getNumOperands() == 2 &&
473 "Pipeline initiation interval hint metadata should have two operands.");
// NOTE(review): doxygen line 474 (the assignment target of the extracted
// value — presumably `II_setByPragma =`, given the assert on it just below)
// is missing from this extract; confirm upstream.
475 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
476 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
477 } else if (S->getString() == "llvm.loop.pipeline.disable") {
478 disabledByPragma = true;
479 }
480 }
481}
482
483/// Depth-first search to detect cycles among PHI dependencies.
484/// Returns true if a cycle is detected within the PHI-only subgraph.
485static bool hasPHICycleDFS(
486 unsigned Reg, const DenseMap<unsigned, SmallVector<unsigned, 2>> &PhiDeps,
487 SmallSet<unsigned, 8> &Visited, SmallSet<unsigned, 8> &RecStack) {
488
489 // If Reg is not a PHI-def it cannot contribute to a PHI cycle.
490 auto It = PhiDeps.find(Reg);
491 if (It == PhiDeps.end())
492 return false;
493
494 if (RecStack.count(Reg))
495 return true; // backedge.
496 if (Visited.count(Reg))
497 return false;
498
499 Visited.insert(Reg);
500 RecStack.insert(Reg);
501
502 for (unsigned Dep : It->second) {
503 if (hasPHICycleDFS(Dep, PhiDeps, Visited, RecStack))
504 return true;
505 }
506
507 RecStack.erase(Reg);
508 return false;
509}
510
// Returns true if the PHIs in \p LoopHeader form a dependency cycle among
// themselves (PHI-defs feeding PHI-defs). \p MRI is unused in the visible
// body.
511static bool hasPHICycle(const MachineBasicBlock *LoopHeader,
512 const MachineRegisterInfo &MRI) {
// NOTE(review): doxygen line 513 (the local map declaration — presumably
// `DenseMap<unsigned, SmallVector<unsigned, 2>> PhiDeps;` given the
// hasPHICycleDFS signature) is missing from this extract; confirm upstream.
514
515 // Collect PHI nodes and their dependencies.
516 for (const MachineInstr &MI : LoopHeader->phis()) {
517 unsigned DefReg = MI.getOperand(0).getReg();
518 auto Ins = PhiDeps.try_emplace(DefReg).first;
519
520 // PHI operands are (Reg, MBB) pairs starting at index 1.
521 for (unsigned I = 1; I < MI.getNumOperands(); I += 2)
522 Ins->second.push_back(MI.getOperand(I).getReg());
523 }
524
525 // DFS to detect cycles among PHI nodes.
526 SmallSet<unsigned, 8> Visited, RecStack;
527
528 // Start DFS from each PHI-def.
529 for (const auto &KV : PhiDeps) {
530 unsigned Reg = KV.first;
531 if (hasPHICycleDFS(Reg, PhiDeps, Visited, RecStack))
532 return true;
533 }
534
535 return false;
536}
537
538/// Return true if the loop can be software pipelined. The algorithm is
539/// restricted to loops with a single basic block. Make sure that the
540/// branch in the loop can be analyzed.
// Each rejection below emits an optimization remark; most also bump a
// dedicated NumFail* statistic, so check order is significant for -stats.
541bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
// SMS only handles single-block loops.
542 if (L.getNumBlocks() != 1) {
543 ORE->emit([&]() {
544 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
545 L.getStartLoc(), L.getHeader())
546 << "Not a single basic block: "
547 << ore::NV("NumBlocks", L.getNumBlocks());
548 });
549 return false;
550 }
551
// PHI-only dependency cycles in the header are rejected up front.
552 if (hasPHICycle(L.getHeader(), MF->getRegInfo())) {
553 LLVM_DEBUG(dbgs() << "Cannot pipeline loop due to PHI cycle\n");
554 return false;
555 }
556
// llvm.loop.pipeline.disable metadata, recorded by setPragmaPipelineOptions.
557 if (disabledByPragma) {
558 ORE->emit([&]() {
559 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
560 L.getStartLoc(), L.getHeader())
561 << "Disabled by Pragma.";
562 });
563 return false;
564 }
565
566 // Check if the branch can't be understood because we can't do pipelining
567 // if that's the case.
568 LI.TBB = nullptr;
569 LI.FBB = nullptr;
570 LI.BrCond.clear();
571 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
572 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
573 NumFailBranch++;
574 ORE->emit([&]() {
575 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
576 L.getStartLoc(), L.getHeader())
577 << "The branch can't be understood";
578 });
579 return false;
580 }
581
// The target hook must recognize the loop's induction/compare structure.
582 LI.LoopInductionVar = nullptr;
583 LI.LoopCompare = nullptr;
584 LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
585 if (!LI.LoopPipelinerInfo) {
586 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
587 NumFailLoop++;
588 ORE->emit([&]() {
589 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
590 L.getStartLoc(), L.getHeader())
591 << "The loop structure is not supported";
592 });
593 return false;
594 }
595
// A preheader is required to host the prolog blocks.
596 if (!L.getLoopPreheader()) {
597 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
598 NumFailPreheader++;
599 ORE->emit([&]() {
600 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
601 L.getStartLoc(), L.getHeader())
602 << "No loop preheader found";
603 });
604 return false;
605 }
606
// Compile-time guard: loops with too many stores (-pipeliner-max-num-stores)
// make the memory-dependence analysis too expensive.
607 unsigned NumStores = 0;
608 for (MachineInstr &MI : *L.getHeader())
609 if (MI.mayStore())
610 ++NumStores;
611 if (NumStores > SwpMaxNumStores) {
612 LLVM_DEBUG(dbgs() << "Too many stores\n");
613 NumFailTooManyStores++;
614 ORE->emit([&]() {
615 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
616 L.getStartLoc(), L.getHeader())
617 << "Too many store instructions in the loop: "
618 << ore::NV("NumStores", NumStores) << " > "
619 << ore::NV("SwpMaxNumStores", SwpMaxNumStores) << ".";
620 });
621 return false;
622 }
623
624 // Remove any subregisters from inputs to phi nodes.
625 preprocessPhiNodes(*L.getHeader());
626 return true;
627}
628
// Rewrite PHI inputs that carry a subregister index: each such input is
// replaced by a full-register COPY inserted in the predecessor block, so
// that PHIs only ever reference whole virtual registers.
629void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
630 MachineRegisterInfo &MRI = MF->getRegInfo();
// SlotIndexes is kept up to date for each inserted COPY.
631 SlotIndexes &Slots =
632 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
633
634 for (MachineInstr &PI : B.phis()) {
635 MachineOperand &DefOp = PI.getOperand(0);
636 assert(DefOp.getSubReg() == 0);
637 auto *RC = MRI.getRegClass(DefOp.getReg());
638
// PHI operands are (Reg, MBB) pairs starting at index 1.
639 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
640 MachineOperand &RegOp = PI.getOperand(i);
641 if (RegOp.getSubReg() == 0)
642 continue;
643
644 // If the operand uses a subregister, replace it with a new register
645 // without subregisters, and generate a copy to the new register.
646 Register NewReg = MRI.createVirtualRegister(RC);
647 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
// NOTE(review): doxygen line 648 (the declaration of the insertion point
// `At` used below — presumably PredB's first terminator) is missing from
// this extract; confirm upstream.
649 const DebugLoc &DL = PredB.findDebugLoc(At);
650 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
651 .addReg(RegOp.getReg(), getRegState(RegOp),
652 RegOp.getSubReg());
653 Slots.insertMachineInstrInMaps(*Copy);
654 RegOp.setReg(NewReg);
655 RegOp.setSubReg(0);
656 }
657 }
658}
659
660/// The SMS algorithm consists of the following main steps:
661/// 1. Computation and analysis of the dependence graph.
662/// 2. Ordering of the nodes (instructions).
663/// 3. Attempt to Schedule the loop.
664bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
665 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
666
667 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
668 SwingSchedulerDAG SMS(
// NOTE(review): doxygen line 669 (the first constructor-argument line of
// SMS) is missing from this extract; confirm upstream.
670 II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
671
672 MachineBasicBlock *MBB = L.getHeader();
673 // The kernel should not include any terminator instructions. These
674 // will be added back later.
675 SMS.startBlock(MBB);
676
677 // Compute the number of 'real' instructions in the basic block by
678 // ignoring terminators.
679 unsigned size = MBB->size();
// NOTE(review): doxygen line 680 (the for-loop header that starts iterating
// — presumably from the first terminator — decrementing `size` once per
// terminator) is missing from this extract; confirm upstream.
681 E = MBB->instr_end();
682 I != E; ++I, --size)
683 ;
684
685 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
686 SMS.schedule();
687 SMS.exitRegion();
688
689 SMS.finishBlock();
690 return SMS.hasNewSchedule();
691}
692
703
704bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
705 MachineSchedContext Context;
706 Context.MF = MF;
707 Context.MLI = MLI;
708 Context.MDT = MDT;
709 Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
710 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
711 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
712 Context.RegClassInfo->runOnMachineFunction(*MF);
713 WindowScheduler WS(&Context, L);
714 return WS.run();
715}
716
717bool MachinePipeliner::useSwingModuloScheduler() {
718 // SwingModuloScheduler does not work when WindowScheduler is forced.
// NOTE(review): doxygen line 719 (the return expression — presumably a
// comparison of the window-scheduling option against its "force" mode) is
// missing from this extract; confirm upstream.
720}
721
722bool MachinePipeliner::useWindowScheduler(bool Changed) {
723 // WindowScheduler does not work for following cases:
724 // 1. when it is off.
725 // 2. when SwingModuloScheduler is successfully scheduled.
726 // 3. when pragma II is enabled.
// A pragma-specified II always wins over window scheduling (case 3).
727 if (II_setByPragma) {
728 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
729 "llvm.loop.pipeline.initiationinterval is set.\n");
730 return false;
731 }
732
// NOTE(review): doxygen lines 733-734 (the final return expression covering
// cases 1 and 2, presumably testing the window option and `Changed`) are
// missing from this extract; confirm upstream.
735}
736
737void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
738 if (SwpForceII > 0)
739 MII = SwpForceII;
740 else if (II_setByPragma > 0)
741 MII = II_setByPragma;
742 else
743 MII = std::max(ResMII, RecMII);
744}
745
746void SwingSchedulerDAG::setMAX_II() {
747 if (SwpForceII > 0)
748 MAX_II = SwpForceII;
749 else if (II_setByPragma > 0)
750 MAX_II = II_setByPragma;
751 else
752 MAX_II = MII + SwpIISearchRange;
753}
754
755/// We override the schedule function in ScheduleDAGInstrs to implement the
756/// scheduling part of the Swing Modulo Scheduling algorithm.
// NOTE(review): several doxygen lines are missing from this extract: 757
// (the signature, presumably `void SwingSchedulerDAG::schedule()`), 798/811/
// 861/874/886 (the `return MachineOptimizationRemark*(`-style first lines of
// the ORE lambdas below), 913 (presumably the declaration of
// NewInstrChanges), 922/925 (the test-annotation branch header and the MSTI
// construction), and 935 (the tail of the MVE expander condition). Confirm
// all against the upstream file.
// Phase 1: build the dependence graph, including loop-carried edges.
758 buildSchedGraph(AA);
759 const LoopCarriedEdges LCE = addLoopCarriedDependences();
760 updatePhiDependences();
761 Topo.InitDAGTopologicalSorting();
762 changeDependences();
763 postProcessDAG();
764 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
765 LLVM_DEBUG({
766 dump();
767 dbgs() << "===== Loop Carried Edges Begin =====\n";
768 for (SUnit &SU : SUnits)
769 LCE.dump(&SU, TRI, &MRI);
770 dbgs() << "===== Loop Carried Edges End =====\n";
771 });
772
// Phase 2: find recurrences and compute the initiation-interval bounds.
773 NodeSetType NodeSets;
774 findCircuits(NodeSets);
775 NodeSetType Circuits = NodeSets;
776
777 // Calculate the MII.
778 unsigned ResMII = calculateResMII();
779 unsigned RecMII = calculateRecMII(NodeSets);
780
781 fuseRecs(NodeSets);
782
783 // This flag is used for testing and can cause correctness problems.
784 if (SwpIgnoreRecMII)
785 RecMII = 0;
786
787 setMII(ResMII, RecMII);
788 setMAX_II();
789
790 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
791 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
792
793 // Can't schedule a loop without a valid MII.
794 if (MII == 0) {
795 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
796 NumFailZeroMII++;
797 Pass.ORE->emit([&]() {
799 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
800 << "Invalid Minimal Initiation Interval: 0";
801 });
802 return;
803 }
804
805 // Don't pipeline large loops.
806 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
807 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
808 << ", we don't pipeline large loops\n");
809 NumFailLargeMaxMII++;
810 Pass.ORE->emit([&]() {
812 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
813 << "Minimal Initiation Interval too large: "
814 << ore::NV("MII", (int)MII) << " > "
815 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
816 << "Refer to -pipeliner-max-mii.";
817 });
818 return;
819 }
820
// Phase 3: order the nodes.
821 computeNodeFunctions(NodeSets);
822
823 registerPressureFilter(NodeSets);
824
825 colocateNodeSets(NodeSets);
826
827 checkNodeSets(NodeSets);
828
829 LLVM_DEBUG({
830 for (auto &I : NodeSets) {
831 dbgs() << " Rec NodeSet ";
832 I.dump();
833 }
834 });
835
836 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
837
838 groupRemainingNodes(NodeSets);
839
840 removeDuplicateNodes(NodeSets);
841
842 LLVM_DEBUG({
843 for (auto &I : NodeSets) {
844 dbgs() << " NodeSet ";
845 I.dump();
846 }
847 });
848
849 computeNodeOrder(NodeSets);
850
851 // check for node order issues
852 checkValidNodeOrder(Circuits);
853
// Phase 4: attempt to schedule within [MII, MAX_II].
854 SMSchedule Schedule(Pass.MF, this);
855 Scheduled = schedulePipeline(Schedule);
856
857 if (!Scheduled){
858 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
859 NumFailNoSchedule++;
860 Pass.ORE->emit([&]() {
862 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
863 << "Unable to find schedule";
864 });
865 return;
866 }
867
868 unsigned numStages = Schedule.getMaxStageCount();
869 // No need to generate pipeline if there are no overlapped iterations.
870 if (numStages == 0) {
871 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
872 NumFailZeroStage++;
873 Pass.ORE->emit([&]() {
875 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
876 << "No need to pipeline - no overlapped iterations in schedule.";
877 });
878 return;
879 }
880 // Check that the maximum stage count is less than user-defined limit.
881 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
882 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
883 << " : too many stages, abort\n");
884 NumFailLargeMaxStage++;
885 Pass.ORE->emit([&]() {
887 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
888 << "Too many stages in schedule: "
889 << ore::NV("numStages", (int)numStages) << " > "
890 << ore::NV("SwpMaxStages", SwpMaxStages)
891 << ". Refer to -pipeliner-max-stages.";
892 });
893 return;
894 }
895
896 Pass.ORE->emit([&]() {
897 return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
898 Loop.getHeader())
899 << "Pipelined succesfully!";
900 });
901
902 // Generate the schedule as a ModuloSchedule.
// Flatten the cycle-indexed schedule into instruction order plus per-
// instruction cycle and stage maps.
903 DenseMap<MachineInstr *, int> Cycles, Stages;
904 std::vector<MachineInstr *> OrderedInsts;
905 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
906 ++Cycle) {
907 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
908 OrderedInsts.push_back(SU->getInstr());
909 Cycles[SU->getInstr()] = Cycle;
910 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
911 }
912 }
// Mirror cycle/stage info from the originals onto replacement instructions.
914 for (auto &KV : NewMIs) {
915 Cycles[KV.first] = Cycles[KV.second];
916 Stages[KV.first] = Stages[KV.second];
917 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
918 }
919
920 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
921 std::move(Stages));
923 assert(NewInstrChanges.empty() &&
924 "Cannot serialize a schedule with InstrChanges!");
926 MSTI.annotate();
927 return;
928 }
929 // The experimental code generator can't work if there are InstChanges.
930 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
931 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
932 MSE.expand();
933 } else if (MVECodeGen && NewInstrChanges.empty() &&
934 LoopPipelinerInfo->isMVEExpanderSupported() &&
936 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
937 MSE.expand();
938 } else {
939 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
940 MSE.expand();
941 MSE.cleanup();
942 }
943 ++NumPipelined;
944}
945
946/// Clean up after the software pipeliner runs.
// NOTE(review): doxygen line 947 (the signature — presumably
// `void SwingSchedulerDAG::finishBlock()` given the superclass-call comment
// below) and line 953 (the superclass call itself) are missing from this
// extract; confirm upstream.
// Delete the replacement instructions that were created but never inserted
// into the function, then drop the map that owned them.
948 for (auto &KV : NewMIs)
949 MF.deleteMachineInstr(KV.second);
950 NewMIs.clear();
951
952 // Call the superclass.
954}
955
956/// Return the register values for the operands of a Phi instruction.
957/// This function assume the instruction is a Phi.
// NOTE(review): doxygen line 958 (the first signature line — presumably
// `static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,`) is
// missing from this extract; confirm upstream.
959 Register &InitVal, Register &LoopVal) {
960 assert(Phi.isPHI() && "Expecting a Phi.");
961
// Classify each (Reg, MBB) incoming pair: the value from outside the loop
// block is the initial value, the one from the loop block is loop-carried.
962 InitVal = Register();
963 LoopVal = Register();
964 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
965 if (Phi.getOperand(i + 1).getMBB() != Loop)
966 InitVal = Phi.getOperand(i).getReg();
967 else
968 LoopVal = Phi.getOperand(i).getReg();
969
970 assert(InitVal && LoopVal && "Unexpected Phi structure.");
971}
972
973/// Return the Phi register value that comes from the loop block, or an
973/// invalid Register() if no incoming value comes from \p LoopBB.
// NOTE(review): doxygen line 974 (the first signature line — presumably
// `static Register getLoopPhiReg(const MachineInstr &Phi,`) is missing from
// this extract; confirm upstream.
975 const MachineBasicBlock *LoopBB) {
976 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
977 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
978 return Phi.getOperand(i).getReg();
979 return Register();
980}
981
982/// Return true if SUb can be reached from SUa following the chain edges.
// Iterative worklist DFS restricted to SDep::Order successors.
983static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
// NOTE(review): doxygen lines 984-985 (the declarations of `Worklist` and
// `Visited` used below) are missing from this extract; confirm upstream.
986 Worklist.push_back(SUa);
987 while (!Worklist.empty()) {
988 const SUnit *SU = Worklist.pop_back_val();
989 for (const auto &SI : SU->Succs) {
990 SUnit *SuccSU = SI.getSUnit();
// Only order (chain) edges are followed; data/anti/output edges are ignored.
991 if (SI.getKind() == SDep::Order) {
992 if (Visited.count(SuccSU))
993 continue;
994 if (SuccSU == SUb)
995 return true;
996 Worklist.push_back(SuccSU);
997 Visited.insert(SuccSU);
998 }
999 }
1000 }
1001 return false;
1002}
1003
// NOTE(review): doxygen line 1004 (the constructor signature — presumably
// `SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU)`) is missing from this
// extract; confirm upstream.
// Collect the underlying objects for the memory reference; if any object is
// not an identified object, clear IsAllIdentified so the trivially-disjoint
// check cannot fire.
1005 if (!getUnderlyingObjects())
1006 return;
1007 for (const Value *Obj : UnderlyingObjs)
1008 if (!isIdentifiedObject(Obj)) {
1009 IsAllIdentified = false;
1010 break;
1011 }
1012}
1013
// NOTE(review): doxygen line 1014 (the first signature line — presumably
// `bool SUnitWithMemInfo::isTriviallyDisjoint(`) is missing from this
// extract; confirm upstream.
1015 const SUnitWithMemInfo &Other) const {
1016 // If all underlying objects are identified objects and there is no overlap
1017 // between them, then these two instructions are disjoint.
1018 if (!IsAllIdentified || !Other.IsAllIdentified)
1019 return false;
1020 for (const Value *Obj : UnderlyingObjs)
1021 if (llvm::is_contained(Other.UnderlyingObjs, Obj))
1022 return false;
1023 return true;
1024}
1025
1026/// Collect the underlying objects for the memory references of an instruction.
1027/// This function calls the code in ValueTracking, but first checks that the
1028/// instruction has a memory operand.
1029/// Returns false if we cannot find the underlying objects.
1030bool SUnitWithMemInfo::getUnderlyingObjects() {
1031 const MachineInstr *MI = SU->getInstr();
// Only instructions with exactly one memory operand carrying an IR value
// are analyzable here.
1032 if (!MI->hasOneMemOperand())
1033 return false;
1034 MachineMemOperand *MM = *MI->memoperands_begin();
1035 if (!MM->getValue())
1036 return false;
1037 MemOpValue = MM->getValue();
1038 MemOpOffset = MM->getOffset();
// NOTE(review): doxygen line 1039 (presumably the ValueTracking call that
// populates UnderlyingObjs from MemOpValue, per the doc comment above) is
// missing from this extract; confirm upstream.
1040
1041 // TODO: A no alias scope may be valid only in a single iteration. In this
1042 // case we need to peel off it like LoopAccessAnalysis does.
1043 AATags = MM->getAAInfo();
1044 return true;
1045}
1046
/// Returns true if there is a loop-carried order dependency from \p Src to \p
/// Dst, i.e., \p Dst in a later iteration may access memory that overlaps
/// what \p Src accesses. Conservative: returns true whenever aliasing cannot
/// be disproven.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
                                 const SUnitWithMemInfo &Dst,
                                 BatchAAResults &BAA,
                                 const TargetInstrInfo *TII,
                                 const TargetRegisterInfo *TRI,
                                 const SwingSchedulerDAG *SSD) {
  // Cheap structural checks first: provably-disjoint identified objects
  // cannot alias.
  if (Src.isTriviallyDisjoint(Dst))
    return false;
  // If Dst is already an in-iteration successor of Src, the ordering is
  // enforced without a loop-carried edge.
  if (isSuccOrder(Src.SU, Dst.SU))
    return false;

  MachineInstr &SrcMI = *Src.SU->getInstr();
  MachineInstr &DstMI = *Dst.SU->getInstr();

  // Target-dependent overlap analysis across iterations (base/offset based).
  if (!SSD->mayOverlapInLaterIter(&SrcMI, &DstMI))
    return false;

  // Second, the more expensive check that uses alias analysis on the
  // base registers. If they alias, and the load offset is less than
  // the store offset, then mark the dependence as loop carried.
  // Accesses with unknown underlying values must be treated conservatively.
  if (Src.isUnknown() || Dst.isUnknown())
    return true;
  if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
    return true;

  if (BAA.isNoAlias(
          MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags),
          MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags)))
    return false;

  // AliasAnalysis sometimes gives up on following the underlying
  // object. In such a case, separate checks for underlying objects may
  // prove that there are no aliases between two accesses.
  for (const Value *SrcObj : Src.UnderlyingObjs)
    for (const Value *DstObj : Dst.UnderlyingObjs)
      if (!BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags),
                         MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags)))
        return true;

  // Every underlying-object pair was proven no-alias.
  return false;
}
1090
1091void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1092 const MachineInstr *MI = SU->getInstr();
1093 if (!MI->mayLoadOrStore())
1094 return;
1095 (MI->mayStore() ? Stores : Loads).emplace_back(SU);
1096}
1097
1099 SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1100 const TargetRegisterInfo *TRI)
1101 : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1102 LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1103
1105 // Traverse all instructions and extract only what we are targetting.
1106 for (auto &SU : SUnits) {
1107 auto Tagged = getInstrTag(&SU);
1108
1109 // This instruction has no loop-carried order-dependencies.
1110 if (!Tagged)
1111 continue;
1112 TaggedSUnits.emplace_back(&SU, *Tagged);
1113 }
1114
1115 computeDependenciesAux();
1116}
1117
1118std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1119LoopCarriedOrderDepsTracker::getInstrTag(SUnit *SU) const {
1120 MachineInstr *MI = SU->getInstr();
1121 if (TII->isGlobalMemoryObject(MI))
1122 return InstrTag::Barrier;
1123
1124 if (MI->mayStore() ||
1125 (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1126 return InstrTag::LoadOrStore;
1127
1128 if (MI->mayRaiseFPException())
1129 return InstrTag::FPExceptions;
1130
1131 return std::nullopt;
1132}
1133
1134void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1135 const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst) {
1136 // Avoid self-dependencies.
1137 if (Src.SU == Dst.SU)
1138 return;
1139
1140 if (hasLoopCarriedMemDep(Src, Dst, *BAA, TII, TRI, DAG))
1141 LoopCarried[Src.SU->NodeNum].set(Dst.SU->NodeNum);
1142}
1143
1144void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1145 const LoadStoreChunk &From, const LoadStoreChunk &To) {
1146 // Add load-to-store dependencies (WAR).
1147 for (const SUnitWithMemInfo &Src : From.Loads)
1148 for (const SUnitWithMemInfo &Dst : To.Stores)
1149 addDependenciesBetweenSUs(Src, Dst);
1150
1151 // Add store-to-load dependencies (RAW).
1152 for (const SUnitWithMemInfo &Src : From.Stores)
1153 for (const SUnitWithMemInfo &Dst : To.Loads)
1154 addDependenciesBetweenSUs(Src, Dst);
1155
1156 // Add store-to-store dependencies (WAW).
1157 for (const SUnitWithMemInfo &Src : From.Stores)
1158 for (const SUnitWithMemInfo &Dst : To.Stores)
1159 addDependenciesBetweenSUs(Src, Dst);
1160}
1161
1162void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
1164 for (const auto &TSU : TaggedSUnits) {
1165 InstrTag Tag = TSU.getTag();
1166 SUnit *SU = TSU.getPointer();
1167 switch (Tag) {
1168 case InstrTag::Barrier:
1169 Chunks.emplace_back();
1170 break;
1171 case InstrTag::LoadOrStore:
1172 Chunks.back().append(SU);
1173 break;
1174 case InstrTag::FPExceptions:
1175 // TODO: Handle this properly.
1176 break;
1177 }
1178 }
1179
1180 // Add dependencies between memory operations. If there are one or more
1181 // barrier events between two memory instructions, we don't add a
1182 // loop-carried dependence for them.
1183 for (const LoadStoreChunk &Chunk : Chunks)
1184 addLoopCarriedDepenenciesForChunks(Chunk, Chunk);
1185
1186 // TODO: If there are multiple barrier instructions, dependencies from the
1187 // last barrier instruction (or load/store below it) to the first barrier
1188 // instruction (or load/store above it).
1189}
1190
1191/// Add a chain edge between a load and store if the store can be an
1192/// alias of the load on a subsequent iteration, i.e., a loop carried
1193/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1194/// but that code doesn't create loop carried dependences.
1195/// TODO: Also compute output-dependencies.
1196LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1197 LoopCarriedEdges LCE;
1198
1199 // Add loop-carried order-dependencies
1200 LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1201 LCODTracker.computeDependencies();
1202 for (unsigned I = 0; I != SUnits.size(); I++)
1203 for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
1204 LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
1205
1206 LCE.modifySUnits(SUnits, TII);
1207 return LCE;
1208}
1209
1210/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1211/// processes dependences for PHIs. This function adds true dependences
1212/// from a PHI to a use, and a loop carried dependence from the use to the
1213/// PHI. The loop carried dependence is represented as an anti dependence
1214/// edge. This function also removes chain dependences between unrelated
1215/// PHIs.
1216void SwingSchedulerDAG::updatePhiDependences() {
1217 SmallVector<SDep, 4> RemoveDeps;
1218 const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1219
1220 // Iterate over each DAG node.
1221 for (SUnit &I : SUnits) {
1222 RemoveDeps.clear();
1223 // Set to true if the instruction has an operand defined by a Phi.
1224 Register HasPhiUse;
1225 Register HasPhiDef;
1226 MachineInstr *MI = I.getInstr();
1227 // Iterate over each operand, and we process the definitions.
1228 for (const MachineOperand &MO : MI->operands()) {
1229 if (!MO.isReg())
1230 continue;
1231 Register Reg = MO.getReg();
1232 if (MO.isDef()) {
1233 // If the register is used by a Phi, then create an anti dependence.
1235 UI = MRI.use_instr_begin(Reg),
1236 UE = MRI.use_instr_end();
1237 UI != UE; ++UI) {
1238 MachineInstr *UseMI = &*UI;
1239 SUnit *SU = getSUnit(UseMI);
1240 if (SU != nullptr && UseMI->isPHI()) {
1241 if (!MI->isPHI()) {
1242 SDep Dep(SU, SDep::Anti, Reg);
1243 Dep.setLatency(1);
1244 I.addPred(Dep);
1245 } else {
1246 HasPhiDef = Reg;
1247 // Add a chain edge to a dependent Phi that isn't an existing
1248 // predecessor.
1249
1250 // %3:intregs = PHI %21:intregs, %bb.6, %7:intregs, %bb.1 - SU0
1251 // %7:intregs = PHI %21:intregs, %bb.6, %13:intregs, %bb.1 - SU1
1252 // %27:intregs = A2_zxtb %3:intregs - SU2
1253 // %13:intregs = C2_muxri %45:predregs, 0, %46:intreg
1254 // If we have dependent phis, SU0 should be the successor of SU1
1255 // not the other way around. (it used to be SU1 is the successor
1256 // of SU0). In some cases, SU0 is scheduled earlier than SU1
1257 // resulting in bad IR as we do not have a value that can be used
1258 // by SU2.
1259
1260 if (SU->NodeNum < I.NodeNum && !SU->isPred(&I))
1261 SU->addPred(SDep(&I, SDep::Barrier));
1262 }
1263 }
1264 }
1265 } else if (MO.isUse()) {
1266 // If the register is defined by a Phi, then create a true dependence.
1267 MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1268 if (DefMI == nullptr)
1269 continue;
1270 SUnit *SU = getSUnit(DefMI);
1271 if (SU != nullptr && DefMI->isPHI()) {
1272 if (!MI->isPHI()) {
1273 SDep Dep(SU, SDep::Data, Reg);
1274 Dep.setLatency(0);
1275 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
1276 &SchedModel);
1277 I.addPred(Dep);
1278 } else {
1279 HasPhiUse = Reg;
1280 // Add a chain edge to a dependent Phi that isn't an existing
1281 // predecessor.
1282 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1283 I.addPred(SDep(SU, SDep::Barrier));
1284 }
1285 }
1286 }
1287 }
1288 // Remove order dependences from an unrelated Phi.
1289 if (!SwpPruneDeps)
1290 continue;
1291 for (auto &PI : I.Preds) {
1292 MachineInstr *PMI = PI.getSUnit()->getInstr();
1293 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1294 if (I.getInstr()->isPHI()) {
1295 if (PMI->getOperand(0).getReg() == HasPhiUse)
1296 continue;
1297 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1298 continue;
1299 }
1300 RemoveDeps.push_back(PI);
1301 }
1302 }
1303 for (const SDep &D : RemoveDeps)
1304 I.removePred(D);
1305 }
1306}
1307
1308/// Iterate over each DAG node and see if we can change any dependences
1309/// in order to reduce the recurrence MII.
1310void SwingSchedulerDAG::changeDependences() {
1311 // See if an instruction can use a value from the previous iteration.
1312 // If so, we update the base and offset of the instruction and change
1313 // the dependences.
1314 for (SUnit &I : SUnits) {
1315 unsigned BasePos = 0, OffsetPos = 0;
1316 Register NewBase;
1317 int64_t NewOffset = 0;
1318 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1319 NewOffset))
1320 continue;
1321
1322 // Get the MI and SUnit for the instruction that defines the original base.
1323 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1324 MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1325 if (!DefMI)
1326 continue;
1327 SUnit *DefSU = getSUnit(DefMI);
1328 if (!DefSU)
1329 continue;
1330 // Get the MI and SUnit for the instruction that defins the new base.
1331 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1332 if (!LastMI)
1333 continue;
1334 SUnit *LastSU = getSUnit(LastMI);
1335 if (!LastSU)
1336 continue;
1337
1338 if (Topo.IsReachable(&I, LastSU))
1339 continue;
1340
1341 // Remove the dependence. The value now depends on a prior iteration.
1343 for (const SDep &P : I.Preds)
1344 if (P.getSUnit() == DefSU)
1345 Deps.push_back(P);
1346 for (const SDep &D : Deps) {
1347 Topo.RemovePred(&I, D.getSUnit());
1348 I.removePred(D);
1349 }
1350 // Remove the chain dependence between the instructions.
1351 Deps.clear();
1352 for (auto &P : LastSU->Preds)
1353 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1354 Deps.push_back(P);
1355 for (const SDep &D : Deps) {
1356 Topo.RemovePred(LastSU, D.getSUnit());
1357 LastSU->removePred(D);
1358 }
1359
1360 // Add a dependence between the new instruction and the instruction
1361 // that defines the new base.
1362 SDep Dep(&I, SDep::Anti, NewBase);
1363 Topo.AddPred(LastSU, &I);
1364 LastSU->addPred(Dep);
1365
1366 // Remember the base and offset information so that we can update the
1367 // instruction during code generation.
1368 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1369 }
1370}
1371
1372/// Create an instruction stream that represents a single iteration and stage of
1373/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1374/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1375/// function is an approximation of SMSchedule::finalizeSchedule with all
1376/// non-const operations removed.
1378 SMSchedule &Schedule,
1379 std::vector<MachineInstr *> &OrderedInsts,
1382
1383 // Move all instructions to the first stage from the later stages.
1384 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1385 ++Cycle) {
1386 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1387 Stage <= LastStage; ++Stage) {
1388 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1389 Cycle + Stage * Schedule.getInitiationInterval()))) {
1390 Instrs[Cycle].push_front(SU);
1391 }
1392 }
1393 }
1394
1395 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1396 ++Cycle) {
1397 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1398 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1399 for (SUnit *SU : CycleInstrs) {
1400 MachineInstr *MI = SU->getInstr();
1401 OrderedInsts.push_back(MI);
1402 Stages[MI] = Schedule.stageScheduled(SU);
1403 }
1404 }
1405}
1406
1407namespace {
1408
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // Usage count per critical resource. NOTE: the key is an InstrStage
  // functional-unit mask when itineraries are used, but a ProcResourceIdx
  // when the MC scheduling model is used — the two key spaces are never
  // mixed because exactly one of the models is active.
  DenseMap<InstrStage::FuncUnits, unsigned> Resources;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  // On return, \p F holds the unit mask (itineraries) or resource index
  // (sched model) that achieved the minimum; it is left untouched if no
  // stage/resource entry was examined (min stays UINT_MAX).
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        // Each set bit in the mask is one alternative unit for this stage.
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        // Entries that hold no cycles consume no resource.
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Exactly one unit choice => the unit is critical; count its demand.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  /// Fewer alternatives => higher priority; ties are broken by the demand on
  /// the critical resource recorded in Resources.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1512
1513/// Calculate the maximum register pressure of the scheduled instructions stream
1514class HighRegisterPressureDetector {
1515 MachineBasicBlock *OrigMBB;
1516 const MachineRegisterInfo &MRI;
1517 const TargetRegisterInfo *TRI;
1518
1519 const unsigned PSetNum;
1520
1521 // Indexed by PSet ID
1522 // InitSetPressure takes into account the register pressure of live-in
1523 // registers. It's not depend on how the loop is scheduled, so it's enough to
1524 // calculate them once at the beginning.
1525 std::vector<unsigned> InitSetPressure;
1526
1527 // Indexed by PSet ID
1528 // Upper limit for each register pressure set
1529 std::vector<unsigned> PressureSetLimit;
1530
1531 DenseMap<MachineInstr *, RegisterOperands> ROMap;
1532
1533 using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
1534
1535public:
1536 using OrderedInstsTy = std::vector<MachineInstr *>;
1537 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1538
1539private:
1540 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1541 if (Pressures.size() == 0) {
1542 dbgs() << "[]";
1543 } else {
1544 char Prefix = '[';
1545 for (unsigned P : Pressures) {
1546 dbgs() << Prefix << P;
1547 Prefix = ' ';
1548 }
1549 dbgs() << ']';
1550 }
1551 }
1552
1553 void dumpPSet(Register Reg) const {
1554 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1555 // FIXME: The static_cast is a bug compensating bugs in the callers.
1556 VirtRegOrUnit VRegOrUnit =
1557 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1558 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1559 for (auto PSetIter = MRI.getPressureSets(VRegOrUnit); PSetIter.isValid();
1560 ++PSetIter) {
1561 dbgs() << *PSetIter << ' ';
1562 }
1563 dbgs() << '\n';
1564 }
1565
1566 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1567 Register Reg) const {
1568 // FIXME: The static_cast is a bug compensating bugs in the callers.
1569 VirtRegOrUnit VRegOrUnit =
1570 Reg.isVirtual() ? VirtRegOrUnit(Reg)
1571 : VirtRegOrUnit(static_cast<MCRegUnit>(Reg.id()));
1572 auto PSetIter = MRI.getPressureSets(VRegOrUnit);
1573 unsigned Weight = PSetIter.getWeight();
1574 for (; PSetIter.isValid(); ++PSetIter)
1575 Pressure[*PSetIter] += Weight;
1576 }
1577
1578 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1579 Register Reg) const {
1580 auto PSetIter = MRI.getPressureSets(VirtRegOrUnit(Reg));
1581 unsigned Weight = PSetIter.getWeight();
1582 for (; PSetIter.isValid(); ++PSetIter) {
1583 auto &P = Pressure[*PSetIter];
1584 assert(P >= Weight &&
1585 "register pressure must be greater than or equal weight");
1586 P -= Weight;
1587 }
1588 }
1589
1590 // Return true if Reg is reserved one, for example, stack pointer
1591 bool isReservedRegister(Register Reg) const {
1592 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1593 }
1594
1595 bool isDefinedInThisLoop(Register Reg) const {
1596 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1597 }
1598
1599 // Search for live-in variables. They are factored into the register pressure
1600 // from the begining. Live-in variables used by every iteration should be
1601 // considered as alive throughout the loop. For example, the variable `c` in
1602 // following code. \code
1603 // int c = ...;
1604 // for (int i = 0; i < n; i++)
1605 // a[i] += b[i] + c;
1606 // \endcode
1607 void computeLiveIn() {
1608 DenseSet<Register> Used;
1609 for (auto &MI : *OrigMBB) {
1610 if (MI.isDebugInstr())
1611 continue;
1612 for (auto &Use : ROMap[&MI].Uses) {
1613 // FIXME: The static_cast is a bug.
1614 Register Reg =
1615 Use.VRegOrUnit.isVirtualReg()
1616 ? Use.VRegOrUnit.asVirtualReg()
1617 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1618 // Ignore the variable that appears only on one side of phi instruction
1619 // because it's used only at the first iteration.
1620 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1621 continue;
1622 if (isReservedRegister(Reg))
1623 continue;
1624 if (isDefinedInThisLoop(Reg))
1625 continue;
1626 Used.insert(Reg);
1627 }
1628 }
1629
1630 for (auto LiveIn : Used)
1631 increaseRegisterPressure(InitSetPressure, LiveIn);
1632 }
1633
1634 // Calculate the upper limit of each pressure set
1635 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1636 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1637 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1638 }
1639
1640 // There are two patterns of last-use.
1641 // - by an instruction of the current iteration
1642 // - by a phi instruction of the next iteration (loop carried value)
1643 //
1644 // Furthermore, following two groups of instructions are executed
1645 // simultaneously
1646 // - next iteration's phi instructions in i-th stage
1647 // - current iteration's instructions in i+1-th stage
1648 //
1649 // This function calculates the last-use of each register while taking into
1650 // account the above two patterns.
1651 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1652 Instr2StageTy &Stages) const {
1653 // We treat virtual registers that are defined and used in this loop.
1654 // Following virtual register will be ignored
1655 // - live-in one
1656 // - defined but not used in the loop (potentially live-out)
1657 DenseSet<Register> TargetRegs;
1658 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1659 if (isDefinedInThisLoop(Reg))
1660 TargetRegs.insert(Reg);
1661 };
1662 for (MachineInstr *MI : OrderedInsts) {
1663 if (MI->isPHI()) {
1664 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1665 UpdateTargetRegs(Reg);
1666 } else {
1667 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1668 // FIXME: The static_cast is a bug.
1669 Register Reg = Use.VRegOrUnit.isVirtualReg()
1670 ? Use.VRegOrUnit.asVirtualReg()
1671 : Register(static_cast<unsigned>(
1672 Use.VRegOrUnit.asMCRegUnit()));
1673 UpdateTargetRegs(Reg);
1674 }
1675 }
1676 }
1677
1678 const auto InstrScore = [&Stages](MachineInstr *MI) {
1679 return Stages[MI] + MI->isPHI();
1680 };
1681
1682 DenseMap<Register, MachineInstr *> LastUseMI;
1683 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1684 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1685 // FIXME: The static_cast is a bug.
1686 Register Reg =
1687 Use.VRegOrUnit.isVirtualReg()
1688 ? Use.VRegOrUnit.asVirtualReg()
1689 : Register(static_cast<unsigned>(Use.VRegOrUnit.asMCRegUnit()));
1690 if (!TargetRegs.contains(Reg))
1691 continue;
1692 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1693 if (!Inserted) {
1694 MachineInstr *Orig = Ite->second;
1695 MachineInstr *New = MI;
1696 if (InstrScore(Orig) < InstrScore(New))
1697 Ite->second = New;
1698 }
1699 }
1700 }
1701
1702 Instr2LastUsesTy LastUses;
1703 for (auto [Reg, MI] : LastUseMI)
1704 LastUses[MI].insert(Reg);
1705 return LastUses;
1706 }
1707
1708 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1709 // iterations and check the register pressure at the point where all stages
1710 // overlapping.
1711 //
1712 // An example of unrolled loop where #Stage is 4..
1713 // Iter i+0 i+1 i+2 i+3
1714 // ------------------------
1715 // Stage 0
1716 // Stage 1 0
1717 // Stage 2 1 0
1718 // Stage 3 2 1 0 <- All stages overlap
1719 //
1720 std::vector<unsigned>
1721 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1722 Instr2StageTy &Stages,
1723 const unsigned StageCount) const {
1724 using RegSetTy = SmallDenseSet<Register, 16>;
1725
1726 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1727 // manage the liveness of the registers independently by iterations.
1728 SmallVector<RegSetTy> LiveRegSets(StageCount);
1729
1730 auto CurSetPressure = InitSetPressure;
1731 auto MaxSetPressure = InitSetPressure;
1732 auto LastUses = computeLastUses(OrderedInsts, Stages);
1733
1734 LLVM_DEBUG({
1735 dbgs() << "Ordered instructions:\n";
1736 for (MachineInstr *MI : OrderedInsts) {
1737 dbgs() << "Stage " << Stages[MI] << ": ";
1738 MI->dump();
1739 }
1740 });
1741
1742 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1743 VirtRegOrUnit VRegOrUnit) {
1744 // FIXME: The static_cast is a bug.
1745 Register Reg =
1746 VRegOrUnit.isVirtualReg()
1747 ? VRegOrUnit.asVirtualReg()
1748 : Register(static_cast<unsigned>(VRegOrUnit.asMCRegUnit()));
1749 if (!Reg.isValid() || isReservedRegister(Reg))
1750 return;
1751
1752 bool Inserted = RegSet.insert(Reg).second;
1753 if (!Inserted)
1754 return;
1755
1756 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1757 increaseRegisterPressure(CurSetPressure, Reg);
1758 LLVM_DEBUG(dumpPSet(Reg));
1759 };
1760
1761 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1762 Register Reg) {
1763 if (!Reg.isValid() || isReservedRegister(Reg))
1764 return;
1765
1766 // live-in register
1767 if (!RegSet.contains(Reg))
1768 return;
1769
1770 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1771 RegSet.erase(Reg);
1772 decreaseRegisterPressure(CurSetPressure, Reg);
1773 LLVM_DEBUG(dumpPSet(Reg));
1774 };
1775
1776 for (unsigned I = 0; I < StageCount; I++) {
1777 for (MachineInstr *MI : OrderedInsts) {
1778 const auto Stage = Stages[MI];
1779 if (I < Stage)
1780 continue;
1781
1782 const unsigned Iter = I - Stage;
1783
1784 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1785 InsertReg(LiveRegSets[Iter], Def.VRegOrUnit);
1786
1787 for (auto LastUse : LastUses[MI]) {
1788 if (MI->isPHI()) {
1789 if (Iter != 0)
1790 EraseReg(LiveRegSets[Iter - 1], LastUse);
1791 } else {
1792 EraseReg(LiveRegSets[Iter], LastUse);
1793 }
1794 }
1795
1796 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1797 MaxSetPressure[PSet] =
1798 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1799
1800 LLVM_DEBUG({
1801 dbgs() << "CurSetPressure=";
1802 dumpRegisterPressures(CurSetPressure);
1803 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1804 MI->dump();
1805 });
1806 }
1807 }
1808
1809 return MaxSetPressure;
1810 }
1811
1812public:
1813 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1814 const MachineFunction &MF)
1815 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1816 TRI(MF.getSubtarget().getRegisterInfo()),
1817 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1818 PressureSetLimit(PSetNum, 0) {}
1819
1820 // Used to calculate register pressure, which is independent of loop
1821 // scheduling.
1822 void init(const RegisterClassInfo &RCI) {
1823 for (MachineInstr &MI : *OrigMBB) {
1824 if (MI.isDebugInstr())
1825 continue;
1826 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1827 }
1828
1829 computeLiveIn();
1830 computePressureSetLimit(RCI);
1831 }
1832
1833 // Calculate the maximum register pressures of the loop and check if they
1834 // exceed the limit
1835 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1836 const unsigned MaxStage) const {
1838 "the percentage of the margin must be between 0 to 100");
1839
1840 OrderedInstsTy OrderedInsts;
1841 Instr2StageTy Stages;
1842 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1843 const auto MaxSetPressure =
1844 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1845
1846 LLVM_DEBUG({
1847 dbgs() << "Dump MaxSetPressure:\n";
1848 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1849 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1850 }
1851 dbgs() << '\n';
1852 });
1853
1854 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1855 unsigned Limit = PressureSetLimit[PSet];
1856 unsigned Margin = Limit * RegPressureMargin / 100;
1857 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1858 << " Margin=" << Margin << "\n");
1859 if (Limit < MaxSetPressure[PSet] + Margin) {
1860 LLVM_DEBUG(
1861 dbgs()
1862 << "Rejected the schedule because of too high register pressure\n");
1863 return true;
1864 }
1865 }
1866 return false;
1867 }
1868};
1869
1870} // end anonymous namespace
1871
1872/// Calculate the resource constrained minimum initiation interval for the
1873/// specified loop. We use the DFA to model the resources needed for
1874/// each instruction, and we ignore dependences. A different DFA is created
1875/// for each cycle that is required. When adding a new instruction, we attempt
1876/// to add it to each existing DFA, until a legal space is found. If the
1877/// instruction cannot be reserved in an existing DFA, we create a new one.
1878unsigned SwingSchedulerDAG::calculateResMII() {
1879 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1880 ResourceManager RM(&MF.getSubtarget(), this);
1881 return RM.calculateResMII();
1882}
1883
1884/// Calculate the recurrence-constrainted minimum initiation interval.
1885/// Iterate over each circuit. Compute the delay(c) and distance(c)
1886/// for each circuit. The II needs to satisfy the inequality
1887/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1888/// II that satisfies the inequality, and the RecMII is the maximum
1889/// of those values.
1890unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1891 unsigned RecMII = 0;
1892
1893 for (NodeSet &Nodes : NodeSets) {
1894 if (Nodes.empty())
1895 continue;
1896
1897 unsigned Delay = Nodes.getLatency();
1898 unsigned Distance = 1;
1899
1900 // ii = ceil(delay / distance)
1901 unsigned CurMII = (Delay + Distance - 1) / Distance;
1902 Nodes.setRecMII(CurMII);
1903 if (CurMII > RecMII)
1904 RecMII = CurMII;
1905 }
1906
1907 return RecMII;
1908}
1909
/// Create the adjacency structure of the nodes in the graph.
/// AdjK[i] lists the successors of node i; loop-carried store/load chains and
/// output-dependence chains are added as explicit back-edges so that circuit
/// enumeration can find recurrences.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  // Maps the tail of an output-dependence chain to its head, so only one
  // back-edge per chain is created (below, after the main loop).
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
        int N = OE.getDst()->NodeNum;
        int BackEdge = i;
        // If this node is already the tail of a recorded chain, extend the
        // chain: remap its head onto the new tail.
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
        continue;

      // This code is retained to preserve previous behavior and prevent
      // regression. This condition means that anti-dependencies within an
      // iteration are ignored when searching circuits. Therefore it's natural
      // to consider this dependence as well.
      // FIXME: Remove this code if it doesn't have significant impact on
      // performance.
      if (OE.isAntiDep())
        continue;

      int N = OE.getDst()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
      SUnit *Src = IE.getSrc();
      SUnit *Dst = IE.getDst();
      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
        continue;
      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
        int N = Src->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1973
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node. This is the recursive CIRCUIT step of Johnson's
/// elementary-circuit enumeration: \p V is the current node, \p S the start
/// node of the search, and \p HasBackedge records whether the path taken so
/// far already contains a back-edge (such circuits are not collected).
/// Returns true if a circuit through \p V was found.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Cap the number of enumerated paths to keep compile time bounded.
    if (NumPaths > MaxPaths)
      break;
    // Nodes below the start node belong to earlier searches; skip them.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start; record the nodes on the stack.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      // Recurse; the path gains a back-edge if W precedes V in topological
      // order.
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V: leave V blocked, and arrange for it to be
    // unblocked if any successor later participates in a circuit.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
2015
2016/// Unblock a node in the circuit finding algorithm.
2017void SwingSchedulerDAG::Circuits::unblock(int U) {
2018 Blocked.reset(U);
2019 SmallPtrSet<SUnit *, 4> &BU = B[U];
2020 while (!BU.empty()) {
2021 SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
2022 assert(SI != BU.end() && "Invalid B set.");
2023 SUnit *W = *SI;
2024 BU.erase(W);
2025 if (Blocked.test(W->NodeNum))
2026 unblock(W->NodeNum);
2027 }
2028}
2029
2030/// Identify all the elementary circuits in the dependence graph using
2031/// Johnson's circuit algorithm.
2032void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
2033 Circuits Cir(SUnits, Topo);
2034 // Create the adjacency structure.
2035 Cir.createAdjacencyStructure(this);
2036 for (int I = 0, E = SUnits.size(); I != E; ++I) {
2037 Cir.reset();
2038 Cir.circuit(I, I, NodeSets, this);
2039 }
2040}
2041
2042// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
2043// is loop-carried to the USE in next iteration. This will help pipeliner avoid
2044// additional copies that are needed across iterations. An artificial dependence
2045// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
2046
2047// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
2048// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
2049// PHI-------True-Dep------> USEOfPhi
2050
2051// The mutation creates
2052// USEOfPHI -------Artificial-Dep---> SRCOfCopy
2053
2054// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
2055// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
2056// late to avoid additional copies across iterations. The possible scheduling
2057// order would be
2058// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
2059
2060void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
2061 for (SUnit &SU : DAG->SUnits) {
2062 // Find the COPY/REG_SEQUENCE instruction.
2063 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
2064 continue;
2065
2066 // Record the loop carried PHIs.
2068 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
2070
2071 for (auto &Dep : SU.Preds) {
2072 SUnit *TmpSU = Dep.getSUnit();
2073 MachineInstr *TmpMI = TmpSU->getInstr();
2074 SDep::Kind DepKind = Dep.getKind();
2075 // Save the loop carried PHI.
2076 if (DepKind == SDep::Anti && TmpMI->isPHI())
2077 PHISUs.push_back(TmpSU);
2078 // Save the source of COPY/REG_SEQUENCE.
2079 // If the source has no pre-decessors, we will end up creating cycles.
2080 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
2081 SrcSUs.push_back(TmpSU);
2082 }
2083
2084 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
2085 continue;
2086
2087 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2088 // SUnit to the container.
2090 // Do not use iterator based loop here as we are updating the container.
2091 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
2092 for (auto &Dep : PHISUs[Index]->Succs) {
2093 if (Dep.getKind() != SDep::Data)
2094 continue;
2095
2096 SUnit *TmpSU = Dep.getSUnit();
2097 MachineInstr *TmpMI = TmpSU->getInstr();
2098 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
2099 PHISUs.push_back(TmpSU);
2100 continue;
2101 }
2102 UseSUs.push_back(TmpSU);
2103 }
2104 }
2105
2106 if (UseSUs.size() == 0)
2107 continue;
2108
2109 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
2110 // Add the artificial dependencies if it does not form a cycle.
2111 for (auto *I : UseSUs) {
2112 for (auto *Src : SrcSUs) {
2113 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
2114 Src->addPred(SDep(I, SDep::Artificial));
2115 SDAG->Topo.AddPred(Src, I);
2116 }
2117 }
2118 }
2119 }
2120}
2121
/// Compute several functions needed to order the nodes for scheduling.
/// ASAP - Earliest time to schedule a node.
/// ALAP - Latest time to schedule a node.
/// MOV - Mobility function, difference between ALAP and ASAP.
/// D - Depth of each node.
/// H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth in a forward topological walk, so all
  // predecessors are processed before their successors.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &IE : DDG->getInEdges(SU)) {
      SUnit *Pred = IE.getSrc();
      // ZeroLatencyDepth counts the longest chain of zero-latency in-edges,
      // regardless of whether the dependence is otherwise ignored.
      if (IE.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
      if (IE.ignoreDependence(true))
        continue;
      // Loop-carried edges (distance > 0) relax the bound by one II per
      // iteration of distance.
      asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
                                  IE.getDistance() * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV in a reverse topological walk.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const auto &OE : DDG->getOutEdges(SU)) {
      SUnit *Succ = OE.getDst();
      if (Succ->isBoundaryNode())
        continue;
      if (OE.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
      if (OE.ignoreDependence(true))
        continue;
      alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
                                  OE.getDistance() * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
2198
2199/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2200/// as the predecessors of the elements of NodeOrder that are not also in
2201/// NodeOrder.
2204 const NodeSet *S = nullptr) {
2205 Preds.clear();
2206
2207 for (SUnit *SU : NodeOrder) {
2208 for (const auto &IE : DDG->getInEdges(SU)) {
2209 SUnit *PredSU = IE.getSrc();
2210 if (S && S->count(PredSU) == 0)
2211 continue;
2212 if (IE.ignoreDependence(true))
2213 continue;
2214 if (NodeOrder.count(PredSU) == 0)
2215 Preds.insert(PredSU);
2216 }
2217
2218 // FIXME: The following loop-carried dependencies may also need to be
2219 // considered.
2220 // - Physical register dependencies (true-dependence and WAW).
2221 // - Memory dependencies.
2222 for (const auto &OE : DDG->getOutEdges(SU)) {
2223 SUnit *SuccSU = OE.getDst();
2224 if (!OE.isAntiDep())
2225 continue;
2226 if (S && S->count(SuccSU) == 0)
2227 continue;
2228 if (NodeOrder.count(SuccSU) == 0)
2229 Preds.insert(SuccSU);
2230 }
2231 }
2232 return !Preds.empty();
2233}
2234
2235/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2236/// as the successors of the elements of NodeOrder that are not also in
2237/// NodeOrder.
2240 const NodeSet *S = nullptr) {
2241 Succs.clear();
2242
2243 for (SUnit *SU : NodeOrder) {
2244 for (const auto &OE : DDG->getOutEdges(SU)) {
2245 SUnit *SuccSU = OE.getDst();
2246 if (S && S->count(SuccSU) == 0)
2247 continue;
2248 if (OE.ignoreDependence(false))
2249 continue;
2250 if (NodeOrder.count(SuccSU) == 0)
2251 Succs.insert(SuccSU);
2252 }
2253
2254 // FIXME: The following loop-carried dependencies may also need to be
2255 // considered.
2256 // - Physical register dependnecies (true-dependnece and WAW).
2257 // - Memory dependencies.
2258 for (const auto &IE : DDG->getInEdges(SU)) {
2259 SUnit *PredSU = IE.getSrc();
2260 if (!IE.isAntiDep())
2261 continue;
2262 if (S && S->count(PredSU) == 0)
2263 continue;
2264 if (NodeOrder.count(PredSU) == 0)
2265 Succs.insert(PredSU);
2266 }
2267 }
2268 return !Succs.empty();
2269}
2270
2271/// Return true if there is a path from the specified node to any of the nodes
2272/// in DestNodes. Keep track and return the nodes in any path.
2273static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
2274 SetVector<SUnit *> &DestNodes,
2275 SetVector<SUnit *> &Exclude,
2276 SmallPtrSet<SUnit *, 8> &Visited,
2277 SwingSchedulerDDG *DDG) {
2278 if (Cur->isBoundaryNode())
2279 return false;
2280 if (Exclude.contains(Cur))
2281 return false;
2282 if (DestNodes.contains(Cur))
2283 return true;
2284 if (!Visited.insert(Cur).second)
2285 return Path.contains(Cur);
2286 bool FoundPath = false;
2287 for (const auto &OE : DDG->getOutEdges(Cur))
2288 if (!OE.ignoreDependence(false))
2289 FoundPath |=
2290 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2291 for (const auto &IE : DDG->getInEdges(Cur))
2292 if (IE.isAntiDep() && IE.getDistance() == 0)
2293 FoundPath |=
2294 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2295 if (FoundPath)
2296 Path.insert(Cur);
2297 return FoundPath;
2298}
2299
2300/// Compute the live-out registers for the instructions in a node-set.
2301/// The live-out registers are those that are defined in the node-set,
2302/// but not used. Except for use operands of Phis.
2304 NodeSet &NS) {
2309 for (SUnit *SU : NS) {
2310 const MachineInstr *MI = SU->getInstr();
2311 if (MI->isPHI())
2312 continue;
2313 for (const MachineOperand &MO : MI->all_uses()) {
2314 Register Reg = MO.getReg();
2315 if (Reg.isVirtual())
2316 Uses.insert(VirtRegOrUnit(Reg));
2317 else if (MRI.isAllocatable(Reg))
2318 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2319 Uses.insert(VirtRegOrUnit(Unit));
2320 }
2321 }
2322 for (SUnit *SU : NS)
2323 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2324 if (!MO.isDead()) {
2325 Register Reg = MO.getReg();
2326 if (Reg.isVirtual()) {
2327 if (!Uses.count(VirtRegOrUnit(Reg)))
2328 LiveOutRegs.emplace_back(VirtRegOrUnit(Reg),
2330 } else if (MRI.isAllocatable(Reg)) {
2331 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2332 if (!Uses.count(VirtRegOrUnit(Unit)))
2333 LiveOutRegs.emplace_back(VirtRegOrUnit(Unit),
2335 }
2336 }
2337 RPTracker.addLiveRegs(LiveOutRegs);
2338}
2339
2340/// A heuristic to filter nodes in recurrent node-sets if the register
2341/// pressure of a set is too high.
2342void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2343 for (auto &NS : NodeSets) {
2344 // Skip small node-sets since they won't cause register pressure problems.
2345 if (NS.size() <= 2)
2346 continue;
2347 IntervalPressure RecRegPressure;
2348 RegPressureTracker RecRPTracker(RecRegPressure);
2349 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2350 computeLiveOuts(MF, RecRPTracker, NS);
2351 RecRPTracker.closeBottom();
2352
2353 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2354 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2355 return A->NodeNum > B->NodeNum;
2356 });
2357
2358 for (auto &SU : SUnits) {
2359 // Since we're computing the register pressure for a subset of the
2360 // instructions in a block, we need to set the tracker for each
2361 // instruction in the node-set. The tracker is set to the instruction
2362 // just after the one we're interested in.
2364 RecRPTracker.setPos(std::next(CurInstI));
2365
2366 RegPressureDelta RPDelta;
2367 ArrayRef<PressureChange> CriticalPSets;
2368 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2369 CriticalPSets,
2370 RecRegPressure.MaxSetPressure);
2371 if (RPDelta.Excess.isValid()) {
2372 LLVM_DEBUG(
2373 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2374 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2375 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2376 NS.setExceedPressure(SU);
2377 break;
2378 }
2379 RecRPTracker.recede();
2380 }
2381 }
2382}
2383
2384/// A heuristic to colocate node sets that have the same set of
2385/// successors.
2386void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2387 unsigned Colocate = 0;
2388 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2389 NodeSet &N1 = NodeSets[i];
2390 SmallSetVector<SUnit *, 8> S1;
2391 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2392 continue;
2393 for (int j = i + 1; j < e; ++j) {
2394 NodeSet &N2 = NodeSets[j];
2395 if (N1.compareRecMII(N2) != 0)
2396 continue;
2397 SmallSetVector<SUnit *, 8> S2;
2398 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2399 continue;
2400 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2401 N1.setColocate(++Colocate);
2402 N2.setColocate(Colocate);
2403 break;
2404 }
2405 }
2406 }
2407}
2408
2409/// Check if the existing node-sets are profitable. If not, then ignore the
2410/// recurrent node-sets, and attempt to schedule all nodes together. This is
2411/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2412/// then it's best to try to schedule all instructions together instead of
2413/// starting with the recurrent node-sets.
2414void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2415 // Look for loops with a large MII.
2416 if (MII < 17)
2417 return;
2418 // Check if the node-set contains only a simple add recurrence.
2419 for (auto &NS : NodeSets) {
2420 if (NS.getRecMII() > 2)
2421 return;
2422 if (NS.getMaxDepth() > MII)
2423 return;
2424 }
2425 NodeSets.clear();
2426 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2427}
2428
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  SmallPtrSet<SUnit *, 8> Visited;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    SmallSetVector<SUnit *, 8> N;
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N, DDG.get())) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Everything in this node set has now been accounted for.
    NodesAdded.insert_range(I);
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  SmallSetVector<SUnit *, 8> N;
  if (succ_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a node
  // in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N, DDG.get()))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new nodes sets with the connected nodes any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2492
2493/// Add the node to the set, and add all of its connected nodes to the set.
2494void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2495 SetVector<SUnit *> &NodesAdded) {
2496 NewSet.insert(SU);
2497 NodesAdded.insert(SU);
2498 for (auto &OE : DDG->getOutEdges(SU)) {
2499 SUnit *Successor = OE.getDst();
2500 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2501 NodesAdded.count(Successor) == 0)
2502 addConnectedNodes(Successor, NewSet, NodesAdded);
2503 }
2504 for (auto &IE : DDG->getInEdges(SU)) {
2505 SUnit *Predecessor = IE.getSrc();
2506 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2507 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2508 }
2509}
2510
2511/// Return true if Set1 contains elements in Set2. The elements in common
2512/// are returned in a different container.
2513static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2515 Result.clear();
2516 for (SUnit *SU : Set1) {
2517 if (Set2.count(SU) != 0)
2518 Result.insert(SU);
2519 }
2520 return !Result.empty();
2521}
2522
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Same initial node: fold NJ into NI, keeping the larger RecMII.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // erase() invalidates iterators at/after J; J now points at the next
        // element, and E must be refreshed. Do not advance J here.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
2543
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from J any node already present in the earlier set I.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // erase() invalidates iterators at/after J; refresh E and do not
        // advance J, which now points at the next element.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
2559
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
/// Within each set, nodes are picked greedily (by height or depth,
/// with zero-latency height/depth and mobility as tie-breakers),
/// alternating between top-down and bottom-up sweeps.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  SmallSetVector<SUnit *, 8> R;
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    SmallSetVector<SUnit *, 8> N;
    // Choose the initial direction and seed set R based on how this node-set
    // relates to the nodes ordered so far.
    if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N, DDG.get()) &&
               llvm::set_is_subset(N, Nodes)) {
      R.insert_range(N);
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // Single node-set: seed with the nodes that have no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Make unscheduled successors within this node-set ready.
          for (const auto &OE : DDG->getOutEdges(maxHeight)) {
            SUnit *SU = OE.getDst();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            if (OE.ignoreDependence(false))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &IE : DDG->getInEdges(maxHeight)) {
            SUnit *SU = IE.getSrc();
            if (!IE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // This sweep is exhausted; switch direction and reseed R with the
        // remaining predecessors inside this node-set.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // If this node exceeds register pressure, restart top-down from the
          // node-set's initial node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Make unscheduled predecessors within this node-set ready.
          for (const auto &IE : DDG->getInEdges(maxDepth)) {
            SUnit *SU = IE.getSrc();
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }

          // FIXME: The following loop-carried dependencies may also need to be
          // considered.
          // - Physical register dependencies (true-dependence and WAW).
          // - Memory dependencies.
          for (const auto &OE : DDG->getOutEdges(maxDepth)) {
            SUnit *SU = OE.getDst();
            if (!OE.isAntiDep())
              continue;
            if (Nodes.count(SU) == 0)
              continue;
            if (NodeOrder.contains(SU))
              continue;
            R.insert(SU);
          }
        }
        // This sweep is exhausted; switch direction and reseed R with the
        // remaining successors inside this node-set.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
          R.insert_range(N);
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2725
2726/// Process the nodes in the computed order and create the pipelined schedule
2727/// of the instructions, if possible. Return true if a schedule is found.
2728bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2729
2730 if (NodeOrder.empty()){
2731 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2732 return false;
2733 }
2734
2735 bool scheduleFound = false;
2736 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2737 if (LimitRegPressure) {
2738 HRPDetector =
2739 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2740 HRPDetector->init(RegClassInfo);
2741 }
2742 // Keep increasing II until a valid schedule is found.
2743 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2744 Schedule.reset();
2745 Schedule.setInitiationInterval(II);
2746 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2747
2750 do {
2751 SUnit *SU = *NI;
2752
2753 // Compute the schedule time for the instruction, which is based
2754 // upon the scheduled time for any predecessors/successors.
2755 int EarlyStart = INT_MIN;
2756 int LateStart = INT_MAX;
2757 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2758 LLVM_DEBUG({
2759 dbgs() << "\n";
2760 dbgs() << "Inst (" << SU->NodeNum << ") ";
2761 SU->getInstr()->dump();
2762 dbgs() << "\n";
2763 });
2764 LLVM_DEBUG(
2765 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2766
2767 if (EarlyStart > LateStart)
2768 scheduleFound = false;
2769 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2770 scheduleFound =
2771 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2772 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2773 scheduleFound =
2774 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2775 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2776 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2777 // When scheduling a Phi it is better to start at the late cycle and
2778 // go backwards. The default order may insert the Phi too far away
2779 // from its first dependence.
2780 // Also, do backward search when all scheduled predecessors are
2781 // loop-carried output/order dependencies. Empirically, there are also
2782 // cases where scheduling becomes possible with backward search.
2783 if (SU->getInstr()->isPHI() ||
2784 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2785 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2786 else
2787 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2788 } else {
2789 int FirstCycle = Schedule.getFirstCycle();
2790 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2791 FirstCycle + getASAP(SU) + II - 1, II);
2792 }
2793
2794 // Even if we find a schedule, make sure the schedule doesn't exceed the
2795 // allowable number of stages. We keep trying if this happens.
2796 if (scheduleFound)
2797 if (SwpMaxStages > -1 &&
2798 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2799 scheduleFound = false;
2800
2801 LLVM_DEBUG({
2802 if (!scheduleFound)
2803 dbgs() << "\tCan't schedule\n";
2804 });
2805 } while (++NI != NE && scheduleFound);
2806
2807 // If a schedule is found, validate it against the validation-only
2808 // dependencies.
2809 if (scheduleFound)
2810 scheduleFound = DDG->isValidSchedule(Schedule);
2811
2812 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2813 if (scheduleFound)
2814 scheduleFound =
2815 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2816
2817 // If a schedule is found, check if it is a valid schedule too.
2818 if (scheduleFound)
2819 scheduleFound = Schedule.isValidSchedule(this);
2820
2821 // If a schedule was found and the option is enabled, check if the schedule
2822 // might generate additional register spills/fills.
2823 if (scheduleFound && LimitRegPressure)
2824 scheduleFound =
2825 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2826 }
2827
2828 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2829 << " (II=" << Schedule.getInitiationInterval()
2830 << ")\n");
2831
2832 if (scheduleFound) {
2833 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2834 if (!scheduleFound)
2835 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2836 }
2837
2838 if (scheduleFound) {
2839 Schedule.finalizeSchedule(this);
2840 Pass.ORE->emit([&]() {
2841 return MachineOptimizationRemarkAnalysis(
2842 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2843 << "Schedule found with Initiation Interval: "
2844 << ore::NV("II", Schedule.getInitiationInterval())
2845 << ", MaxStageCount: "
2846 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2847 });
2848 } else
2849 Schedule.reset();
2850
2851 return scheduleFound && Schedule.getMaxStageCount() > 0;
2852}
2853
2855 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2856 Register Result;
2857 for (const MachineOperand &Use : MI.all_uses()) {
2858 Register Reg = Use.getReg();
2859 if (!Reg.isVirtual())
2860 return Register();
2861 if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2862 continue;
2863 if (Result)
2864 return Register();
2865 Result = Reg;
2866 }
2867 return Result;
2868}
2869
2870/// When Op is a value that is incremented recursively in a loop and there is a
2871/// unique instruction that increments it, returns true and sets Value.
2873 if (!Op.isReg() || !Op.getReg().isVirtual())
2874 return false;
2875
2876 Register OrgReg = Op.getReg();
2877 Register CurReg = OrgReg;
2878 const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2879 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2880
2881 const TargetInstrInfo *TII =
2882 LoopBB->getParent()->getSubtarget().getInstrInfo();
2883 const TargetRegisterInfo *TRI =
2884 LoopBB->getParent()->getSubtarget().getRegisterInfo();
2885
2886 MachineInstr *Phi = nullptr;
2887 MachineInstr *Increment = nullptr;
2888
2889 // Traverse definitions until it reaches Op or an instruction that does not
2890 // satisfy the condition.
2891 // Acceptable example:
2892 // bb.0:
2893 // %0 = PHI %3, %bb.0, ...
2894 // %2 = ADD %0, Value
2895 // ... = LOAD %2(Op)
2896 // %3 = COPY %2
2897 while (true) {
2898 if (!CurReg.isValid() || !CurReg.isVirtual())
2899 return false;
2900 MachineInstr *Def = MRI.getVRegDef(CurReg);
2901 if (Def->getParent() != LoopBB)
2902 return false;
2903
2904 if (Def->isCopy()) {
2905 // Ignore copy instructions unless they contain subregisters
2906 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
2907 return false;
2908 CurReg = Def->getOperand(1).getReg();
2909 } else if (Def->isPHI()) {
2910 // There must be just one Phi
2911 if (Phi)
2912 return false;
2913 Phi = Def;
2914 CurReg = getLoopPhiReg(*Def, LoopBB);
2915 } else if (TII->getIncrementValue(*Def, Value)) {
2916 // Potentially a unique increment
2917 if (Increment)
2918 // Multiple increments exist
2919 return false;
2920
2921 const MachineOperand *BaseOp;
2922 int64_t Offset;
2923 bool OffsetIsScalable;
2924 if (TII->getMemOperandWithOffset(*Def, BaseOp, Offset, OffsetIsScalable,
2925 TRI)) {
2926 // Pre/post increment instruction
2927 CurReg = BaseOp->getReg();
2928 } else {
2929 // If only one of the operands is defined within the loop, it is assumed
2930 // to be an incremented value.
2931 CurReg = findUniqueOperandDefinedInLoop(*Def);
2932 if (!CurReg.isValid())
2933 return false;
2934 }
2935 Increment = Def;
2936 } else {
2937 return false;
2938 }
2939 if (CurReg == OrgReg)
2940 break;
2941 }
2942
2943 if (!Phi || !Increment)
2944 return false;
2945
2946 return true;
2947}
2948
2949/// Return true if we can compute the amount the instruction changes
2950/// during each iteration. Set Delta to the amount of the change.
2951bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2952 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2953 const MachineOperand *BaseOp;
2954 int64_t Offset;
2955 bool OffsetIsScalable;
2956 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2957 return false;
2958
2959 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2960 if (OffsetIsScalable)
2961 return false;
2962
2963 if (!BaseOp->isReg())
2964 return false;
2965
2966 return findLoopIncrementValue(*BaseOp, Delta);
2967}
2968
2969/// Check if we can change the instruction to use an offset value from the
2970/// previous iteration. If so, return true and set the base and offset values
2971/// so that we can rewrite the load, if necessary.
2972/// v1 = Phi(v0, v3)
2973/// v2 = load v1, 0
2974/// v3 = post_store v1, 4, x
2975/// This function enables the load to be rewritten as v2 = load v3, 4.
2976bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2977 unsigned &BasePos,
2978 unsigned &OffsetPos,
2979 Register &NewBase,
2980 int64_t &Offset) {
2981 // Get the load instruction.
2982 if (TII->isPostIncrement(*MI))
2983 return false;
2984 unsigned BasePosLd, OffsetPosLd;
2985 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2986 return false;
2987 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2988
2989 // Look for the Phi instruction.
2990 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2991 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2992 if (!Phi || !Phi->isPHI())
2993 return false;
2994 // Get the register defined in the loop block.
2995 Register PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2996 if (!PrevReg)
2997 return false;
2998
2999 // Check for the post-increment load/store instruction.
3000 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3001 if (!PrevDef || PrevDef == MI)
3002 return false;
3003
3004 if (!TII->isPostIncrement(*PrevDef))
3005 return false;
3006
3007 unsigned BasePos1 = 0, OffsetPos1 = 0;
3008 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
3009 return false;
3010
3011 // Make sure that the instructions do not access the same memory location in
3012 // the next iteration.
3013 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
3014 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
3015 MachineInstr *NewMI = MF.CloneMachineInstr(MI);
3016 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
3017 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
3018 MF.deleteMachineInstr(NewMI);
3019 if (!Disjoint)
3020 return false;
3021
3022 // Set the return value once we determine that we return true.
3023 BasePos = BasePosLd;
3024 OffsetPos = OffsetPosLd;
3025 NewBase = PrevReg;
3026 Offset = StoreOffset;
3027 return true;
3028}
3029
/// Apply changes to the instruction if needed. The changes are need
/// to improve the scheduling and depend up on the final schedule.
                                  SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
  // Look up a recorded (replacement base register, per-stage offset) change
  // for this node.
                                InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    std::pair<Register, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    // Compare where the base register's loop definition and this use ended up
    // in the final schedule.
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    // Only rewrite when the use is scheduled in an earlier stage than the
    // definition of its base register.
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // Def appears earlier within the cycle order: switch to the recorded
        // base register and account for one fewer stage of offset.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      // Scale the recorded per-stage offset by the stage distance.
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Replace the instruction on the SUnit and remember the substitution.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
3065
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Guard against Phi cycles: stop if we revisit a Phi.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs starting at index 1; follow
    // the incoming value that arrives from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
3083
/// Return false if there is no overlap between the region accessed by BaseMI in
/// an iteration and the region accessed by OtherMI in subsequent iterations.
    const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
  int DeltaB, DeltaO, Delta;
  // Both accesses must have a computable, identical per-iteration stride;
  // otherwise conservatively assume they may overlap.
  if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
      DeltaB != DeltaO)
    return true;
  Delta = DeltaB;

  const MachineOperand *BaseOpB, *BaseOpO;
  int64_t OffsetB, OffsetO;
  bool OffsetBIsScalable, OffsetOIsScalable;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (!TII->getMemOperandWithOffset(*BaseMI, BaseOpB, OffsetB,
                                    OffsetBIsScalable, TRI) ||
      !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
                                    OffsetOIsScalable, TRI))
    return true;

  // Scalable offsets cannot be compared arithmetically below.
  if (OffsetBIsScalable || OffsetOIsScalable)
    return true;

  if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
    // Pass cases with different base operands but same initial values.
    // Typically for when pre/post increment is used.

    if (!BaseOpB->isReg() || !BaseOpO->isReg())
      return true;
    Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
    if (!RegB.isVirtual() || !RegO.isVirtual())
      return true;

    // Both bases must be loop Phis whose initial values are produced by
    // identical instructions.
    MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
    MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
    if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
      return true;

    Register InitValB;
    Register LoopValB;
    Register InitValO;
    Register LoopValO;
    getPhiRegs(*DefB, BB, InitValB, LoopValB);
    getPhiRegs(*DefO, BB, InitValO, LoopValO);
    MachineInstr *InitDefB = MRI.getVRegDef(InitValB);
    MachineInstr *InitDefO = MRI.getVRegDef(InitValO);

    if (!InitDefB->isIdenticalTo(*InitDefO))
      return true;
  }

  // NOTE(review): only the first memory operand of each instruction is
  // inspected here — assumes a single memoperand; confirm for targets with
  // multi-memoperand instructions.
  LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
  LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
    return true;

  LLVM_DEBUG({
    dbgs() << "Overlap check:\n";
    dbgs() << "  BaseMI: ";
    BaseMI->dump();
    dbgs() << "    Base + " << OffsetB << " + I * " << Delta
           << ", Len: " << AccessSizeB.getValue() << "\n";
    dbgs() << "  OtherMI: ";
    OtherMI->dump();
    dbgs() << "    Base + " << OffsetO << " + I * " << Delta
           << ", Len: " << AccessSizeO.getValue() << "\n";
  });

  // Excessive overlap may be detected in strided patterns.
  // For example, the memory addresses of the store and the load in
  //   for (i=0; i<n; i+=2) a[i+1] = a[i];
  // are assumed to overlap.
  if (Delta < 0) {
    // Addresses decrease each iteration: compare the low end of BaseMI's
    // access against the high end of OtherMI's next-iteration access.
    int64_t BaseMinAddr = OffsetB;
    int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
    if (BaseMinAddr > OhterNextIterMaxAddr) {
      LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
      return false;
    }
  } else {
    // Addresses increase (or stay fixed): compare the high end of BaseMI's
    // access against the low end of OtherMI's next-iteration access.
    int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
    int64_t OtherNextIterMinAddr = OffsetO + Delta;
    if (BaseMaxAddr < OtherNextIterMinAddr) {
      LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  Result: Overlap\n");
  return true;
}
3177
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
    const SwingSchedulerDDGEdge &Edge) const {
  // Only order and output dependences can be loop carried; artificial edges
  // and edges into boundary nodes never are.
  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
      Edge.getDst()->isBoundaryNode())
    return false;

    // NOTE(review): the guarding condition for this early return is not
    // visible in this extract — confirm against the upstream source.
    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Edge.isOutputDep())
    return true;

  MachineInstr *SI = Edge.getSrc()->getInstr();
  MachineInstr *DI = Edge.getDst()->getInstr();
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Non-memory instructions cannot carry a memory dependence.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  return mayOverlapInLaterIter(DI, SI);
}
3211
3212void SwingSchedulerDAG::postProcessDAG() {
3213 for (auto &M : Mutations)
3214 M->apply(this);
3215}
3216
3217/// Try to schedule the node at the specified StartCycle and continue
3218/// until the node is schedule or the EndCycle is reached. This function
3219/// returns true if the node is scheduled. This routine may search either
3220/// forward or backward for a place to insert the instruction based upon
3221/// the relative values of StartCycle and EndCycle.
3222bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
3223 bool forward = true;
3224 LLVM_DEBUG({
3225 dbgs() << "Trying to insert node between " << StartCycle << " and "
3226 << EndCycle << " II: " << II << "\n";
3227 });
3228 if (StartCycle > EndCycle)
3229 forward = false;
3230
3231 // The terminating condition depends on the direction.
3232 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
3233 for (int curCycle = StartCycle; curCycle != termCycle;
3234 forward ? ++curCycle : --curCycle) {
3235
3236 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
3237 ProcItinResources.canReserveResources(*SU, curCycle)) {
3238 LLVM_DEBUG({
3239 dbgs() << "\tinsert at cycle " << curCycle << " ";
3240 SU->getInstr()->dump();
3241 });
3242
3243 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
3244 ProcItinResources.reserveResources(*SU, curCycle);
3245 ScheduledInstrs[curCycle].push_back(SU);
3246 InstrToCycle.insert(std::make_pair(SU, curCycle));
3247 if (curCycle > LastCycle)
3248 LastCycle = curCycle;
3249 if (curCycle < FirstCycle)
3250 FirstCycle = curCycle;
3251 return true;
3252 }
3253 LLVM_DEBUG({
3254 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3255 SU->getInstr()->dump();
3256 });
3257 }
3258 return false;
3259}
3260
// Return the cycle of the earliest scheduled instruction in the chain.
                                     const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  // Walk backwards through order/output edges, taking the minimum scheduled
  // cycle over all reachable, already-scheduled predecessors.
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSrc();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    // Unscheduled nodes contribute nothing.
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &IE : DDG->getInEdges(PrevSU))
      if (IE.isOrderDep() || IE.isOutputDep())
        Worklist.push_back(IE);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
3284
// Return the cycle of the latest scheduled instruction in the chain.
                                   const SwingSchedulerDDG *DDG) {
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  // Walk forwards through order/output edges, taking the maximum scheduled
  // cycle over all reachable, already-scheduled successors.
  while (!Worklist.empty()) {
    const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getDst();
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    // Unscheduled nodes contribute nothing.
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &OE : DDG->getOutEdges(SuccSU))
      if (OE.isOrderDep() || OE.isOutputDep())
        Worklist.push_back(OE);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
3308
/// If an instruction has a use that spans multiple iterations, then
/// return the Phi it feeds. These instructions are characterized by having a
/// back-edge to a Phi, which contains a reference to another Phi.
  // Look for an anti-dependence predecessor that is a Phi which itself feeds
  // another Phi through a data edge.
  for (auto &P : SU->Preds)
    if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
3320
3321/// Compute the scheduling start slot for the instruction. The start slot
3322/// depends on any predecessor or successor nodes scheduled already.
3323void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
3324 int II, SwingSchedulerDAG *DAG) {
3325 const SwingSchedulerDDG *DDG = DAG->getDDG();
3326
3327 // Iterate over each instruction that has been scheduled already. The start
3328 // slot computation depends on whether the previously scheduled instruction
3329 // is a predecessor or successor of the specified instruction.
3330 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3331 for (SUnit *I : getInstructions(cycle)) {
3332 for (const auto &IE : DDG->getInEdges(SU)) {
3333 if (IE.getSrc() == I) {
3334 // FIXME: Add reverse edge to `DDG` instead of calling
3335 // `isLoopCarriedDep`
3336 if (DAG->isLoopCarriedDep(IE)) {
3337 int End = earliestCycleInChain(IE, DDG) + (II - 1);
3338 *MinLateStart = std::min(*MinLateStart, End);
3339 }
3340 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3341 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
3342 }
3343 }
3344
3345 for (const auto &OE : DDG->getOutEdges(SU)) {
3346 if (OE.getDst() == I) {
3347 // FIXME: Add reverse edge to `DDG` instead of calling
3348 // `isLoopCarriedDep`
3349 if (DAG->isLoopCarriedDep(OE)) {
3350 int Start = latestCycleInChain(OE, DDG) + 1 - II;
3351 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
3352 }
3353 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3354 *MinLateStart = std::min(*MinLateStart, LateStart);
3355 }
3356 }
3357
3358 SUnit *BE = multipleIterations(I, DAG);
3359 for (const auto &Dep : SU->Preds) {
3360 // For instruction that requires multiple iterations, make sure that
3361 // the dependent instruction is not scheduled past the definition.
3362 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3363 !SU->isPred(I))
3364 *MinLateStart = std::min(*MinLateStart, cycle);
3365 }
3366 }
3367 }
3368}
3369
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
    std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Flags describing where SU must sit relative to the instructions already
  // in Insts, and the positions of the conflicting use/def.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);
  const SwingSchedulerDDG *DDG = SSD->getDDG();

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If this operand is the base register of a rewritten access, compare
      // against the replacement base register instead.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (Register NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      // Classify the relationship between SU's operand and the instruction
      // already placed at position Pos, taking stages into account.
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &OE : DDG->getOutEdges(SU)) {
      if (OE.getDst() != *I)
        continue;
      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti/Output deps,
      // so may have nodes in same cycle with Anti/Output dependent on HW regs.
      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
               stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &IE : DDG->getInEdges(SU)) {
      if (IE.getSrc() != *I)
        continue;
      if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
          stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    // Remove both conflicting instructions (highest index first so the lower
    // index stays valid), then re-insert all three recursively.
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3502
/// Return true if the scheduled Phi has a loop carried operand.
                             MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  Register InitVal;
  Register LoopVal;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  // Find the in-loop producer of the Phi's loop-carried value. If it is not
  // part of the schedule, or is itself a Phi, conservatively report carried.
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  // Carried if the producer runs after the Phi in the cycle order, or in the
  // same or an earlier stage.
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3525
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
                                        MachineInstr *Def,
                                        MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Def carries the dependence if it defines the register the Phi reads on
  // the next iteration.
  Register LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3552
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
    SUnit *SU, const SwingSchedulerDDG *DDG) const {
  // NOTE(review): as visible here this returns false whenever any in-edge
  // source is already scheduled, without testing the edge kind — confirm the
  // edge filtering against the upstream source.
  for (const auto &IE : DDG->getInEdges(SU))
    if (InstrToCycle.count(IE.getSrc()))
      return false;
  return true;
}
3562
/// Determine transitive dependences of unpipelineable instructions
  SmallPtrSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with nodes the target explicitly refuses to pipeline.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  const SwingSchedulerDDG *DDG = SSD->getDDG();
  // Transitively mark every predecessor of an unpipelineable node.
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    for (const auto &IE : DDG->getInEdges(SU))
      Worklist.push_back(IE.getSrc());

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (const auto &OE : DDG->getOutEdges(SU))
      if (OE.getDistance() == 1)
        Worklist.push_back(OE.getDst());
  }
  return DoNotPipeline;
}
3591
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0.  If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Nodes that may be pipelined, or already in stage 0, keep their cycle.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
      if (IE.getDistance() == 0)
        NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);

    // To preserve previous behavior and prevent regression
    // FIXME: Remove if this doesn't have significant impact on performance
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
      if (OE.getDistance() == 1)
        NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);

    // Move the node to its new cycle in both maps.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }

    // We traverse the SUs in the order of the original basic block. Computing
    // NewCycle in this order normally works fine because all dependencies
    // (except for loop-carried dependencies) don't violate the original order.
    // However, an artificial dependency (e.g., added by CopyToPhiMutation) can
    // break it. That is, there may be exist an artificial dependency from
    // bottom to top. In such a case, NewCycle may become too large to be
    // scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
    // following case:
    //
    //             |  Inst0 <-+
    //   SU order  |          | artificial dep
    //             |  Inst1 --+
    //             v
    //
    // If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
    // Inst0 must be greater than or equal to N so that Inst0 is not be
    // scheduled at Stage 0. In such cases, we reject this schedule at this
    // time.
    // FIXME: The reason for this is the existence of artificial dependencies
    // that are contradict to the original SU order. If ignoring artificial
    // dependencies does not affect correctness, then it is better to ignore
    // them.
    if (FirstCycle + InitiationInterval <= NewCycle)
      return false;

    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3660
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid.  (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    // Every physical-register use must be in the same stage as the def, and
    // strictly after it in the cycle order.
    for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
      SUnit *Dst = OE.getDst();
      if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
        if (OE.getReg().isPhysical()) {
          if (stageScheduled(Dst) != StageDef)
            return false;
          if (InstrToCycle[Dst] <= CycleDef)
            return false;
        }
    }
  }
  return true;
}
3689
3690/// A property of the node order in swing-modulo-scheduling is
3691/// that for nodes outside circuits the following holds:
3692/// none of them is scheduled after both a successor and a
3693/// predecessor.
3694/// The method below checks whether the property is met.
3695/// If not, debug information is printed and statistics information updated.
3696/// Note that we do not use an assert statement.
3697/// The reason is that although an invalid node order may prevent
3698/// the pipeliner from finding a pipelined schedule for arbitrary II,
3699/// it does not lead to the generation of incorrect code.
3700void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3701
3702 // a sorted vector that maps each SUnit to its index in the NodeOrder
3703 typedef std::pair<SUnit *, unsigned> UnitIndex;
3704 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3705
3706 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3707 Indices.push_back(std::make_pair(NodeOrder[i], i));
3708
3709 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3710 return std::get<0>(i1) < std::get<0>(i2);
3711 };
3712
3713 // sort, so that we can perform a binary search
3714 llvm::sort(Indices, CompareKey);
3715
3716 bool Valid = true;
3717 (void)Valid;
3718 // for each SUnit in the NodeOrder, check whether
3719 // it appears after both a successor and a predecessor
3720 // of the SUnit. If this is the case, and the SUnit
3721 // is not part of circuit, then the NodeOrder is not
3722 // valid.
3723 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3724 SUnit *SU = NodeOrder[i];
3725 unsigned Index = i;
3726
3727 bool PredBefore = false;
3728 bool SuccBefore = false;
3729
3730 SUnit *Succ;
3731 SUnit *Pred;
3732 (void)Succ;
3733 (void)Pred;
3734
3735 for (const auto &IE : DDG->getInEdges(SU)) {
3736 SUnit *PredSU = IE.getSrc();
3737 unsigned PredIndex = std::get<1>(
3738 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3739 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3740 PredBefore = true;
3741 Pred = PredSU;
3742 break;
3743 }
3744 }
3745
3746 for (const auto &OE : DDG->getOutEdges(SU)) {
3747 SUnit *SuccSU = OE.getDst();
3748 // Do not process a boundary node, it was not included in NodeOrder,
3749 // hence not in Indices either, call to std::lower_bound() below will
3750 // return Indices.end().
3751 if (SuccSU->isBoundaryNode())
3752 continue;
3753 unsigned SuccIndex = std::get<1>(
3754 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3755 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3756 SuccBefore = true;
3757 Succ = SuccSU;
3758 break;
3759 }
3760 }
3761
3762 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3763 // instructions in circuits are allowed to be scheduled
3764 // after both a successor and predecessor.
3765 bool InCircuit = llvm::any_of(
3766 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3767 if (InCircuit)
3768 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3769 else {
3770 Valid = false;
3771 NumNodeOrderIssues++;
3772 LLVM_DEBUG(dbgs() << "Predecessor ");
3773 }
3774 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3775 << " are scheduled before node " << SU->NodeNum
3776 << "\n");
3777 }
3778 }
3779
3780 LLVM_DEBUG({
3781 if (!Valid)
3782 dbgs() << "Invalid node order found!\n";
3783 });
3784}
3785
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  Register OverlapReg;
  Register NewBaseReg;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            // Clone the instruction, switch it to the post-incremented base
            // p', and subtract the recorded increment from the offset.
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // Reset the search for the next overlapping pair.
        OverlapReg = Register();
        NewBaseReg = Register();
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3837
3838std::deque<SUnit *>
3840 const std::deque<SUnit *> &Instrs) const {
3841 std::deque<SUnit *> NewOrderPhi;
3842 for (SUnit *SU : Instrs) {
3843 if (SU->getInstr()->isPHI())
3844 NewOrderPhi.push_back(SU);
3845 }
3846 std::deque<SUnit *> NewOrderI;
3847 for (SUnit *SU : Instrs) {
3848 if (!SU->getInstr()->isPHI())
3849 orderDependence(SSD, SU, NewOrderI);
3850 }
3851 llvm::append_range(NewOrderPhi, NewOrderI);
3852 return NewOrderPhi;
3853}
3854
3855/// After the schedule has been formed, call this function to combine
3856/// the instructions from the different stages/cycles. That is, this
3857/// function creates a schedule that represents a single iteration.
3859 // Move all instructions to the first stage from later stages.
3860 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3861 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3862 ++stage) {
3863 std::deque<SUnit *> &cycleInstrs =
3864 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3865 for (SUnit *SU : llvm::reverse(cycleInstrs))
3866 ScheduledInstrs[cycle].push_front(SU);
3867 }
3868 }
3869
3870 // Erase all the elements in the later stages. Only one iteration should
3871 // remain in the scheduled list, and it contains all the instructions.
3872 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3873 ScheduledInstrs.erase(cycle);
3874
3875 // Change the registers in instruction as specified in the InstrChanges
3876 // map. We need to use the new registers to create the correct order.
3877 for (const SUnit &SU : SSD->SUnits)
3878 SSD->applyInstrChange(SU.getInstr(), *this);
3879
3880 // Reorder the instructions in each cycle to fix and improve the
3881 // generated code.
3882 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3883 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3884 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3885 SSD->fixupRegisterOverlaps(cycleInstrs);
3886 }
3887
3888 LLVM_DEBUG(dump(););
3889}
3890
3892 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3893 << " depth " << MaxDepth << " col " << Colocate << "\n";
3894 for (const auto &I : Nodes)
3895 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3896 os << "\n";
3897}
3898
3899#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3900/// Print the schedule information to the given output.
3902 // Iterate over each cycle.
3903 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3904 // Iterate over each instruction in the cycle.
3905 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3906 for (SUnit *CI : cycleInstrs->second) {
3907 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3908 os << "(" << CI->NodeNum << ") ";
3909 CI->getInstr()->print(os);
3910 os << "\n";
3911 }
3912 }
3913}
3914
3915/// Utility function used for debugging to print the schedule.
3918
3919void ResourceManager::dumpMRT() const {
3920 LLVM_DEBUG({
3921 if (UseDFA)
3922 return;
3923 std::stringstream SS;
3924 SS << "MRT:\n";
3925 SS << std::setw(4) << "Slot";
3926 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3927 SS << std::setw(3) << I;
3928 SS << std::setw(7) << "#Mops"
3929 << "\n";
3930 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3931 SS << std::setw(4) << Slot;
3932 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3933 SS << std::setw(3) << MRT[Slot][I];
3934 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3935 }
3936 dbgs() << SS.str();
3937 });
3938}
3939#endif
3940
3942 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3943 unsigned ProcResourceID = 0;
3944
3945 // We currently limit the resource kinds to 64 and below so that we can use
3946 // uint64_t for Masks
3947 assert(SM.getNumProcResourceKinds() < 64 &&
3948 "Too many kinds of resources, unsupported");
3949 // Create a unique bitmask for every processor resource unit.
3950 // Skip resource at index 0, since it always references 'InvalidUnit'.
3951 Masks.resize(SM.getNumProcResourceKinds());
3952 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3953 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3954 if (Desc.SubUnitsIdxBegin)
3955 continue;
3956 Masks[I] = 1ULL << ProcResourceID;
3957 ProcResourceID++;
3958 }
3959 // Create a unique bitmask for every processor resource group.
3960 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3961 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3962 if (!Desc.SubUnitsIdxBegin)
3963 continue;
3964 Masks[I] = 1ULL << ProcResourceID;
3965 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3966 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3967 ProcResourceID++;
3968 }
3969 LLVM_DEBUG({
3970 if (SwpShowResMask) {
3971 dbgs() << "ProcResourceDesc:\n";
3972 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3973 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3974 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3975 ProcResource->Name, I, Masks[I],
3976 ProcResource->NumUnits);
3977 }
3978 dbgs() << " -----------------\n";
3979 }
3980 });
3981}
3982
3984 LLVM_DEBUG({
3985 if (SwpDebugResource)
3986 dbgs() << "canReserveResources:\n";
3987 });
3988 if (UseDFA)
3989 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3990 ->canReserveResources(&SU.getInstr()->getDesc());
3991
3992 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3993 if (!SCDesc->isValid()) {
3994 LLVM_DEBUG({
3995 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3996 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3997 });
3998 return true;
3999 }
4000
4001 reserveResources(SCDesc, Cycle);
4002 bool Result = !isOverbooked();
4003 unreserveResources(SCDesc, Cycle);
4004
4005 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
4006 return Result;
4007}
4008
4009void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
4010 LLVM_DEBUG({
4011 if (SwpDebugResource)
4012 dbgs() << "reserveResources:\n";
4013 });
4014 if (UseDFA)
4015 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
4016 ->reserveResources(&SU.getInstr()->getDesc());
4017
4018 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4019 if (!SCDesc->isValid()) {
4020 LLVM_DEBUG({
4021 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
4022 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
4023 });
4024 return;
4025 }
4026
4027 reserveResources(SCDesc, Cycle);
4028
4029 LLVM_DEBUG({
4030 if (SwpDebugResource) {
4031 dumpMRT();
4032 dbgs() << "reserveResources: done!\n\n";
4033 }
4034 });
4035}
4036
4037void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
4038 int Cycle) {
4039 assert(!UseDFA);
4040 for (const MCWriteProcResEntry &PRE : make_range(
4041 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4042 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4043 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4044
4045 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4046 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
4047}
4048
4049void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
4050 int Cycle) {
4051 assert(!UseDFA);
4052 for (const MCWriteProcResEntry &PRE : make_range(
4053 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
4054 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
4055 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
4056
4057 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
4058 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
4059}
4060
4061bool ResourceManager::isOverbooked() const {
4062 assert(!UseDFA);
4063 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
4064 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4065 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4066 if (MRT[Slot][I] > Desc->NumUnits)
4067 return true;
4068 }
4069 if (NumScheduledMops[Slot] > IssueWidth)
4070 return true;
4071 }
4072 return false;
4073}
4074
4075int ResourceManager::calculateResMIIDFA() const {
4076 assert(UseDFA);
4077
4078 // Sort the instructions by the number of available choices for scheduling,
4079 // least to most. Use the number of critical resources as the tie breaker.
4080 FuncUnitSorter FUS = FuncUnitSorter(*ST);
4081 for (SUnit &SU : DAG->SUnits)
4082 FUS.calcCriticalResources(*SU.getInstr());
4083 PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
4084 FuncUnitOrder(FUS);
4085
4086 for (SUnit &SU : DAG->SUnits)
4087 FuncUnitOrder.push(SU.getInstr());
4088
4090 Resources.push_back(
4091 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4092
4093 while (!FuncUnitOrder.empty()) {
4094 MachineInstr *MI = FuncUnitOrder.top();
4095 FuncUnitOrder.pop();
4096 if (TII->isZeroCost(MI->getOpcode()))
4097 continue;
4098
4099 // Attempt to reserve the instruction in an existing DFA. At least one
4100 // DFA is needed for each cycle.
4101 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4102 unsigned ReservedCycles = 0;
4103 auto *RI = Resources.begin();
4104 auto *RE = Resources.end();
4105 LLVM_DEBUG({
4106 dbgs() << "Trying to reserve resource for " << NumCycles
4107 << " cycles for \n";
4108 MI->dump();
4109 });
4110 for (unsigned C = 0; C < NumCycles; ++C)
4111 while (RI != RE) {
4112 if ((*RI)->canReserveResources(*MI)) {
4113 (*RI)->reserveResources(*MI);
4114 ++ReservedCycles;
4115 break;
4116 }
4117 RI++;
4118 }
4119 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4120 << ", NumCycles:" << NumCycles << "\n");
4121 // Add new DFAs, if needed, to reserve resources.
4122 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4124 << "NewResource created to reserve resources"
4125 << "\n");
4126 auto *NewResource = TII->CreateTargetScheduleState(*ST);
4127 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4128 NewResource->reserveResources(*MI);
4129 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
4130 }
4131 }
4132
4133 int Resmii = Resources.size();
4134 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4135 return Resmii;
4136}
4137
4139 if (UseDFA)
4140 return calculateResMIIDFA();
4141
4142 // Count each resource consumption and divide it by the number of units.
4143 // ResMII is the max value among them.
4144
4145 int NumMops = 0;
4146 SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4147 for (SUnit &SU : DAG->SUnits) {
4148 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
4149 continue;
4150
4151 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
4152 if (!SCDesc->isValid())
4153 continue;
4154
4155 LLVM_DEBUG({
4156 if (SwpDebugResource) {
4157 DAG->dumpNode(SU);
4158 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4159 << " WriteProcRes: ";
4160 }
4161 });
4162 NumMops += SCDesc->NumMicroOps;
4163 for (const MCWriteProcResEntry &PRE :
4164 make_range(STI->getWriteProcResBegin(SCDesc),
4165 STI->getWriteProcResEnd(SCDesc))) {
4166 LLVM_DEBUG({
4167 if (SwpDebugResource) {
4168 const MCProcResourceDesc *Desc =
4169 SM.getProcResource(PRE.ProcResourceIdx);
4170 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4171 }
4172 });
4173 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4174 }
4175 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4176 }
4177
4178 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
4179 LLVM_DEBUG({
4180 if (SwpDebugResource)
4181 dbgs() << "#Mops: " << NumMops << ", "
4182 << "IssueWidth: " << IssueWidth << ", "
4183 << "Cycles: " << Result << "\n";
4184 });
4185
4186 LLVM_DEBUG({
4187 if (SwpDebugResource) {
4188 std::stringstream SS;
4189 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
4190 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
4191 << "\n";
4192 dbgs() << SS.str();
4193 }
4194 });
4195 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4196 const MCProcResourceDesc *Desc = SM.getProcResource(I);
4197 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
4198 LLVM_DEBUG({
4199 if (SwpDebugResource) {
4200 std::stringstream SS;
4201 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
4202 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
4203 << std::setw(10) << Cycles << "\n";
4204 dbgs() << SS.str();
4205 }
4206 });
4207 if (Cycles > Result)
4208 Result = Cycles;
4209 }
4210 return Result;
4211}
4212
4214 InitiationInterval = II;
4215 DFAResources.clear();
4216 DFAResources.resize(II);
4217 for (auto &I : DFAResources)
4218 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4219 MRT.clear();
4220 MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4221 NumScheduledMops.clear();
4222 NumScheduledMops.resize(II);
4223}
4224
4225bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4226 if (Pred.isArtificial() || Dst->isBoundaryNode())
4227 return true;
4228 // Currently, dependence that is an anti-dependences but not a loop-carried is
4229 // also ignored. This behavior is preserved to prevent regression.
4230 // FIXME: Remove if this doesn't have significant impact on performance
4231 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
4232}
4233
4234SwingSchedulerDDG::SwingSchedulerDDGEdges &
4235SwingSchedulerDDG::getEdges(const SUnit *SU) {
4236 if (SU == EntrySU)
4237 return EntrySUEdges;
4238 if (SU == ExitSU)
4239 return ExitSUEdges;
4240 return EdgesVec[SU->NodeNum];
4241}
4242
4243const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4244SwingSchedulerDDG::getEdges(const SUnit *SU) const {
4245 if (SU == EntrySU)
4246 return EntrySUEdges;
4247 if (SU == ExitSU)
4248 return ExitSUEdges;
4249 return EdgesVec[SU->NodeNum];
4250}
4251
4252void SwingSchedulerDDG::addEdge(const SUnit *SU,
4253 const SwingSchedulerDDGEdge &Edge) {
4254 assert(!Edge.isValidationOnly() &&
4255 "Validation-only edges are not expected here.");
4256 auto &Edges = getEdges(SU);
4257 if (Edge.getSrc() == SU)
4258 Edges.Succs.push_back(Edge);
4259 else
4260 Edges.Preds.push_back(Edge);
4261}
4262
4263void SwingSchedulerDDG::initEdges(SUnit *SU) {
4264 for (const auto &PI : SU->Preds) {
4265 SwingSchedulerDDGEdge Edge(SU, PI, /*IsSucc=*/false,
4266 /*IsValidationOnly=*/false);
4267 addEdge(SU, Edge);
4268 }
4269
4270 for (const auto &SI : SU->Succs) {
4271 SwingSchedulerDDGEdge Edge(SU, SI, /*IsSucc=*/true,
4272 /*IsValidationOnly=*/false);
4273 addEdge(SU, Edge);
4274 }
4275}
4276
4277SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4278 SUnit *ExitSU, const LoopCarriedEdges &LCE)
4279 : EntrySU(EntrySU), ExitSU(ExitSU) {
4280 EdgesVec.resize(SUnits.size());
4281
4282 // Add non-loop-carried edges based on the DAG.
4283 initEdges(EntrySU);
4284 initEdges(ExitSU);
4285 for (auto &SU : SUnits)
4286 initEdges(&SU);
4287
4288 // Add loop-carried edges, which are not represented in the DAG.
4289 for (SUnit &SU : SUnits) {
4290 SUnit *Src = &SU;
4291 if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Src)) {
4292 SDep Base(Src, SDep::Barrier);
4293 Base.setLatency(1);
4294 for (SUnit *Dst : *OD) {
4295 SwingSchedulerDDGEdge Edge(Dst, Base, /*IsSucc=*/false,
4296 /*IsValidationOnly=*/true);
4297 Edge.setDistance(1);
4298 ValidationOnlyEdges.push_back(Edge);
4299 }
4300 }
4301 }
4302}
4303
4304const SwingSchedulerDDG::EdgesType &
4306 return getEdges(SU).Preds;
4307}
4308
4309const SwingSchedulerDDG::EdgesType &
4311 return getEdges(SU).Succs;
4312}
4313
4314/// Check if \p Schedule doesn't violate the validation-only dependencies.
4316 unsigned II = Schedule.getInitiationInterval();
4317
4318 auto ExpandCycle = [&](SUnit *SU) {
4319 int Stage = Schedule.stageScheduled(SU);
4320 int Cycle = Schedule.cycleScheduled(SU);
4321 return Cycle + (Stage * II);
4322 };
4323
4324 for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4325 SUnit *Src = Edge.getSrc();
4326 SUnit *Dst = Edge.getDst();
4327 if (!Src->isInstr() || !Dst->isInstr())
4328 continue;
4329 int CycleSrc = ExpandCycle(Src);
4330 int CycleDst = ExpandCycle(Dst);
4331 int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4332 if (CycleSrc > MaxLateStart) {
4333 LLVM_DEBUG({
4334 dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4335 << Dst->NodeNum << "\n";
4336 });
4337 return false;
4338 }
4339 }
4340 return true;
4341}
4342
/// Materialize a subset of the recorded loop-carried order dependences as
/// real barrier edges in the DAG so the scheduler respects them directly.
/// Only load->store pairs meeting the conservative conditions below get an
/// edge; the rest remain validation-only.
void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
                                    const TargetInstrInfo *TII) {
  for (SUnit &SU : SUnits) {
    SUnit *Src = &SU;
    if (auto *OrderDep = getOrderDepOrNull(Src)) {
      // Template edge: a barrier dependence with latency 1; the source is
      // patched per destination below.
      SDep Dep(Src, SDep::Barrier);
      Dep.setLatency(1);
      for (SUnit *Dst : *OrderDep) {
        SUnit *From = Src;
        SUnit *To = Dst;
        // Normalize so From precedes To in the original block order
        // (NodeNum follows program order within the loop body).
        if (From->NodeNum > To->NodeNum)
          std::swap(From, To);

        // Add a forward edge if the following conditions are met:
        //
        // - The instruction of the source node (FromMI) may read memory.
        // - The instruction of the target node (ToMI) may modify memory, but
        //   does not read it.
        // - Neither instruction is a global barrier.
        // - The load appears before the store in the original basic block.
        // - There are no barrier or store instructions between the two nodes.
        // - The target node is unreachable from the source node in the current
        //   DAG.
        //
        // TODO: These conditions are inherited from a previous implementation,
        // and some may no longer be necessary. For now, we conservatively
        // retain all of them to avoid regressions, but the logic could
        // potentially be simplified
        MachineInstr *FromMI = From->getInstr();
        MachineInstr *ToMI = To->getInstr();
        if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
            !TII->isGlobalMemoryObject(FromMI) &&
            !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
          SDep Pred = Dep;
          Pred.setSUnit(From);
          To->addPred(Pred);
        }
      }
    }
  }
}
4384
4386 const MachineRegisterInfo *MRI) const {
4387 const auto *Order = getOrderDepOrNull(SU);
4388
4389 if (!Order)
4390 return;
4391
4392 const auto DumpSU = [](const SUnit *SU) {
4393 std::ostringstream OSS;
4394 OSS << "SU(" << SU->NodeNum << ")";
4395 return OSS.str();
4396 };
4397
4398 dbgs() << " Loop carried edges from " << DumpSU(SU) << "\n"
4399 << " Order\n";
4400 for (SUnit *Dst : *Order)
4401 dbgs() << " " << DumpSU(Dst) << "\n";
4402}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
static void addEdge(SmallVectorImpl< LazyCallGraph::Edge > &Edges, DenseMap< LazyCallGraph::Node *, int > &EdgeIndexMap, LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
print mir2vec MIR2Vec Vocabulary Printer Pass
Definition MIR2Vec.cpp:598
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool hasPHICycleDFS(unsigned Reg, const DenseMap< unsigned, SmallVector< unsigned, 2 > > &PhiDeps, SmallSet< unsigned, 8 > &Visited, SmallSet< unsigned, 8 > &RecStack)
Depth-first search to detect cycles among PHI dependencies.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, Register &InitVal, Register &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static Register getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static bool findLoopIncrementValue(const MachineOperand &Op, int &Value)
When Op is a value that is incremented recursively in a loop and there is a unique instruction that i...
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static cl::opt< unsigned > SwpMaxNumStores("pipeliner-max-num-stores", cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden, cl::init(200))
A command line argument to limit the number of store instructions in the target basic block.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool hasPHICycle(const MachineBasicBlock *LoopHeader, const MachineRegisterInfo &MRI)
static cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst, BatchAAResults &BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const SwingSchedulerDAG *SSD)
Returns true if there is a loop-carried order dependency from Src to Dst.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Target-Independent Code Generator Pass Configuration Options pass.
Add loop-carried chain dependencies.
void computeDependencies()
The main function to compute loop-carried order-dependencies.
const BitVector & getLoopCarried(unsigned Idx) const
LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool skipFunction(const Function &F) const
Optional passes call this function to check whether the pass should be skipped.
Definition Pass.cpp:188
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isPostIncrement(const MachineInstr &MI) const override
Return true for post-incremented instructions.
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override
Create machine specific model for scheduling.
bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const override
For instructions with a base and offset, return the position of the base register and offset operands...
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
bool hasValue() const
TypeSize getValue() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
unsigned getSchedClass() const
Return the scheduling class for this instruction.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1450
LLVM_ABI StringRef getString() const
Definition Metadata.cpp:632
MachineInstrBundleIterator< const MachineInstr > const_iterator
iterator_range< iterator > phis()
Returns a range that iterates over the phis in the basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
bool runOnMachineFunction(MachineFunction &MF) override
The "main" function for implementing Swing Modulo Scheduling.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_instr_iterator< true, false, false, true > use_instr_iterator
use_instr_iterator/use_instr_begin/use_instr_end - Walk all uses of the specified register,...
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
void dump() const
Definition Pass.cpp:146
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
PointerIntPair - This class implements a pair of a pointer and small integer.
unsigned getPSet() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind
These are the different kinds of scheduling dependencies.
Definition ScheduleDAG.h:54
@ Order
Any other ordering dependency.
Definition ScheduleDAG.h:58
@ Anti
A register anti-dependence (aka WAR).
Definition ScheduleDAG.h:56
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Barrier
An unknown scheduling barrier.
Definition ScheduleDAG.h:71
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
void setSUnit(SUnit *SU)
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
DenseMap< int, std::deque< SUnit * > >::const_iterator const_sched_iterator
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
SmallPtrSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
unsigned NumPreds
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned NodeNum
Entry # of node in the node vector.
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
unsigned short Latency
Node latency.
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
bool hasPhysRegDefs
Has physreg defs that are being used.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
LLVM_ABI bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
LLVM_ABI void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
LLVM_ABI bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
const TargetRegisterInfo * TRI
Target processor register info.
SUnit EntrySU
Special node for the region entry.
MachineFunction & MF
Machine function.
SUnit ExitSU
Special node for the region exit.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void insert_range(Range &&R)
Definition SetVector.h:176
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:262
typename vector_type::const_iterator iterator
Definition SetVector.h:72
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
bool mayOverlapInLaterIter(const MachineInstr *BaseMI, const MachineInstr *OtherMI) const
Return false if there is no overlap between the region accessed by BaseMI in an iteration and the reg...
Register getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
Represents a dependence between two instructions.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
This class provides APIs to retrieve edges from/to an SUnit node, with a particular focus on loop-car...
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU, const LoopCarriedEdges &LCE)
const EdgesType & getInEdges(const SUnit *SU) const
bool isValidSchedule(const SMSchedule &Schedule) const
Check if Schedule doesn't violate the validation-only dependencies.
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
TargetInstrInfo - Interface to description of machine instruction set.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual bool enableMachinePipeliner() const
True if the subtarget should run MachinePipeliner.
virtual bool useDFAforSMS() const
Default to DFA for resource management, return false when target will use ProcResource in InstrSchedM...
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
virtual const InstrItineraryData * getInstrItineraryData() const
getInstrItineraryData - Returns instruction itinerary data for the target or specific subtarget.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Wrapper class representing a virtual register or register unit.
Definition Register.h:181
constexpr bool isVirtualReg() const
Definition Register.h:197
constexpr MCRegUnit asMCRegUnit() const
Definition Register.h:201
constexpr Register asVirtualReg() const
Definition Register.h:206
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ Valid
The data is already valid.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double e
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
std::set< NodeId > NodeSet
Definition RDFGraph.h:551
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2106
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2190
CycleInfo::CycleT Cycle
Definition CycleInfo.h:24
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
static int64_t computeDelta(SectionEntry *A, SectionEntry *B)
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn on window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
RegState getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Other
Any other memory.
Definition ModRef.h:68
cl::opt< bool > SwpEnableCopyToPhi
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2042
LLVM_ABI char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This class holds an SUnit corresponding to a memory operation and other information related to the in...
const Value * MemOpValue
The value of a memory operand.
SmallVector< const Value *, 2 > UnderlyingObjs
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const
int64_t MemOpOffset
The offset of a memory operand.
bool IsAllIdentified
True if all the underlying objects are identified.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
uint64_t FuncUnits
Bitmask representing a set of functional units.
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Represents loop-carried dependencies.
SmallSetVector< SUnit *, 8 > OrderDep
const OrderDep * getOrderDepOrNull(SUnit *Key) const
void modifySUnits(std::vector< SUnit > &SUnits, const TargetInstrInfo *TII)
Adds some edges to the original DAG that correspond to loop-carried dependencies.
void dump(SUnit *SU, const TargetRegisterInfo *TRI, const MachineRegisterInfo *MRI) const
Define a kind of processor resource that will be modeled by the scheduler.
Definition MCSchedule.h:36
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition MCSchedule.h:359
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.