LLVM 19.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
74#include "llvm/Config/llvm-config.h"
75#include "llvm/IR/Attributes.h"
76#include "llvm/IR/Function.h"
77#include "llvm/MC/LaneBitmask.h"
78#include "llvm/MC/MCInstrDesc.h"
81#include "llvm/Pass.h"
84#include "llvm/Support/Debug.h"
87#include <algorithm>
88#include <cassert>
89#include <climits>
90#include <cstdint>
91#include <deque>
92#include <functional>
93#include <iomanip>
94#include <iterator>
95#include <map>
96#include <memory>
97#include <sstream>
98#include <tuple>
99#include <utility>
100#include <vector>
101
using namespace llvm;

#define DEBUG_TYPE "pipeliner"

// Pass-wide statistics (reported with -stats): one success counter, one
// attempt counter, and one counter per distinct reason the pipeliner bails
// out on a loop.
STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
117
118/// A command line option to turn software pipelining on or off.
119static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
120 cl::desc("Enable Software Pipelining"));
121
122/// A command line option to enable SWP at -Os.
123static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
124 cl::desc("Enable SWP at Os."), cl::Hidden,
125 cl::init(false));
126
127/// A command line argument to limit minimum initial interval for pipelining.
128static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
129 cl::desc("Size limit for the MII."),
130 cl::Hidden, cl::init(27));
131
132/// A command line argument to force pipeliner to use specified initial
133/// interval.
134static cl::opt<int> SwpForceII("pipeliner-force-ii",
135 cl::desc("Force pipeliner to use specified II."),
136 cl::Hidden, cl::init(-1));
137
138/// A command line argument to limit the number of stages in the pipeline.
139static cl::opt<int>
140 SwpMaxStages("pipeliner-max-stages",
141 cl::desc("Maximum stages allowed in the generated scheduled."),
142 cl::Hidden, cl::init(3));
143
144/// A command line option to disable the pruning of chain dependences due to
145/// an unrelated Phi.
146static cl::opt<bool>
147 SwpPruneDeps("pipeliner-prune-deps",
148 cl::desc("Prune dependences between unrelated Phi nodes."),
149 cl::Hidden, cl::init(true));
150
151/// A command line option to disable the pruning of loop carried order
152/// dependences.
153static cl::opt<bool>
154 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
155 cl::desc("Prune loop carried order dependences."),
156 cl::Hidden, cl::init(true));
157
158#ifndef NDEBUG
159static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
160#endif
161
162static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
164 cl::desc("Ignore RecMII"));
165
166static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
167 cl::init(false));
168static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
169 cl::init(false));
170
172 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
173 cl::desc("Instead of emitting the pipelined code, annotate instructions "
174 "with the generated schedule for feeding into the "
175 "-modulo-schedule-test pass"));
176
178 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
179 cl::desc(
180 "Use the experimental peeling code generator for software pipelining"));
181
182static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
183 cl::desc("Range to search for II"),
184 cl::Hidden, cl::init(10));
185
186static cl::opt<bool>
187 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
188 cl::desc("Limit register pressure of scheduled loop"));
189
190static cl::opt<int>
191 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
192 cl::init(5),
193 cl::desc("Margin representing the unused percentage of "
194 "the register pressure limit"));
195
196static cl::opt<bool>
197 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
198 cl::desc("Use the MVE code generator for software pipelining"));
199
200namespace llvm {
201
202// A command line option to enable the CopyToPhi DAG mutation.
204 cl::init(true),
205 cl::desc("Enable CopyToPhi DAG Mutation"));
206
207/// A command line argument to force pipeliner to use specified issue
208/// width.
210 "pipeliner-force-issue-width",
211 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
212 cl::init(-1));
213
214/// A command line argument to set the window scheduling option.
217 cl::desc("Set how to use window scheduling algorithm."),
219 "Turn off window algorithm."),
221 "Use window algorithm after SMS algorithm fails."),
223 "Use window algorithm instead of SMS algorithm.")));
224
225} // end namespace llvm
226
227unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
228char MachinePipeliner::ID = 0;
229#ifndef NDEBUG
231#endif
233
235 "Modulo Software Pipelining", false, false)
241 "Modulo Software Pipelining", false, false)
242
243/// The "main" function for implementing Swing Modulo Scheduling.
244bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
245 if (skipFunction(mf.getFunction()))
246 return false;
247
248 if (!EnableSWP)
249 return false;
250
251 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
252 !EnableSWPOptSize.getPosition())
253 return false;
254
255 if (!mf.getSubtarget().enableMachinePipeliner())
256 return false;
257
258 // Cannot pipeline loops without instruction itineraries if we are using
259 // DFA for the pipeliner.
260 if (mf.getSubtarget().useDFAforSMS() &&
261 (!mf.getSubtarget().getInstrItineraryData() ||
262 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
263 return false;
264
265 MF = &mf;
266 MLI = &getAnalysis<MachineLoopInfo>();
267 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
268 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
269 TII = MF->getSubtarget().getInstrInfo();
270 RegClassInfo.runOnMachineFunction(*MF);
271
272 for (const auto &L : *MLI)
273 scheduleLoop(*L);
274
275 return false;
276}
277
278/// Attempt to perform the SMS algorithm on the specified loop. This function is
279/// the main entry point for the algorithm. The function identifies candidate
280/// loops, calculates the minimum initiation interval, and attempts to schedule
281/// the loop.
282bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
283 bool Changed = false;
284 for (const auto &InnerLoop : L)
285 Changed |= scheduleLoop(*InnerLoop);
286
287#ifndef NDEBUG
288 // Stop trying after reaching the limit (if any).
289 int Limit = SwpLoopLimit;
290 if (Limit >= 0) {
291 if (NumTries >= SwpLoopLimit)
292 return Changed;
293 NumTries++;
294 }
295#endif
296
297 setPragmaPipelineOptions(L);
298 if (!canPipelineLoop(L)) {
299 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
300 ORE->emit([&]() {
301 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
302 L.getStartLoc(), L.getHeader())
303 << "Failed to pipeline loop";
304 });
305
306 LI.LoopPipelinerInfo.reset();
307 return Changed;
308 }
309
310 ++NumTrytoPipeline;
311 if (useSwingModuloScheduler())
312 Changed = swingModuloScheduler(L);
313
314 if (useWindowScheduler(Changed))
315 Changed = runWindowScheduler(L);
316
317 LI.LoopPipelinerInfo.reset();
318 return Changed;
319}
320
321void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
322 // Reset the pragma for the next loop in iteration.
323 disabledByPragma = false;
324 II_setByPragma = 0;
325
326 MachineBasicBlock *LBLK = L.getTopBlock();
327
328 if (LBLK == nullptr)
329 return;
330
331 const BasicBlock *BBLK = LBLK->getBasicBlock();
332 if (BBLK == nullptr)
333 return;
334
335 const Instruction *TI = BBLK->getTerminator();
336 if (TI == nullptr)
337 return;
338
339 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
340 if (LoopID == nullptr)
341 return;
342
343 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
344 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
345
346 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
347 MDNode *MD = dyn_cast<MDNode>(MDO);
348
349 if (MD == nullptr)
350 continue;
351
352 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
353
354 if (S == nullptr)
355 continue;
356
357 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
358 assert(MD->getNumOperands() == 2 &&
359 "Pipeline initiation interval hint metadata should have two operands.");
361 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
362 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
363 } else if (S->getString() == "llvm.loop.pipeline.disable") {
364 disabledByPragma = true;
365 }
366 }
367}
368
369/// Return true if the loop can be software pipelined. The algorithm is
370/// restricted to loops with a single basic block. Make sure that the
371/// branch in the loop can be analyzed.
372bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
373 if (L.getNumBlocks() != 1) {
374 ORE->emit([&]() {
375 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
376 L.getStartLoc(), L.getHeader())
377 << "Not a single basic block: "
378 << ore::NV("NumBlocks", L.getNumBlocks());
379 });
380 return false;
381 }
382
383 if (disabledByPragma) {
384 ORE->emit([&]() {
385 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
386 L.getStartLoc(), L.getHeader())
387 << "Disabled by Pragma.";
388 });
389 return false;
390 }
391
392 // Check if the branch can't be understood because we can't do pipelining
393 // if that's the case.
394 LI.TBB = nullptr;
395 LI.FBB = nullptr;
396 LI.BrCond.clear();
397 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
398 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
399 NumFailBranch++;
400 ORE->emit([&]() {
401 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
402 L.getStartLoc(), L.getHeader())
403 << "The branch can't be understood";
404 });
405 return false;
406 }
407
408 LI.LoopInductionVar = nullptr;
409 LI.LoopCompare = nullptr;
411 if (!LI.LoopPipelinerInfo) {
412 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
413 NumFailLoop++;
414 ORE->emit([&]() {
415 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
416 L.getStartLoc(), L.getHeader())
417 << "The loop structure is not supported";
418 });
419 return false;
420 }
421
422 if (!L.getLoopPreheader()) {
423 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
424 NumFailPreheader++;
425 ORE->emit([&]() {
426 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
427 L.getStartLoc(), L.getHeader())
428 << "No loop preheader found";
429 });
430 return false;
431 }
432
433 // Remove any subregisters from inputs to phi nodes.
434 preprocessPhiNodes(*L.getHeader());
435 return true;
436}
437
438void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
440 SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
441
442 for (MachineInstr &PI : B.phis()) {
443 MachineOperand &DefOp = PI.getOperand(0);
444 assert(DefOp.getSubReg() == 0);
445 auto *RC = MRI.getRegClass(DefOp.getReg());
446
447 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
448 MachineOperand &RegOp = PI.getOperand(i);
449 if (RegOp.getSubReg() == 0)
450 continue;
451
452 // If the operand uses a subregister, replace it with a new register
453 // without subregisters, and generate a copy to the new register.
454 Register NewReg = MRI.createVirtualRegister(RC);
455 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
457 const DebugLoc &DL = PredB.findDebugLoc(At);
458 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
459 .addReg(RegOp.getReg(), getRegState(RegOp),
460 RegOp.getSubReg());
461 Slots.insertMachineInstrInMaps(*Copy);
462 RegOp.setReg(NewReg);
463 RegOp.setSubReg(0);
464 }
465 }
466}
467
468/// The SMS algorithm consists of the following main steps:
469/// 1. Computation and analysis of the dependence graph.
470/// 2. Ordering of the nodes (instructions).
471/// 3. Attempt to Schedule the loop.
472bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
473 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
474
475 SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
477
478 MachineBasicBlock *MBB = L.getHeader();
479 // The kernel should not include any terminator instructions. These
480 // will be added back later.
481 SMS.startBlock(MBB);
482
483 // Compute the number of 'real' instructions in the basic block by
484 // ignoring terminators.
485 unsigned size = MBB->size();
487 E = MBB->instr_end();
488 I != E; ++I, --size)
489 ;
490
491 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
492 SMS.schedule();
493 SMS.exitRegion();
494
495 SMS.finishBlock();
496 return SMS.hasNewSchedule();
497}
498
508}
509
510bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
511 MachineSchedContext Context;
512 Context.MF = MF;
513 Context.MLI = MLI;
514 Context.MDT = MDT;
515 Context.PassConfig = &getAnalysis<TargetPassConfig>();
516 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
517 Context.LIS = &getAnalysis<LiveIntervals>();
519 WindowScheduler WS(&Context, L);
520 return WS.run();
521}
522
523bool MachinePipeliner::useSwingModuloScheduler() {
524 // SwingModuloScheduler does not work when WindowScheduler is forced.
526}
527
528bool MachinePipeliner::useWindowScheduler(bool Changed) {
529 // WindowScheduler does not work when it is off or when SwingModuloScheduler
530 // is successfully scheduled.
533}
534
535void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
536 if (SwpForceII > 0)
537 MII = SwpForceII;
538 else if (II_setByPragma > 0)
539 MII = II_setByPragma;
540 else
541 MII = std::max(ResMII, RecMII);
542}
543
544void SwingSchedulerDAG::setMAX_II() {
545 if (SwpForceII > 0)
546 MAX_II = SwpForceII;
547 else if (II_setByPragma > 0)
548 MAX_II = II_setByPragma;
549 else
550 MAX_II = MII + SwpIISearchRange;
551}
552
553/// We override the schedule function in ScheduleDAGInstrs to implement the
554/// scheduling part of the Swing Modulo Scheduling algorithm.
556 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
557 buildSchedGraph(AA);
558 addLoopCarriedDependences(AA);
559 updatePhiDependences();
561 changeDependences();
562 postProcessDAG();
563 LLVM_DEBUG(dump());
564
565 NodeSetType NodeSets;
566 findCircuits(NodeSets);
567 NodeSetType Circuits = NodeSets;
568
569 // Calculate the MII.
570 unsigned ResMII = calculateResMII();
571 unsigned RecMII = calculateRecMII(NodeSets);
572
573 fuseRecs(NodeSets);
574
575 // This flag is used for testing and can cause correctness problems.
576 if (SwpIgnoreRecMII)
577 RecMII = 0;
578
579 setMII(ResMII, RecMII);
580 setMAX_II();
581
582 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
583 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
584
585 // Can't schedule a loop without a valid MII.
586 if (MII == 0) {
587 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
588 NumFailZeroMII++;
589 Pass.ORE->emit([&]() {
591 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
592 << "Invalid Minimal Initiation Interval: 0";
593 });
594 return;
595 }
596
597 // Don't pipeline large loops.
598 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
599 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
600 << ", we don't pipeline large loops\n");
601 NumFailLargeMaxMII++;
602 Pass.ORE->emit([&]() {
604 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
605 << "Minimal Initiation Interval too large: "
606 << ore::NV("MII", (int)MII) << " > "
607 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
608 << "Refer to -pipeliner-max-mii.";
609 });
610 return;
611 }
612
613 computeNodeFunctions(NodeSets);
614
615 registerPressureFilter(NodeSets);
616
617 colocateNodeSets(NodeSets);
618
619 checkNodeSets(NodeSets);
620
621 LLVM_DEBUG({
622 for (auto &I : NodeSets) {
623 dbgs() << " Rec NodeSet ";
624 I.dump();
625 }
626 });
627
628 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
629
630 groupRemainingNodes(NodeSets);
631
632 removeDuplicateNodes(NodeSets);
633
634 LLVM_DEBUG({
635 for (auto &I : NodeSets) {
636 dbgs() << " NodeSet ";
637 I.dump();
638 }
639 });
640
641 computeNodeOrder(NodeSets);
642
643 // check for node order issues
644 checkValidNodeOrder(Circuits);
645
646 SMSchedule Schedule(Pass.MF, this);
647 Scheduled = schedulePipeline(Schedule);
648
649 if (!Scheduled){
650 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
651 NumFailNoSchedule++;
652 Pass.ORE->emit([&]() {
654 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
655 << "Unable to find schedule";
656 });
657 return;
658 }
659
660 unsigned numStages = Schedule.getMaxStageCount();
661 // No need to generate pipeline if there are no overlapped iterations.
662 if (numStages == 0) {
663 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
664 NumFailZeroStage++;
665 Pass.ORE->emit([&]() {
667 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
668 << "No need to pipeline - no overlapped iterations in schedule.";
669 });
670 return;
671 }
672 // Check that the maximum stage count is less than user-defined limit.
673 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
674 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
675 << " : too many stages, abort\n");
676 NumFailLargeMaxStage++;
677 Pass.ORE->emit([&]() {
679 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
680 << "Too many stages in schedule: "
681 << ore::NV("numStages", (int)numStages) << " > "
682 << ore::NV("SwpMaxStages", SwpMaxStages)
683 << ". Refer to -pipeliner-max-stages.";
684 });
685 return;
686 }
687
688 Pass.ORE->emit([&]() {
690 Loop.getHeader())
691 << "Pipelined succesfully!";
692 });
693
694 // Generate the schedule as a ModuloSchedule.
695 DenseMap<MachineInstr *, int> Cycles, Stages;
696 std::vector<MachineInstr *> OrderedInsts;
697 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
698 ++Cycle) {
699 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
700 OrderedInsts.push_back(SU->getInstr());
701 Cycles[SU->getInstr()] = Cycle;
702 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
703 }
704 }
706 for (auto &KV : NewMIs) {
707 Cycles[KV.first] = Cycles[KV.second];
708 Stages[KV.first] = Stages[KV.second];
709 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
710 }
711
712 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
713 std::move(Stages));
715 assert(NewInstrChanges.empty() &&
716 "Cannot serialize a schedule with InstrChanges!");
718 MSTI.annotate();
719 return;
720 }
721 // The experimental code generator can't work if there are InstChanges.
722 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
723 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
724 MSE.expand();
725 } else if (MVECodeGen && NewInstrChanges.empty() &&
726 LoopPipelinerInfo->isMVEExpanderSupported() &&
728 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
729 MSE.expand();
730 } else {
731 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
732 MSE.expand();
733 MSE.cleanup();
734 }
735 ++NumPipelined;
736}
737
738/// Clean up after the software pipeliner runs.
740 for (auto &KV : NewMIs)
741 MF.deleteMachineInstr(KV.second);
742 NewMIs.clear();
743
744 // Call the superclass.
746}
747
748/// Return the register values for the operands of a Phi instruction.
749/// This function assume the instruction is a Phi.
751 unsigned &InitVal, unsigned &LoopVal) {
752 assert(Phi.isPHI() && "Expecting a Phi.");
753
754 InitVal = 0;
755 LoopVal = 0;
756 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
757 if (Phi.getOperand(i + 1).getMBB() != Loop)
758 InitVal = Phi.getOperand(i).getReg();
759 else
760 LoopVal = Phi.getOperand(i).getReg();
761
762 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
763}
764
765/// Return the Phi register value that comes the loop block.
766static unsigned getLoopPhiReg(const MachineInstr &Phi,
767 const MachineBasicBlock *LoopBB) {
768 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
769 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
770 return Phi.getOperand(i).getReg();
771 return 0;
772}
773
774/// Return true if SUb can be reached from SUa following the chain edges.
775static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
778 Worklist.push_back(SUa);
779 while (!Worklist.empty()) {
780 const SUnit *SU = Worklist.pop_back_val();
781 for (const auto &SI : SU->Succs) {
782 SUnit *SuccSU = SI.getSUnit();
783 if (SI.getKind() == SDep::Order) {
784 if (Visited.count(SuccSU))
785 continue;
786 if (SuccSU == SUb)
787 return true;
788 Worklist.push_back(SuccSU);
789 Visited.insert(SuccSU);
790 }
791 }
792 }
793 return false;
794}
795
796/// Return true if the instruction causes a chain between memory
797/// references before and after it.
799 return MI.isCall() || MI.mayRaiseFPException() ||
800 MI.hasUnmodeledSideEffects() ||
801 (MI.hasOrderedMemoryRef() &&
802 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
803}
804
805/// Return the underlying objects for the memory references of an instruction.
806/// This function calls the code in ValueTracking, but first checks that the
807/// instruction has a memory operand.
810 if (!MI->hasOneMemOperand())
811 return;
812 MachineMemOperand *MM = *MI->memoperands_begin();
813 if (!MM->getValue())
814 return;
815 getUnderlyingObjects(MM->getValue(), Objs);
816 for (const Value *V : Objs) {
817 if (!isIdentifiedObject(V)) {
818 Objs.clear();
819 return;
820 }
821 }
822}
823
824/// Add a chain edge between a load and store if the store can be an
825/// alias of the load on a subsequent iteration, i.e., a loop carried
826/// dependence. This code is very similar to the code in ScheduleDAGInstrs
827/// but that code doesn't create loop carried dependences.
828void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
832 for (auto &SU : SUnits) {
833 MachineInstr &MI = *SU.getInstr();
835 PendingLoads.clear();
836 else if (MI.mayLoad()) {
839 if (Objs.empty())
841 for (const auto *V : Objs) {
842 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
843 SUs.push_back(&SU);
844 }
845 } else if (MI.mayStore()) {
848 if (Objs.empty())
850 for (const auto *V : Objs) {
852 PendingLoads.find(V);
853 if (I == PendingLoads.end())
854 continue;
855 for (auto *Load : I->second) {
856 if (isSuccOrder(Load, &SU))
857 continue;
858 MachineInstr &LdMI = *Load->getInstr();
859 // First, perform the cheaper check that compares the base register.
860 // If they are the same and the load offset is less than the store
861 // offset, then mark the dependence as loop carried potentially.
862 const MachineOperand *BaseOp1, *BaseOp2;
863 int64_t Offset1, Offset2;
864 bool Offset1IsScalable, Offset2IsScalable;
865 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
866 Offset1IsScalable, TRI) &&
867 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
868 Offset2IsScalable, TRI)) {
869 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
870 Offset1IsScalable == Offset2IsScalable &&
871 (int)Offset1 < (int)Offset2) {
873 "What happened to the chain edge?");
874 SDep Dep(Load, SDep::Barrier);
875 Dep.setLatency(1);
876 SU.addPred(Dep);
877 continue;
878 }
879 }
880 // Second, the more expensive check that uses alias analysis on the
881 // base registers. If they alias, and the load offset is less than
882 // the store offset, the mark the dependence as loop carried.
883 if (!AA) {
884 SDep Dep(Load, SDep::Barrier);
885 Dep.setLatency(1);
886 SU.addPred(Dep);
887 continue;
888 }
889 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
890 MachineMemOperand *MMO2 = *MI.memoperands_begin();
891 if (!MMO1->getValue() || !MMO2->getValue()) {
892 SDep Dep(Load, SDep::Barrier);
893 Dep.setLatency(1);
894 SU.addPred(Dep);
895 continue;
896 }
897 if (MMO1->getValue() == MMO2->getValue() &&
898 MMO1->getOffset() <= MMO2->getOffset()) {
899 SDep Dep(Load, SDep::Barrier);
900 Dep.setLatency(1);
901 SU.addPred(Dep);
902 continue;
903 }
904 if (!AA->isNoAlias(
907 MMO2->getAAInfo()))) {
908 SDep Dep(Load, SDep::Barrier);
909 Dep.setLatency(1);
910 SU.addPred(Dep);
911 }
912 }
913 }
914 }
915 }
916}
917
918/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
919/// processes dependences for PHIs. This function adds true dependences
920/// from a PHI to a use, and a loop carried dependence from the use to the
921/// PHI. The loop carried dependence is represented as an anti dependence
922/// edge. This function also removes chain dependences between unrelated
923/// PHIs.
924void SwingSchedulerDAG::updatePhiDependences() {
925 SmallVector<SDep, 4> RemoveDeps;
927
928 // Iterate over each DAG node.
929 for (SUnit &I : SUnits) {
930 RemoveDeps.clear();
931 // Set to true if the instruction has an operand defined by a Phi.
932 unsigned HasPhiUse = 0;
933 unsigned HasPhiDef = 0;
934 MachineInstr *MI = I.getInstr();
935 // Iterate over each operand, and we process the definitions.
936 for (const MachineOperand &MO : MI->operands()) {
937 if (!MO.isReg())
938 continue;
939 Register Reg = MO.getReg();
940 if (MO.isDef()) {
941 // If the register is used by a Phi, then create an anti dependence.
943 UI = MRI.use_instr_begin(Reg),
944 UE = MRI.use_instr_end();
945 UI != UE; ++UI) {
946 MachineInstr *UseMI = &*UI;
947 SUnit *SU = getSUnit(UseMI);
948 if (SU != nullptr && UseMI->isPHI()) {
949 if (!MI->isPHI()) {
950 SDep Dep(SU, SDep::Anti, Reg);
951 Dep.setLatency(1);
952 I.addPred(Dep);
953 } else {
954 HasPhiDef = Reg;
955 // Add a chain edge to a dependent Phi that isn't an existing
956 // predecessor.
957 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
958 I.addPred(SDep(SU, SDep::Barrier));
959 }
960 }
961 }
962 } else if (MO.isUse()) {
963 // If the register is defined by a Phi, then create a true dependence.
965 if (DefMI == nullptr)
966 continue;
967 SUnit *SU = getSUnit(DefMI);
968 if (SU != nullptr && DefMI->isPHI()) {
969 if (!MI->isPHI()) {
970 SDep Dep(SU, SDep::Data, Reg);
971 Dep.setLatency(0);
972 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
973 &SchedModel);
974 I.addPred(Dep);
975 } else {
976 HasPhiUse = Reg;
977 // Add a chain edge to a dependent Phi that isn't an existing
978 // predecessor.
979 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
980 I.addPred(SDep(SU, SDep::Barrier));
981 }
982 }
983 }
984 }
985 // Remove order dependences from an unrelated Phi.
986 if (!SwpPruneDeps)
987 continue;
988 for (auto &PI : I.Preds) {
989 MachineInstr *PMI = PI.getSUnit()->getInstr();
990 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
991 if (I.getInstr()->isPHI()) {
992 if (PMI->getOperand(0).getReg() == HasPhiUse)
993 continue;
994 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
995 continue;
996 }
997 RemoveDeps.push_back(PI);
998 }
999 }
1000 for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
1001 I.removePred(RemoveDeps[i]);
1002 }
1003}
1004
1005/// Iterate over each DAG node and see if we can change any dependences
1006/// in order to reduce the recurrence MII.
1007void SwingSchedulerDAG::changeDependences() {
1008 // See if an instruction can use a value from the previous iteration.
1009 // If so, we update the base and offset of the instruction and change
1010 // the dependences.
1011 for (SUnit &I : SUnits) {
1012 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1013 int64_t NewOffset = 0;
1014 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1015 NewOffset))
1016 continue;
1017
1018 // Get the MI and SUnit for the instruction that defines the original base.
1019 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1021 if (!DefMI)
1022 continue;
1023 SUnit *DefSU = getSUnit(DefMI);
1024 if (!DefSU)
1025 continue;
1026 // Get the MI and SUnit for the instruction that defins the new base.
1027 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1028 if (!LastMI)
1029 continue;
1030 SUnit *LastSU = getSUnit(LastMI);
1031 if (!LastSU)
1032 continue;
1033
1034 if (Topo.IsReachable(&I, LastSU))
1035 continue;
1036
1037 // Remove the dependence. The value now depends on a prior iteration.
1039 for (const SDep &P : I.Preds)
1040 if (P.getSUnit() == DefSU)
1041 Deps.push_back(P);
1042 for (int i = 0, e = Deps.size(); i != e; i++) {
1043 Topo.RemovePred(&I, Deps[i].getSUnit());
1044 I.removePred(Deps[i]);
1045 }
1046 // Remove the chain dependence between the instructions.
1047 Deps.clear();
1048 for (auto &P : LastSU->Preds)
1049 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1050 Deps.push_back(P);
1051 for (int i = 0, e = Deps.size(); i != e; i++) {
1052 Topo.RemovePred(LastSU, Deps[i].getSUnit());
1053 LastSU->removePred(Deps[i]);
1054 }
1055
1056 // Add a dependence between the new instruction and the instruction
1057 // that defines the new base.
1058 SDep Dep(&I, SDep::Anti, NewBase);
1059 Topo.AddPred(LastSU, &I);
1060 LastSU->addPred(Dep);
1061
1062 // Remember the base and offset information so that we can update the
1063 // instruction during code generation.
1064 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1065 }
1066}
1067
/// Create an instruction stream that represents a single iteration and stage of
/// each instruction. This function differs from SMSchedule::finalizeSchedule in
/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
/// function is an approximation of SMSchedule::finalizeSchedule with all
/// non-const operations removed.
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,

  // Move all instructions to the first stage from the later stages.
  // After this loop, Instrs[Cycle] holds (front-to-back) the SUnits of every
  // stage that executes at that cycle of the kernel.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  // Within each cycle, order the instructions the way the final schedule
  // would, then record the flattened instruction order and the stage each
  // instruction was scheduled in.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}
1102
1103namespace {
1104
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Prefer the itinerary model when the target provides one.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // Otherwise fall back to the per-operand machine model: use the unit
    // count of the most constrained processor resource the class writes.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only stages that can issue to exactly one unit are "critical".
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == MFUs2)
      // Tie break: prefer the instruction whose unit is more heavily demanded.
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1208
/// Calculate the maximum register pressure of the scheduled instructions stream
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineFunction &MF;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets for the target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It doesn't depend on how the loop is scheduled, so it's enough
  // to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  // Dump Pressures to the debug stream as "[p0 p1 ...]".
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Dump the IDs of the pressure sets Reg belongs to.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to every pressure set Reg belongs to.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from every pressure set Reg belongs to.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is fixed one, for example, stack pointer
  bool isFixedRegister(Register Reg) const {
    return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined in the loop block.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isFixedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);

    // We assume fixed registers, such as stack pointer, are already in use.
    // Therefore subtracting the weight of the fixed registers from the limit of
    // each pressure set in advance.
    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
      for (const MCPhysReg Reg : *TRC)
        if (isFixedRegister(Reg))
          FixedRegs.insert(Reg);
    }

    LLVM_DEBUG({
      for (auto Reg : FixedRegs) {
        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
        const int *Sets = TRI->getRegUnitPressureSets(Reg);
        for (; *Sets != -1; Sets++) {
          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
        }
        dbgs() << "]\n";
      }
    });

    for (auto Reg : FixedRegs) {
      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
                        << "\n");
      auto PSetIter = MRI.getPressureSets(Reg);
      unsigned Weight = PSetIter.getWeight();
      for (; PSetIter.isValid(); ++PSetIter) {
        unsigned &Limit = PressureSetLimit[*PSetIter];
        assert(Limit >= Weight &&
               "register pressure limit must be greater than or equal weight");
        Limit -= Weight;
        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
                          << " (decreased by " << Weight << ")\n");
      }
    }
  }

  // There are two patterns of last-use.
  //   - by an instruction of the current iteration
  //   - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  //   - next iteration's phi instructions in i-th stage
  //   - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    //   - live-in one
    //   - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    // A larger score means the use executes later when the stages overlap;
    // a phi counts as one stage later than a non-phi of the same stage.
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto Ite = LastUseMI.find(Reg);
        if (Ite == LastUseMI.end()) {
          LastUseMI[Reg] = MI;
        } else {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          // Keep whichever use executes later.
          if (InstrScore(Orig) < InstrScore(New))
            LastUseMI[Reg] = New;
        }
      }
    }

    // Invert the map: for each instruction, the set of registers whose last
    // use it is.
    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlapping.
  //
  // An example of unrolled loop where #Stage is 4.
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Mark Reg live in the given iteration's set and bump the pressure,
    // unless it is already live there.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isFixedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill Reg in the given iteration's set and drop the pressure, unless it
    // was never live there (e.g. a live-in register).
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isFixedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        // Instructions of stage S only start executing at overlap step S.
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            // A phi's loop-carried operand belongs to the previous iteration.
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1577
1578} // end anonymous namespace
1579
/// Calculate the resource constrained minimum initiation interval for the
/// specified loop. We use the DFA to model the resources needed for
/// each instruction, and we ignore dependences. A different DFA is created
/// for each cycle that is required. When adding a new instruction, we attempt
/// to add it to each existing DFA, until a legal space is found. If the
/// instruction cannot be reserved in an existing DFA, we create a new one.
unsigned SwingSchedulerDAG::calculateResMII() {
  LLVM_DEBUG(dbgs() << "calculateResMII:\n");
  // The actual resource modeling is delegated to RM.
  return RM.calculateResMII();
}
1591
1592/// Calculate the recurrence-constrainted minimum initiation interval.
1593/// Iterate over each circuit. Compute the delay(c) and distance(c)
1594/// for each circuit. The II needs to satisfy the inequality
1595/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1596/// II that satisfies the inequality, and the RecMII is the maximum
1597/// of those values.
1598unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1599 unsigned RecMII = 0;
1600
1601 for (NodeSet &Nodes : NodeSets) {
1602 if (Nodes.empty())
1603 continue;
1604
1605 unsigned Delay = Nodes.getLatency();
1606 unsigned Distance = 1;
1607
1608 // ii = ceil(delay / distance)
1609 unsigned CurMII = (Delay + Distance - 1) / Distance;
1610 Nodes.setRecMII(CurMII);
1611 if (CurMII > RecMII)
1612 RecMII = CurMII;
1613 }
1614
1615 return RecMII;
1616}
1617
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
/// but we do this to find the circuits, and then change them back.
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
  // First collect all anti dependences; mutating the predecessor lists while
  // iterating over them would invalidate the iteration.
  for (SUnit &SU : SUnits) {
    for (SDep &Pred : SU.Preds)
      if (Pred.getKind() == SDep::Anti)
        DepsAdded.push_back(std::make_pair(&SU, Pred));
  }
  for (std::pair<SUnit *, SDep> &P : DepsAdded) {
    // Remove this anti dependency and add one in the reverse direction.
    SUnit *SU = P.first;
    SDep &D = P.second;
    SUnit *TargetSU = D.getSUnit();
    unsigned Reg = D.getReg();
    unsigned Lat = D.getLatency();
    SU->removePred(D);
    // Re-create the edge with the same register and latency, but pointing
    // the other way.
    SDep Dep(SU, SDep::Anti, Reg);
    Dep.setLatency(Lat);
    TargetSU->addPred(Dep);
  }
}
1640
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &SI : SUnits[i].Succs) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      // OutputDeps maps the head of an output-dependence chain to its current
      // tail; extending the chain collapses the intermediate entry.
      if (SI.getKind() == SDep::Output) {
        int N = SI.getSUnit()->NodeNum;
        int BackEdge = i;
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      // A back-edge is processed only if it goes to a Phi.
      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
        continue;
      int N = SI.getSUnit()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &PI : SUnits[i].Preds) {
      // Only loop-carried order deps from a load to this store qualify.
      if (!SUnits[i].getInstr()->mayStore() ||
          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
        continue;
      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
        int N = PI.getSUnit()->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1695
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node. This is the recursive step of Johnson's circuit-finding
/// algorithm: V is the current node, S the start node of the search, and
/// HasBackedge records whether the path so far already contains a back-edge.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Bail out once too many circuits have been found.
    if (NumPaths > MaxPaths)
      break;
    // Nodes below S belong to already-explored subgraphs.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node; record it unless the path
      // already contained a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
      F = true;
      ++NumPaths;
      break;
    } else if (!Blocked.test(W)) {
      // An edge to a node with a smaller topological index is a back-edge.
      if (circuit(W, S, NodeSets,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V: remember that unblocking any W must also
    // unblock V.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1735
/// Unblock a node in the circuit finding algorithm. Recursively unblocks
/// every node whose blocked status depended on U (recorded in B[U]).
void SwingSchedulerDAG::Circuits::unblock(int U) {
  Blocked.reset(U);
  while (!BU.empty()) {
    assert(SI != BU.end() && "Invalid B set.");
    SUnit *W = *SI;
    BU.erase(W);
    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
  }
}
1749
/// Identify all the elementary circuits in the dependence graph using
/// Johnson's circuit algorithm.
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
  // but we do this to find the circuits, and then change them back.

  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
  Cir.createAdjacencyStructure(this);
  // Run the circuit search once from every node.
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Cir.reset();
    Cir.circuit(i, i, NodeSets);
  }

  // Change the dependences back so that we've created a DAG again.
}
1768
// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
// is loop-carried to the USE in next iteration. This will help pipeliner avoid
// additional copies that are needed across iterations. An artificial dependence
// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.

// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
// PHI-------True-Dep------> USEOfPhi

// The mutation creates
// USEOfPHI -------Artificial-Dep---> SRCOfCopy

// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
// (since USE is a predecessor), implies, the COPY/REG_SEQUENCE is scheduled
// late to avoid additional copies across iterations. The possible scheduling
// order would be
// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.

void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        // Chained PHIs/REG_SEQUENCEs are followed transitively.
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
1848
1849/// Return true for DAG nodes that we ignore when computing the cost functions.
1850/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1851/// in the calculation of the ASAP, ALAP, etc functions.
1852static bool ignoreDependence(const SDep &D, bool isPred) {
1853 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1854 return true;
1855 return D.getKind() == SDep::Anti && isPred;
1856}
1857
/// Compute several functions need to order the nodes for scheduling.
///  ASAP - Earliest time to schedule a node.
///  ALAP - Latest time to schedule a node.
///  MOV - Mobility function, difference between ALAP and ASAP.
///  D - Depth of each node.
///  H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth in topological order, so every
  // predecessor's value is final before it is read.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &P : SU->Preds) {
      SUnit *pred = P.getSUnit();
      if (P.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
      if (ignoreDependence(P, true))
        continue;
      // A loop-carried dependence contributes its latency minus one II per
      // iteration of distance.
      asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
                                  getDistance(pred, SU, P) * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV in reverse topological order,
  // starting every node at the latest possible time (maxASAP).
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &S : SU->Succs) {
      SUnit *succ = S.getSUnit();
      if (succ->isBoundaryNode())
        continue;
      if (S.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
      if (ignoreDependence(S, true))
        continue;
      alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
                                  getDistance(SU, succ, S) * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
1934
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such predecessor exists.
                   const NodeSet *S = nullptr) {
  Preds.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Pred : SU->Preds) {
      // If a node-set S is given, only consider members of S.
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Pred, true))
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Preds.insert(Pred.getSUnit());
    }
    // Back-edges are predecessors with an anti-dependence.
    for (const SDep &Succ : SU->Succs) {
      if (Succ.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Preds.insert(Succ.getSUnit());
    }
  }
  return !Preds.empty();
}
1963
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such successor exists.
                   const NodeSet *S = nullptr) {
  Succs.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Succ : SU->Succs) {
      // If a node-set S is given, only consider members of S.
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Succ, false))
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Succs.insert(Succ.getSUnit());
    }
    // Back-edges are successors with an anti-dependence.
    for (const SDep &Pred : SU->Preds) {
      if (Pred.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Succs.insert(Pred.getSUnit());
    }
  }
  return !Succs.empty();
}
1991
1992/// Return true if there is a path from the specified node to any of the nodes
1993/// in DestNodes. Keep track and return the nodes in any path.
1994static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1995 SetVector<SUnit *> &DestNodes,
1996 SetVector<SUnit *> &Exclude,
1997 SmallPtrSet<SUnit *, 8> &Visited) {
1998 if (Cur->isBoundaryNode())
1999 return false;
2000 if (Exclude.contains(Cur))
2001 return false;
2002 if (DestNodes.contains(Cur))
2003 return true;
2004 if (!Visited.insert(Cur).second)
2005 return Path.contains(Cur);
2006 bool FoundPath = false;
2007 for (auto &SI : Cur->Succs)
2008 if (!ignoreDependence(SI, false))
2009 FoundPath |=
2010 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
2011 for (auto &PI : Cur->Preds)
2012 if (PI.getKind() == SDep::Anti)
2013 FoundPath |=
2014 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
2015 if (FoundPath)
2016 Path.insert(Cur);
2017 return FoundPath;
2018}
2019
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
                            NodeSet &NS) {
  // Collect every register (virtual, or the units of allocatable physical
  // registers) used by a non-phi instruction in the node-set.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
          Uses.insert(Unit);
    }
  }
  // Any non-dead def that is not used inside the node-set is live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.push_back(
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
2059
2060/// A heuristic to filter nodes in recurrent node-sets if the register
2061/// pressure of a set is too high.
2062void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2063 for (auto &NS : NodeSets) {
2064 // Skip small node-sets since they won't cause register pressure problems.
2065 if (NS.size() <= 2)
2066 continue;
2067 IntervalPressure RecRegPressure;
2068 RegPressureTracker RecRPTracker(RecRegPressure);
2069 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2070 computeLiveOuts(MF, RecRPTracker, NS);
2071 RecRPTracker.closeBottom();
2072
2073 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2074 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2075 return A->NodeNum > B->NodeNum;
2076 });
2077
2078 for (auto &SU : SUnits) {
2079 // Since we're computing the register pressure for a subset of the
2080 // instructions in a block, we need to set the tracker for each
2081 // instruction in the node-set. The tracker is set to the instruction
2082 // just after the one we're interested in.
2084 RecRPTracker.setPos(std::next(CurInstI));
2085
2086 RegPressureDelta RPDelta;
2087 ArrayRef<PressureChange> CriticalPSets;
2088 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2089 CriticalPSets,
2090 RecRegPressure.MaxSetPressure);
2091 if (RPDelta.Excess.isValid()) {
2092 LLVM_DEBUG(
2093 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2095 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2096 NS.setExceedPressure(SU);
2097 break;
2098 }
2099 RecRPTracker.recede();
2100 }
2101 }
2102}
2103
2104/// A heuristic to colocate node sets that have the same set of
2105/// successors.
2106void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2107 unsigned Colocate = 0;
2108 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2109 NodeSet &N1 = NodeSets[i];
2111 if (N1.empty() || !succ_L(N1, S1))
2112 continue;
2113 for (int j = i + 1; j < e; ++j) {
2114 NodeSet &N2 = NodeSets[j];
2115 if (N1.compareRecMII(N2) != 0)
2116 continue;
2118 if (N2.empty() || !succ_L(N2, S2))
2119 continue;
2120 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2121 N1.setColocate(++Colocate);
2122 N2.setColocate(Colocate);
2123 break;
2124 }
2125 }
2126 }
2127}
2128
2129/// Check if the existing node-sets are profitable. If not, then ignore the
2130/// recurrent node-sets, and attempt to schedule all nodes together. This is
2131/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2132/// then it's best to try to schedule all instructions together instead of
2133/// starting with the recurrent node-sets.
2134void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2135 // Look for loops with a large MII.
2136 if (MII < 17)
2137 return;
2138 // Check if the node-set contains only a simple add recurrence.
2139 for (auto &NS : NodeSets) {
2140 if (NS.getRecMII() > 2)
2141 return;
2142 if (NS.getMaxDepth() > MII)
2143 return;
2144 }
2145 NodeSets.clear();
2146 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2147}
2148
2149/// Add the nodes that do not belong to a recurrence set into groups
2150/// based upon connected components.
2151void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2152 SetVector<SUnit *> NodesAdded;
2154 // Add the nodes that are on a path between the previous node sets and
2155 // the current node set.
2156 for (NodeSet &I : NodeSets) {
2158 // Add the nodes from the current node set to the previous node set.
2159 if (succ_L(I, N)) {
2161 for (SUnit *NI : N) {
2162 Visited.clear();
2163 computePath(NI, Path, NodesAdded, I, Visited);
2164 }
2165 if (!Path.empty())
2166 I.insert(Path.begin(), Path.end());
2167 }
2168 // Add the nodes from the previous node set to the current node set.
2169 N.clear();
2170 if (succ_L(NodesAdded, N)) {
2172 for (SUnit *NI : N) {
2173 Visited.clear();
2174 computePath(NI, Path, I, NodesAdded, Visited);
2175 }
2176 if (!Path.empty())
2177 I.insert(Path.begin(), Path.end());
2178 }
2179 NodesAdded.insert(I.begin(), I.end());
2180 }
2181
2182 // Create a new node set with the connected nodes of any successor of a node
2183 // in a recurrent set.
2184 NodeSet NewSet;
2186 if (succ_L(NodesAdded, N))
2187 for (SUnit *I : N)
2188 addConnectedNodes(I, NewSet, NodesAdded);
2189 if (!NewSet.empty())
2190 NodeSets.push_back(NewSet);
2191
2192 // Create a new node set with the connected nodes of any predecessor of a node
2193 // in a recurrent set.
2194 NewSet.clear();
2195 if (pred_L(NodesAdded, N))
2196 for (SUnit *I : N)
2197 addConnectedNodes(I, NewSet, NodesAdded);
2198 if (!NewSet.empty())
2199 NodeSets.push_back(NewSet);
2200
2201 // Create new nodes sets with the connected nodes any remaining node that
2202 // has no predecessor.
2203 for (SUnit &SU : SUnits) {
2204 if (NodesAdded.count(&SU) == 0) {
2205 NewSet.clear();
2206 addConnectedNodes(&SU, NewSet, NodesAdded);
2207 if (!NewSet.empty())
2208 NodeSets.push_back(NewSet);
2209 }
2210 }
2211}
2212
2213/// Add the node to the set, and add all of its connected nodes to the set.
2214void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2215 SetVector<SUnit *> &NodesAdded) {
2216 NewSet.insert(SU);
2217 NodesAdded.insert(SU);
2218 for (auto &SI : SU->Succs) {
2219 SUnit *Successor = SI.getSUnit();
2220 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2221 NodesAdded.count(Successor) == 0)
2222 addConnectedNodes(Successor, NewSet, NodesAdded);
2223 }
2224 for (auto &PI : SU->Preds) {
2225 SUnit *Predecessor = PI.getSUnit();
2226 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2227 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2228 }
2229}
2230
2231/// Return true if Set1 contains elements in Set2. The elements in common
2232/// are returned in a different container.
2233static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2235 Result.clear();
2236 for (SUnit *SU : Set1) {
2237 if (Set2.count(SU) != 0)
2238 Result.insert(SU);
2239 }
2240 return !Result.empty();
2241}
2242
2243/// Merge the recurrence node sets that have the same initial node.
2244void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2245 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2246 ++I) {
2247 NodeSet &NI = *I;
2248 for (NodeSetType::iterator J = I + 1; J != E;) {
2249 NodeSet &NJ = *J;
2250 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2251 if (NJ.compareRecMII(NI) > 0)
2252 NI.setRecMII(NJ.getRecMII());
2253 for (SUnit *SU : *J)
2254 I->insert(SU);
2255 NodeSets.erase(J);
2256 E = NodeSets.end();
2257 } else {
2258 ++J;
2259 }
2260 }
2261 }
2262}
2263
2264/// Remove nodes that have been scheduled in previous NodeSets.
2265void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2266 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2267 ++I)
2268 for (NodeSetType::iterator J = I + 1; J != E;) {
2269 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2270
2271 if (J->empty()) {
2272 NodeSets.erase(J);
2273 E = NodeSets.end();
2274 } else {
2275 ++J;
2276 }
2277 }
2278}
2279
2280/// Compute an ordered list of the dependence graph nodes, which
2281/// indicates the order that the nodes will be scheduled. This is a
2282/// two-level algorithm. First, a partial order is created, which
2283/// consists of a list of sets ordered from highest to lowest priority.
2284void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2286 NodeOrder.clear();
2287
2288 for (auto &Nodes : NodeSets) {
2289 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2290 OrderKind Order;
2292 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2293 R.insert(N.begin(), N.end());
2294 Order = BottomUp;
2295 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2296 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2297 R.insert(N.begin(), N.end());
2298 Order = TopDown;
2299 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2300 } else if (isIntersect(N, Nodes, R)) {
2301 // If some of the successors are in the existing node-set, then use the
2302 // top-down ordering.
2303 Order = TopDown;
2304 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2305 } else if (NodeSets.size() == 1) {
2306 for (const auto &N : Nodes)
2307 if (N->Succs.size() == 0)
2308 R.insert(N);
2309 Order = BottomUp;
2310 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2311 } else {
2312 // Find the node with the highest ASAP.
2313 SUnit *maxASAP = nullptr;
2314 for (SUnit *SU : Nodes) {
2315 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2316 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2317 maxASAP = SU;
2318 }
2319 R.insert(maxASAP);
2320 Order = BottomUp;
2321 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2322 }
2323
2324 while (!R.empty()) {
2325 if (Order == TopDown) {
2326 // Choose the node with the maximum height. If more than one, choose
2327 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2328 // choose the node with the lowest MOV.
2329 while (!R.empty()) {
2330 SUnit *maxHeight = nullptr;
2331 for (SUnit *I : R) {
2332 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2333 maxHeight = I;
2334 else if (getHeight(I) == getHeight(maxHeight) &&
2336 maxHeight = I;
2337 else if (getHeight(I) == getHeight(maxHeight) &&
2339 getZeroLatencyHeight(maxHeight) &&
2340 getMOV(I) < getMOV(maxHeight))
2341 maxHeight = I;
2342 }
2343 NodeOrder.insert(maxHeight);
2344 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2345 R.remove(maxHeight);
2346 for (const auto &I : maxHeight->Succs) {
2347 if (Nodes.count(I.getSUnit()) == 0)
2348 continue;
2349 if (NodeOrder.contains(I.getSUnit()))
2350 continue;
2351 if (ignoreDependence(I, false))
2352 continue;
2353 R.insert(I.getSUnit());
2354 }
2355 // Back-edges are predecessors with an anti-dependence.
2356 for (const auto &I : maxHeight->Preds) {
2357 if (I.getKind() != SDep::Anti)
2358 continue;
2359 if (Nodes.count(I.getSUnit()) == 0)
2360 continue;
2361 if (NodeOrder.contains(I.getSUnit()))
2362 continue;
2363 R.insert(I.getSUnit());
2364 }
2365 }
2366 Order = BottomUp;
2367 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2369 if (pred_L(NodeOrder, N, &Nodes))
2370 R.insert(N.begin(), N.end());
2371 } else {
2372 // Choose the node with the maximum depth. If more than one, choose
2373 // the node with the maximum ZeroLatencyDepth. If still more than one,
2374 // choose the node with the lowest MOV.
2375 while (!R.empty()) {
2376 SUnit *maxDepth = nullptr;
2377 for (SUnit *I : R) {
2378 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2379 maxDepth = I;
2380 else if (getDepth(I) == getDepth(maxDepth) &&
2382 maxDepth = I;
2383 else if (getDepth(I) == getDepth(maxDepth) &&
2385 getMOV(I) < getMOV(maxDepth))
2386 maxDepth = I;
2387 }
2388 NodeOrder.insert(maxDepth);
2389 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2390 R.remove(maxDepth);
2391 if (Nodes.isExceedSU(maxDepth)) {
2392 Order = TopDown;
2393 R.clear();
2394 R.insert(Nodes.getNode(0));
2395 break;
2396 }
2397 for (const auto &I : maxDepth->Preds) {
2398 if (Nodes.count(I.getSUnit()) == 0)
2399 continue;
2400 if (NodeOrder.contains(I.getSUnit()))
2401 continue;
2402 R.insert(I.getSUnit());
2403 }
2404 // Back-edges are predecessors with an anti-dependence.
2405 for (const auto &I : maxDepth->Succs) {
2406 if (I.getKind() != SDep::Anti)
2407 continue;
2408 if (Nodes.count(I.getSUnit()) == 0)
2409 continue;
2410 if (NodeOrder.contains(I.getSUnit()))
2411 continue;
2412 R.insert(I.getSUnit());
2413 }
2414 }
2415 Order = TopDown;
2416 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2418 if (succ_L(NodeOrder, N, &Nodes))
2419 R.insert(N.begin(), N.end());
2420 }
2421 }
2422 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2423 }
2424
2425 LLVM_DEBUG({
2426 dbgs() << "Node order: ";
2427 for (SUnit *I : NodeOrder)
2428 dbgs() << " " << I->NodeNum << " ";
2429 dbgs() << "\n";
2430 });
2431}
2432
2433/// Process the nodes in the computed order and create the pipelined schedule
2434/// of the instructions, if possible. Return true if a schedule is found.
2435bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2436
2437 if (NodeOrder.empty()){
2438 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2439 return false;
2440 }
2441
2442 bool scheduleFound = false;
2443 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2444 if (LimitRegPressure) {
2445 HRPDetector =
2446 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2447 HRPDetector->init(RegClassInfo);
2448 }
2449 // Keep increasing II until a valid schedule is found.
2450 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2451 Schedule.reset();
2452 Schedule.setInitiationInterval(II);
2453 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2454
2455 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2456 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2457 do {
2458 SUnit *SU = *NI;
2459
2460 // Compute the schedule time for the instruction, which is based
2461 // upon the scheduled time for any predecessors/successors.
2462 int EarlyStart = INT_MIN;
2463 int LateStart = INT_MAX;
2464 // These values are set when the size of the schedule window is limited
2465 // due to chain dependences.
2466 int SchedEnd = INT_MAX;
2467 int SchedStart = INT_MIN;
2468 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2469 II, this);
2470 LLVM_DEBUG({
2471 dbgs() << "\n";
2472 dbgs() << "Inst (" << SU->NodeNum << ") ";
2473 SU->getInstr()->dump();
2474 dbgs() << "\n";
2475 });
2476 LLVM_DEBUG({
2477 dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2478 LateStart, SchedEnd, SchedStart);
2479 });
2480
2481 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2482 SchedStart > LateStart)
2483 scheduleFound = false;
2484 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2485 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2486 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2487 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2488 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2489 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2490 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2491 SchedEnd =
2492 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2493 // When scheduling a Phi it is better to start at the late cycle and go
2494 // backwards. The default order may insert the Phi too far away from
2495 // its first dependence.
2496 if (SU->getInstr()->isPHI())
2497 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2498 else
2499 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2500 } else {
2501 int FirstCycle = Schedule.getFirstCycle();
2502 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2503 FirstCycle + getASAP(SU) + II - 1, II);
2504 }
2505 // Even if we find a schedule, make sure the schedule doesn't exceed the
2506 // allowable number of stages. We keep trying if this happens.
2507 if (scheduleFound)
2508 if (SwpMaxStages > -1 &&
2509 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2510 scheduleFound = false;
2511
2512 LLVM_DEBUG({
2513 if (!scheduleFound)
2514 dbgs() << "\tCan't schedule\n";
2515 });
2516 } while (++NI != NE && scheduleFound);
2517
2518 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2519 if (scheduleFound)
2520 scheduleFound =
2521 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2522
2523 // If a schedule is found, check if it is a valid schedule too.
2524 if (scheduleFound)
2525 scheduleFound = Schedule.isValidSchedule(this);
2526
2527 // If a schedule was found and the option is enabled, check if the schedule
2528 // might generate additional register spills/fills.
2529 if (scheduleFound && LimitRegPressure)
2530 scheduleFound =
2531 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2532 }
2533
2534 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2535 << " (II=" << Schedule.getInitiationInterval()
2536 << ")\n");
2537
2538 if (scheduleFound) {
2539 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2540 if (!scheduleFound)
2541 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2542 }
2543
2544 if (scheduleFound) {
2545 Schedule.finalizeSchedule(this);
2546 Pass.ORE->emit([&]() {
2548 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2549 << "Schedule found with Initiation Interval: "
2550 << ore::NV("II", Schedule.getInitiationInterval())
2551 << ", MaxStageCount: "
2552 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2553 });
2554 } else
2555 Schedule.reset();
2556
2557 return scheduleFound && Schedule.getMaxStageCount() > 0;
2558}
2559
2560/// Return true if we can compute the amount the instruction changes
2561/// during each iteration. Set Delta to the amount of the change.
2562bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2564 const MachineOperand *BaseOp;
2565 int64_t Offset;
2566 bool OffsetIsScalable;
2567 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2568 return false;
2569
2570 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2571 if (OffsetIsScalable)
2572 return false;
2573
2574 if (!BaseOp->isReg())
2575 return false;
2576
2577 Register BaseReg = BaseOp->getReg();
2578
2580 // Check if there is a Phi. If so, get the definition in the loop.
2581 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2582 if (BaseDef && BaseDef->isPHI()) {
2583 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2584 BaseDef = MRI.getVRegDef(BaseReg);
2585 }
2586 if (!BaseDef)
2587 return false;
2588
2589 int D = 0;
2590 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2591 return false;
2592
2593 Delta = D;
2594 return true;
2595}
2596
2597/// Check if we can change the instruction to use an offset value from the
2598/// previous iteration. If so, return true and set the base and offset values
2599/// so that we can rewrite the load, if necessary.
2600/// v1 = Phi(v0, v3)
2601/// v2 = load v1, 0
2602/// v3 = post_store v1, 4, x
2603/// This function enables the load to be rewritten as v2 = load v3, 4.
2604bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2605 unsigned &BasePos,
2606 unsigned &OffsetPos,
2607 unsigned &NewBase,
2608 int64_t &Offset) {
2609 // Get the load instruction.
2610 if (TII->isPostIncrement(*MI))
2611 return false;
2612 unsigned BasePosLd, OffsetPosLd;
2613 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2614 return false;
2615 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2616
2617 // Look for the Phi instruction.
2618 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2619 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2620 if (!Phi || !Phi->isPHI())
2621 return false;
2622 // Get the register defined in the loop block.
2623 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2624 if (!PrevReg)
2625 return false;
2626
2627 // Check for the post-increment load/store instruction.
2628 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2629 if (!PrevDef || PrevDef == MI)
2630 return false;
2631
2632 if (!TII->isPostIncrement(*PrevDef))
2633 return false;
2634
2635 unsigned BasePos1 = 0, OffsetPos1 = 0;
2636 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2637 return false;
2638
2639 // Make sure that the instructions do not access the same memory location in
2640 // the next iteration.
2641 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2642 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2644 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2645 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2646 MF.deleteMachineInstr(NewMI);
2647 if (!Disjoint)
2648 return false;
2649
2650 // Set the return value once we determine that we return true.
2651 BasePos = BasePosLd;
2652 OffsetPos = OffsetPosLd;
2653 NewBase = PrevReg;
2654 Offset = StoreOffset;
2655 return true;
2656}
2657
2658/// Apply changes to the instruction if needed. The changes are need
2659/// to improve the scheduling and depend up on the final schedule.
2661 SMSchedule &Schedule) {
2662 SUnit *SU = getSUnit(MI);
2664 InstrChanges.find(SU);
2665 if (It != InstrChanges.end()) {
2666 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2667 unsigned BasePos, OffsetPos;
2668 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2669 return;
2670 Register BaseReg = MI->getOperand(BasePos).getReg();
2671 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2672 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2673 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2674 int BaseStageNum = Schedule.stageScheduled(SU);
2675 int BaseCycleNum = Schedule.cycleScheduled(SU);
2676 if (BaseStageNum < DefStageNum) {
2678 int OffsetDiff = DefStageNum - BaseStageNum;
2679 if (DefCycleNum < BaseCycleNum) {
2680 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2681 if (OffsetDiff > 0)
2682 --OffsetDiff;
2683 }
2684 int64_t NewOffset =
2685 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2686 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2687 SU->setInstr(NewMI);
2688 MISUnitMap[NewMI] = SU;
2689 NewMIs[MI] = NewMI;
2690 }
2691 }
2692}
2693
2694/// Return the instruction in the loop that defines the register.
2695/// If the definition is a Phi, then follow the Phi operand to
2696/// the instruction in the loop.
2697MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2699 MachineInstr *Def = MRI.getVRegDef(Reg);
2700 while (Def->isPHI()) {
2701 if (!Visited.insert(Def).second)
2702 break;
2703 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2704 if (Def->getOperand(i + 1).getMBB() == BB) {
2705 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2706 break;
2707 }
2708 }
2709 return Def;
2710}
2711
2712/// Return true for an order or output dependence that is loop carried
2713/// potentially. A dependence is loop carried if the destination defines a value
2714/// that may be used or defined by the source in a subsequent iteration.
2716 bool isSucc) {
2717 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2718 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2719 return false;
2720
2722 return true;
2723
2724 if (Dep.getKind() == SDep::Output)
2725 return true;
2726
2727 MachineInstr *SI = Source->getInstr();
2728 MachineInstr *DI = Dep.getSUnit()->getInstr();
2729 if (!isSucc)
2730 std::swap(SI, DI);
2731 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2732
2733 // Assume ordered loads and stores may have a loop carried dependence.
2734 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2735 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2736 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2737 return true;
2738
2739 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2740 return false;
2741
2742 // The conservative assumption is that a dependence between memory operations
2743 // may be loop carried. The following code checks when it can be proved that
2744 // there is no loop carried dependence.
2745 unsigned DeltaS, DeltaD;
2746 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2747 return true;
2748
2749 const MachineOperand *BaseOpS, *BaseOpD;
2750 int64_t OffsetS, OffsetD;
2751 bool OffsetSIsScalable, OffsetDIsScalable;
2753 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2754 TRI) ||
2755 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2756 TRI))
2757 return true;
2758
2759 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2760 "Expected offsets to be byte offsets");
2761
2762 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2763 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2764 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2765 return true;
2766
2767 unsigned InitValS = 0;
2768 unsigned LoopValS = 0;
2769 unsigned InitValD = 0;
2770 unsigned LoopValD = 0;
2771 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2772 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2773 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2774 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2775
2776 if (!InitDefS->isIdenticalTo(*InitDefD))
2777 return true;
2778
2779 // Check that the base register is incremented by a constant value for each
2780 // iteration.
2781 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2782 int D = 0;
2783 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2784 return true;
2785
2786 LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2787 LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2788
2789 // This is the main test, which checks the offset values and the loop
2790 // increment value to determine if the accesses may be loop carried.
2791 if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2792 return true;
2793
2794 if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2795 DeltaD < AccessSizeD.getValue())
2796 return true;
2797
2798 return (OffsetS + (int64_t)AccessSizeS.getValue() <
2799 OffsetD + (int64_t)AccessSizeD.getValue());
2800}
2801
2802void SwingSchedulerDAG::postProcessDAG() {
2803 for (auto &M : Mutations)
2804 M->apply(this);
2805}
2806
2807/// Try to schedule the node at the specified StartCycle and continue
2808/// until the node is schedule or the EndCycle is reached. This function
2809/// returns true if the node is scheduled. This routine may search either
2810/// forward or backward for a place to insert the instruction based upon
2811/// the relative values of StartCycle and EndCycle.
2812bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2813 bool forward = true;
2814 LLVM_DEBUG({
2815 dbgs() << "Trying to insert node between " << StartCycle << " and "
2816 << EndCycle << " II: " << II << "\n";
2817 });
2818 if (StartCycle > EndCycle)
2819 forward = false;
2820
2821 // The terminating condition depends on the direction.
2822 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2823 for (int curCycle = StartCycle; curCycle != termCycle;
2824 forward ? ++curCycle : --curCycle) {
2825
2826 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2827 ProcItinResources.canReserveResources(*SU, curCycle)) {
2828 LLVM_DEBUG({
2829 dbgs() << "\tinsert at cycle " << curCycle << " ";
2830 SU->getInstr()->dump();
2831 });
2832
2833 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2834 ProcItinResources.reserveResources(*SU, curCycle);
2835 ScheduledInstrs[curCycle].push_back(SU);
2836 InstrToCycle.insert(std::make_pair(SU, curCycle));
2837 if (curCycle > LastCycle)
2838 LastCycle = curCycle;
2839 if (curCycle < FirstCycle)
2840 FirstCycle = curCycle;
2841 return true;
2842 }
2843 LLVM_DEBUG({
2844 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2845 SU->getInstr()->dump();
2846 });
2847 }
2848 return false;
2849}
2850
2851// Return the cycle of the earliest scheduled instruction in the chain.
2854 SmallVector<SDep, 8> Worklist;
2855 Worklist.push_back(Dep);
2856 int EarlyCycle = INT_MAX;
2857 while (!Worklist.empty()) {
2858 const SDep &Cur = Worklist.pop_back_val();
2859 SUnit *PrevSU = Cur.getSUnit();
2860 if (Visited.count(PrevSU))
2861 continue;
2862 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2863 if (it == InstrToCycle.end())
2864 continue;
2865 EarlyCycle = std::min(EarlyCycle, it->second);
2866 for (const auto &PI : PrevSU->Preds)
2867 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2868 Worklist.push_back(PI);
2869 Visited.insert(PrevSU);
2870 }
2871 return EarlyCycle;
2872}
2873
2874// Return the cycle of the latest scheduled instruction in the chain.
2877 SmallVector<SDep, 8> Worklist;
2878 Worklist.push_back(Dep);
2879 int LateCycle = INT_MIN;
2880 while (!Worklist.empty()) {
2881 const SDep &Cur = Worklist.pop_back_val();
2882 SUnit *SuccSU = Cur.getSUnit();
2883 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2884 continue;
2885 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2886 if (it == InstrToCycle.end())
2887 continue;
2888 LateCycle = std::max(LateCycle, it->second);
2889 for (const auto &SI : SuccSU->Succs)
2890 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2891 Worklist.push_back(SI);
2892 Visited.insert(SuccSU);
2893 }
2894 return LateCycle;
2895}
2896
2897/// If an instruction has a use that spans multiple iterations, then
2898/// return true. These instructions are characterized by having a back-ege
2899/// to a Phi, which contains a reference to another Phi.
2901 for (auto &P : SU->Preds)
2902 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2903 for (auto &S : P.getSUnit()->Succs)
2904 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2905 return P.getSUnit();
2906 return nullptr;
2907}
2908
2909/// Compute the scheduling start slot for the instruction. The start slot
2910/// depends on any predecessor or successor nodes scheduled already.
2911void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2912 int *MinEnd, int *MaxStart, int II,
2913 SwingSchedulerDAG *DAG) {
2914 // Iterate over each instruction that has been scheduled already. The start
2915 // slot computation depends on whether the previously scheduled instruction
2916 // is a predecessor or successor of the specified instruction.
2917 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2918
2919 // Iterate over each instruction in the current cycle.
2920 for (SUnit *I : getInstructions(cycle)) {
2921 // Because we're processing a DAG for the dependences, we recognize
2922 // the back-edge in recurrences by anti dependences.
2923 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2924 const SDep &Dep = SU->Preds[i];
2925 if (Dep.getSUnit() == I) {
2926 if (!DAG->isBackedge(SU, Dep)) {
2927 int EarlyStart = cycle + Dep.getLatency() -
2928 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2929 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2930 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2931 int End = earliestCycleInChain(Dep) + (II - 1);
2932 *MinEnd = std::min(*MinEnd, End);
2933 }
2934 } else {
2935 int LateStart = cycle - Dep.getLatency() +
2936 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2937 *MinLateStart = std::min(*MinLateStart, LateStart);
2938 }
2939 }
2940 // For instruction that requires multiple iterations, make sure that
2941 // the dependent instruction is not scheduled past the definition.
2942 SUnit *BE = multipleIterations(I, DAG);
2943 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2944 !SU->isPred(I))
2945 *MinLateStart = std::min(*MinLateStart, cycle);
2946 }
2947 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2948 if (SU->Succs[i].getSUnit() == I) {
2949 const SDep &Dep = SU->Succs[i];
2950 if (!DAG->isBackedge(SU, Dep)) {
2951 int LateStart = cycle - Dep.getLatency() +
2952 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2953 *MinLateStart = std::min(*MinLateStart, LateStart);
2954 if (DAG->isLoopCarriedDep(SU, Dep)) {
2955 int Start = latestCycleInChain(Dep) + 1 - II;
2956 *MaxStart = std::max(*MaxStart, Start);
2957 }
2958 } else {
2959 int EarlyStart = cycle + Dep.getLatency() -
2960 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2961 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2962 }
2963 }
2964 }
2965 }
2966 }
2967}
2968
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is added to the front of Insts when it
/// must precede an existing use, otherwise to the back; when both a use and
/// a def constrain it, the conflicting entries are re-inserted recursively.
// NOTE(review): the opening line of this definition (the function name and
// its first two parameters, presumably SwingSchedulerDAG *SSD and SUnit *SU)
// is not visible in this chunk of the file.
                                 std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();
  // Placement constraints discovered while scanning Insts, plus the positions
  // of the conflicting use (MoveUse) and def (MoveDef).
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If the pipeliner rewrote MI's base register, compare against the new
      // base register instead of the stale operand value.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (unsigned NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        // SU defines a value read in an earlier-or-equal stage: SU must come
        // before that reader.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &S : SU->Succs) {
      if (S.getSUnit() != *I)
        continue;
      if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti deps,
      // so may have nodes in same cycle with Anti denpendent on HW regs.
      else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &P : SU->Preds) {
      if (P.getSUnit() != *I)
        continue;
      if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      // Erase the higher index first so the lower index stays valid.
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    // Re-insert the removed instructions and SU recursively so each lands on
    // the correct side of its dependences.
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3098
/// Return true if the scheduled Phi has a loop carried operand.
// NOTE(review): the opening signature line (the function name and,
// presumably, a SwingSchedulerDAG *SSD parameter) is outside this chunk.
                                  MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  assert(Phi.isPHI() && "Expecting a Phi.");
  // Cycle and stage at which the Phi itself was scheduled.
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  unsigned InitVal = 0;
  unsigned LoopVal = 0;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  // UseSU produces the Phi's loop-carried (back-edge) value.
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  // Loop carried when the producer is scheduled in a later cycle, or in a
  // stage no later than the Phi's own stage.
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3121
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
/// v1 = phi(v2, v3)
///   (Def) v3 = op v1
///   (MO)    = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
// NOTE(review): the opening signature line (the function name and,
// presumably, a SwingSchedulerDAG *SSD parameter) is outside this chunk.
                                       MachineInstr *Def,
                                       MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a Phi in the same block for the pattern to apply.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Check whether Def produces the value the Phi carries around the loop.
  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3148
/// Determine transitive dependences of unpipelineable instructions
// NOTE(review): the signature of this helper (taking the DAG and the
// target's PipelinerLoopInfo, returning the set built below) is outside
// this chunk.
  SmallSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with every instruction the target refuses to pipeline.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything an unpipelineable instruction depends on is also pinned.
    for (auto &Dep : SU->Preds)
      Worklist.push_back(Dep.getSUnit());
    // For Phis, also follow anti-dependence successors (the loop back-edge
    // representation in this DAG).
    if (SU->getInstr()->isPHI())
      for (auto &Dep : SU->Succs)
        if (Dep.getKind() == SDep::Anti)
          Worklist.push_back(Dep.getSUnit());
  }
  return DoNotPipeline;
}
3174
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.
// NOTE(review): the signature line(s) of this method — and the computation of
// the do-not-pipeline set `DNP`, presumably via computeUnpipelineableNodes —
// are outside this chunk.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Not pinned, or already in stage 0: keep its current cycle.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    // ... but never before any of its predecessors' cycles.
    int NewCycle = getFirstCycle();
    for (auto &Dep : SU.Preds)
      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);

    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      // Move SU from its old cycle's instruction list to the new one.
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }
    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3210
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
// NOTE(review): the signature line of this method is outside this chunk.
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &SI : SU.Succs)
      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
        if (Register::isPhysicalRegister(SI.getReg())) {
          // Phys-reg def and use must land in the same stage...
          if (stageScheduled(SI.getSUnit()) != StageDef)
            return false;
          // ...and the use must come strictly after the def's cycle.
          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
            return false;
        }
  }
  return true;
}
3237
3238/// A property of the node order in swing-modulo-scheduling is
3239/// that for nodes outside circuits the following holds:
3240/// none of them is scheduled after both a successor and a
3241/// predecessor.
3242/// The method below checks whether the property is met.
3243/// If not, debug information is printed and statistics information updated.
3244/// Note that we do not use an assert statement.
3245/// The reason is that although an invalid node oder may prevent
3246/// the pipeliner from finding a pipelined schedule for arbitrary II,
3247/// it does not lead to the generation of incorrect code.
3248void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3249
3250 // a sorted vector that maps each SUnit to its index in the NodeOrder
3251 typedef std::pair<SUnit *, unsigned> UnitIndex;
3252 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3253
3254 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3255 Indices.push_back(std::make_pair(NodeOrder[i], i));
3256
3257 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3258 return std::get<0>(i1) < std::get<0>(i2);
3259 };
3260
3261 // sort, so that we can perform a binary search
3262 llvm::sort(Indices, CompareKey);
3263
3264 bool Valid = true;
3265 (void)Valid;
3266 // for each SUnit in the NodeOrder, check whether
3267 // it appears after both a successor and a predecessor
3268 // of the SUnit. If this is the case, and the SUnit
3269 // is not part of circuit, then the NodeOrder is not
3270 // valid.
3271 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3272 SUnit *SU = NodeOrder[i];
3273 unsigned Index = i;
3274
3275 bool PredBefore = false;
3276 bool SuccBefore = false;
3277
3278 SUnit *Succ;
3279 SUnit *Pred;
3280 (void)Succ;
3281 (void)Pred;
3282
3283 for (SDep &PredEdge : SU->Preds) {
3284 SUnit *PredSU = PredEdge.getSUnit();
3285 unsigned PredIndex = std::get<1>(
3286 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3287 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3288 PredBefore = true;
3289 Pred = PredSU;
3290 break;
3291 }
3292 }
3293
3294 for (SDep &SuccEdge : SU->Succs) {
3295 SUnit *SuccSU = SuccEdge.getSUnit();
3296 // Do not process a boundary node, it was not included in NodeOrder,
3297 // hence not in Indices either, call to std::lower_bound() below will
3298 // return Indices.end().
3299 if (SuccSU->isBoundaryNode())
3300 continue;
3301 unsigned SuccIndex = std::get<1>(
3302 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3303 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3304 SuccBefore = true;
3305 Succ = SuccSU;
3306 break;
3307 }
3308 }
3309
3310 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3311 // instructions in circuits are allowed to be scheduled
3312 // after both a successor and predecessor.
3313 bool InCircuit = llvm::any_of(
3314 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3315 if (InCircuit)
3316 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3317 else {
3318 Valid = false;
3319 NumNodeOrderIssues++;
3320 LLVM_DEBUG(dbgs() << "Predecessor ";);
3321 }
3322 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3323 << " are scheduled before node " << SU->NodeNum
3324 << "\n";);
3325 }
3326 }
3327
3328 LLVM_DEBUG({
3329 if (!Valid)
3330 dbgs() << "Invalid node order found!\n";
3331 });
3332}
3333
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // OverlapReg/NewBaseReg track a tied use/def pair (p and p') found earlier
  // in the serialized order; 0 means "no pending pair".
  unsigned OverlapReg = 0;
  unsigned NewBaseReg = 0;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        // NOTE(review): the declaration of the iterator `It` (the result of
        // a find on InstrChanges) starts on a line not visible in this chunk.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            // NOTE(review): the line creating `NewMI` (presumably a clone of
            // MI) is not visible in this chunk.
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // Pair consumed; start looking for the next one.
        OverlapReg = 0;
        NewBaseReg = 0;
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3385
// Reorder the instructions of one cycle: Phis first (in their existing
// relative order), then the remaining instructions ordered so definitions
// precede uses.
// NOTE(review): the middle of the signature (the method name and its first
// parameter, presumably the SwingSchedulerDAG) is outside this chunk.
std::deque<SUnit *>
                                 const std::deque<SUnit *> &Instrs) const {
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  // Insert non-Phis one at a time so each lands on the correct side of its
  // dependences.
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}
3402
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
// NOTE(review): the signature line of this method (taking the
// SwingSchedulerDAG) is outside this chunk.
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      // Prepend in reverse so the stage's internal order is preserved.
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}
3438
// Print a summary of this node set (node count plus its recurrence-MII,
// register-movement, depth and colocation attributes) followed by each
// member SUnit.
// NOTE(review): the signature line (presumably NodeSet::print(raw_ostream&))
// is outside this chunk.
  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     << " depth " << MaxDepth << " col " << Colocate << "\n";
  for (const auto &I : Nodes)
    os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
  os << "\n";
}
3446
3447#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the schedule information to the given output.
// NOTE(review): the signature line (presumably
// SMSchedule::print(raw_ostream &os) const) is outside this chunk.
  // Iterate over each cycle.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    // Iterate over each instruction in the cycle.
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    for (SUnit *CI : cycleInstrs->second) {
      // One line per instruction: cycle, stage, node number, then the MI.
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
      os << "(" << CI->NodeNum << ") ";
      CI->getInstr()->print(os);
      os << "\n";
    }
  }
}
3462
3463/// Utility function used for debugging to print the schedule.
3466
3467void ResourceManager::dumpMRT() const {
3468 LLVM_DEBUG({
3469 if (UseDFA)
3470 return;
3471 std::stringstream SS;
3472 SS << "MRT:\n";
3473 SS << std::setw(4) << "Slot";
3474 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3475 SS << std::setw(3) << I;
3476 SS << std::setw(7) << "#Mops"
3477 << "\n";
3478 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3479 SS << std::setw(4) << Slot;
3480 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3481 SS << std::setw(3) << MRT[Slot][I];
3482 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3483 }
3484 dbgs() << SS.str();
3485 });
3486}
3487#endif
3488
// Build one unique bit per processor-resource unit, and for each resource
// group the union of its sub-units' bits plus its own bit.
// NOTE(review): the first signature line (presumably
// ResourceManager::initProcResourceVectors) is outside this chunk.
    const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
  unsigned ProcResourceID = 0;

  // We currently limit the resource kinds to 64 and below so that we can use
  // uint64_t for Masks
  assert(SM.getNumProcResourceKinds() < 64 &&
         "Too many kinds of resources, unsupported");
  // Create a unique bitmask for every processor resource unit.
  // Skip resource at index 0, since it always references 'InvalidUnit'.
  Masks.resize(SM.getNumProcResourceKinds());
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    // NOTE(review): the line fetching `Desc` (the proc-resource descriptor
    // for kind I) is not visible in this chunk.
    if (Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    ProcResourceID++;
  }
  // Create a unique bitmask for every processor resource group.
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    // NOTE(review): as above, the fetch of `Desc` is not visible here.
    if (!Desc.SubUnitsIdxBegin)
      continue;
    // A group's mask is its own bit OR-ed with all of its sub-units' bits.
    Masks[I] = 1ULL << ProcResourceID;
    for (unsigned U = 0; U < Desc.NumUnits; ++U)
      Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
    ProcResourceID++;
  }
  LLVM_DEBUG({
    if (SwpShowResMask) {
      dbgs() << "ProcResourceDesc:\n";
      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
        const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
        dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
                         ProcResource->Name, I, Masks[I],
                         ProcResource->NumUnits);
      }
      dbgs() << " -----------------\n";
    }
  });
}
3530
// Return whether SU's resources fit at the given cycle without overbooking.
// NOTE(review): the signature line (presumably
// ResourceManager::canReserveResources(SUnit &SU, int Cycle)) is outside
// this chunk.
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "canReserveResources:\n";
  });
  // DFA model: ask the packetizer owning the slot this cycle maps to.
  if (UseDFA)
    return DFAResources[positiveModulo(Cycle, InitiationInterval)]
        ->canReserveResources(&SU.getInstr()->getDesc());

  const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
  if (!SCDesc->isValid()) {
    // No scheduling information for this class; treat it as always fitting.
    LLVM_DEBUG({
      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
      dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
    });
    return true;
  }

  // Tentatively book the resources, test for overbooking, then roll back.
  reserveResources(SCDesc, Cycle);
  bool Result = !isOverbooked();
  unreserveResources(SCDesc, Cycle);

  LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
  return Result;
}
3556
3557void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3558 LLVM_DEBUG({
3559 if (SwpDebugResource)
3560 dbgs() << "reserveResources:\n";
3561 });
3562 if (UseDFA)
3563 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3564 ->reserveResources(&SU.getInstr()->getDesc());
3565
3566 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3567 if (!SCDesc->isValid()) {
3568 LLVM_DEBUG({
3569 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3570 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3571 });
3572 return;
3573 }
3574
3575 reserveResources(SCDesc, Cycle);
3576
3577 LLVM_DEBUG({
3578 if (SwpDebugResource) {
3579 dumpMRT();
3580 dbgs() << "reserveResources: done!\n\n";
3581 }
3582 });
3583}
3584
3585void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3586 int Cycle) {
3587 assert(!UseDFA);
3588 for (const MCWriteProcResEntry &PRE : make_range(
3589 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3590 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3591 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3592
3593 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3594 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3595}
3596
3597void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3598 int Cycle) {
3599 assert(!UseDFA);
3600 for (const MCWriteProcResEntry &PRE : make_range(
3601 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3602 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3603 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3604
3605 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3606 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3607}
3608
/// Return true if any slot of the MRT oversubscribes a resource kind or the
/// machine's issue width.
bool ResourceManager::isOverbooked() const {
  assert(!UseDFA);
  for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
      // NOTE(review): the line fetching `Desc` (the proc-resource descriptor
      // for kind I) is not visible in this chunk.
      if (MRT[Slot][I] > Desc->NumUnits)
        return true;
    }
    if (NumScheduledMops[Slot] > IssueWidth)
      return true;
  }
  return false;
}
3622
/// Compute the resource-constrained minimum initiation interval with the
/// target's DFA packetizer model: greedily pack every instruction into as
/// few DFAs as possible; the number of DFAs needed is the ResMII.
int ResourceManager::calculateResMIIDFA() const {
  assert(UseDFA);

  // Sort the instructions by the number of available choices for scheduling,
  // least to most. Use the number of critical resources as the tie breaker.
  FuncUnitSorter FUS = FuncUnitSorter(*ST);
  for (SUnit &SU : DAG->SUnits)
    FUS.calcCriticalResources(*SU.getInstr());
  // NOTE(review): the declaration of the priority queue `FuncUnitOrder`
  // starts on a line that is not visible in this chunk.
      FuncUnitOrder(FUS);

  for (SUnit &SU : DAG->SUnits)
    FuncUnitOrder.push(SU.getInstr());

  // NOTE(review): the declaration of the `Resources` container (holding the
  // per-cycle DFAs) is not visible in this chunk.
  Resources.push_back(
      std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));

  while (!FuncUnitOrder.empty()) {
    MachineInstr *MI = FuncUnitOrder.top();
    FuncUnitOrder.pop();
    // Zero-cost instructions consume no resources.
    if (TII->isZeroCost(MI->getOpcode()))
      continue;

    // Attempt to reserve the instruction in an existing DFA. At least one
    // DFA is needed for each cycle.
    unsigned NumCycles = DAG->getSUnit(MI)->Latency;
    unsigned ReservedCycles = 0;
    auto *RI = Resources.begin();
    auto *RE = Resources.end();
    LLVM_DEBUG({
      dbgs() << "Trying to reserve resource for " << NumCycles
             << " cycles for \n";
      MI->dump();
    });
    for (unsigned C = 0; C < NumCycles; ++C)
      while (RI != RE) {
        if ((*RI)->canReserveResources(*MI)) {
          (*RI)->reserveResources(*MI);
          ++ReservedCycles;
          break;
        }
        RI++;
      }
    LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
                      << ", NumCycles:" << NumCycles << "\n");
    // Add new DFAs, if needed, to reserve resources.
    for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
      // NOTE(review): the opening of this LLVM_DEBUG statement is not
      // visible in this chunk.
                << "NewResource created to reserve resources"
                << "\n");
      auto *NewResource = TII->CreateTargetScheduleState(*ST);
      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
      NewResource->reserveResources(*MI);
      Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
    }
  }

  // One DFA per cycle was required, so the DFA count is the ResMII.
  int Resmii = Resources.size();
  LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
  return Resmii;
}
3685
// Compute the resource-constrained minimum initiation interval. With the
// proc-resource model, total each resource kind's consumption across all
// instructions and divide by its unit count; ResMII is the maximum of those
// quotients and the micro-op/issue-width quotient.
// NOTE(review): the signature line (presumably
// int ResourceManager::calculateResMII() const) is outside this chunk.
  if (UseDFA)
    return calculateResMIIDFA();

  // Count each resource consumption and divide it by the number of units.
  // ResMII is the max value among them.

  int NumMops = 0;
  // NOTE(review): the declaration of `ResourceCount` (the per-kind
  // consumption totals indexed below) is not visible in this chunk.
  for (SUnit &SU : DAG->SUnits) {
    // Zero-cost instructions consume no resources.
    if (TII->isZeroCost(SU.getInstr()->getOpcode()))
      continue;

    const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
    if (!SCDesc->isValid())
      continue;

    LLVM_DEBUG({
      if (SwpDebugResource) {
        DAG->dumpNode(SU);
        dbgs() << "  #Mops: " << SCDesc->NumMicroOps << "\n"
               << "  WriteProcRes: ";
      }
    });
    NumMops += SCDesc->NumMicroOps;
    // Accumulate each write-proc-res entry's occupancy for its resource kind.
    for (const MCWriteProcResEntry &PRE :
         make_range(STI->getWriteProcResBegin(SCDesc),
                    STI->getWriteProcResEnd(SCDesc))) {
      LLVM_DEBUG({
        if (SwpDebugResource) {
          const MCProcResourceDesc *Desc =
              SM.getProcResource(PRE.ProcResourceIdx);
          dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
        }
      });
      ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
    }
    LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
  }

  // Lower bound from total micro-ops vs. issue width (rounded up).
  int Result = (NumMops + IssueWidth - 1) / IssueWidth;
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "#Mops: " << NumMops << ", "
             << "IssueWidth: " << IssueWidth << ", "
             << "Cycles: " << Result << "\n";
  });

  LLVM_DEBUG({
    if (SwpDebugResource) {
      std::stringstream SS;
      SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
         << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
         << "\n";
      dbgs() << SS.str();
    }
  });
  // Raise the bound to the most contended resource kind (rounded up).
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    // NOTE(review): the line fetching `Desc` (the proc-resource descriptor
    // for kind I) is not visible in this chunk.
    int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
    LLVM_DEBUG({
      if (SwpDebugResource) {
        std::stringstream SS;
        SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
           << Desc->NumUnits << std::setw(10) << ResourceCount[I]
           << std::setw(10) << Cycles << "\n";
        dbgs() << SS.str();
      }
    });
    if (Cycles > Result)
      Result = Cycles;
  }
  return Result;
}
3760
// Reset the manager's state for a new initiation interval.
// NOTE(review): the signature line (presumably ResourceManager::init(int II))
// is outside this chunk.
  InitiationInterval = II;
  // Rebuild one DFA packetizer per slot of the new II.
  DFAResources.clear();
  DFAResources.resize(II);
  for (auto &I : DFAResources)
    I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
  // Reset the modulo reservation table and per-slot micro-op counters.
  MRT.clear();
  // NOTE(review): the line resizing MRT for the new II is not visible in
  // this chunk.
  NumScheduledMops.clear();
  NumScheduledMops.resize(II);
}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const LLT S1
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated schedule."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return the SUnit for that use; otherwise return nullptr.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:345
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:359
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:610
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:566
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:343
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:563
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:797
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:927
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:576
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:753
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:498
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:504
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the EndCycle is reached.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which is the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction may be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should be used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1795
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
bool isValid() const
Definition: MCSchedule.h:136
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:360
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:349
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:334
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:353