//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
// (Extracted from LLVM 19.0.0git.)
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and the check
// where an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
//===----------------------------------------------------------------------===//
31
// NOTE(review): the extraction dropped many #include lines; the list below
// keeps every include that survived and restores the missing ones from
// upstream LLVM — verify against llvm/lib/CodeGen/MachinePipeliner.cpp.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <climits>
#include <cstdint>
#include <deque>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <tuple>
#include <utility>
#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "pipeliner"
104
105STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
106STATISTIC(NumPipelined, "Number of loops software pipelined");
107STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
108STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
109STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
110STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
111STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
112STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
113STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
114STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
115STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
116
117/// A command line option to turn software pipelining on or off.
118static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
119 cl::desc("Enable Software Pipelining"));
120
121/// A command line option to enable SWP at -Os.
122static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
123 cl::desc("Enable SWP at Os."), cl::Hidden,
124 cl::init(false));
125
126/// A command line argument to limit minimum initial interval for pipelining.
127static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
128 cl::desc("Size limit for the MII."),
129 cl::Hidden, cl::init(27));
130
131/// A command line argument to force pipeliner to use specified initial
132/// interval.
133static cl::opt<int> SwpForceII("pipeliner-force-ii",
134 cl::desc("Force pipeliner to use specified II."),
135 cl::Hidden, cl::init(-1));
136
137/// A command line argument to limit the number of stages in the pipeline.
138static cl::opt<int>
139 SwpMaxStages("pipeliner-max-stages",
140 cl::desc("Maximum stages allowed in the generated scheduled."),
141 cl::Hidden, cl::init(3));
142
143/// A command line option to disable the pruning of chain dependences due to
144/// an unrelated Phi.
145static cl::opt<bool>
146 SwpPruneDeps("pipeliner-prune-deps",
147 cl::desc("Prune dependences between unrelated Phi nodes."),
148 cl::Hidden, cl::init(true));
149
150/// A command line option to disable the pruning of loop carried order
151/// dependences.
152static cl::opt<bool>
153 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
154 cl::desc("Prune loop carried order dependences."),
155 cl::Hidden, cl::init(true));
156
157#ifndef NDEBUG
158static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
159#endif
160
161static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
163 cl::desc("Ignore RecMII"));
164
165static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
166 cl::init(false));
167static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
168 cl::init(false));
169
171 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
172 cl::desc("Instead of emitting the pipelined code, annotate instructions "
173 "with the generated schedule for feeding into the "
174 "-modulo-schedule-test pass"));
175
177 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
178 cl::desc(
179 "Use the experimental peeling code generator for software pipelining"));
180
181static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
182 cl::desc("Range to search for II"),
183 cl::Hidden, cl::init(10));
184
185static cl::opt<bool>
186 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
187 cl::desc("Limit register pressure of scheduled loop"));
188
189static cl::opt<int>
190 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
191 cl::init(5),
192 cl::desc("Margin representing the unused percentage of "
193 "the register pressure limit"));
194
195namespace llvm {
196
197// A command line option to enable the CopyToPhi DAG mutation.
199 cl::init(true),
200 cl::desc("Enable CopyToPhi DAG Mutation"));
201
202/// A command line argument to force pipeliner to use specified issue
203/// width.
205 "pipeliner-force-issue-width",
206 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
207 cl::init(-1));
208
209} // end namespace llvm
210
211unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
212char MachinePipeliner::ID = 0;
213#ifndef NDEBUG
215#endif
217
219 "Modulo Software Pipelining", false, false)
225 "Modulo Software Pipelining", false, false)
226
227/// The "main" function for implementing Swing Modulo Scheduling.
228bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
229 if (skipFunction(mf.getFunction()))
230 return false;
231
232 if (!EnableSWP)
233 return false;
234
235 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
236 !EnableSWPOptSize.getPosition())
237 return false;
238
239 if (!mf.getSubtarget().enableMachinePipeliner())
240 return false;
241
242 // Cannot pipeline loops without instruction itineraries if we are using
243 // DFA for the pipeliner.
244 if (mf.getSubtarget().useDFAforSMS() &&
245 (!mf.getSubtarget().getInstrItineraryData() ||
246 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
247 return false;
248
249 MF = &mf;
250 MLI = &getAnalysis<MachineLoopInfo>();
251 MDT = &getAnalysis<MachineDominatorTree>();
252 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
253 TII = MF->getSubtarget().getInstrInfo();
254 RegClassInfo.runOnMachineFunction(*MF);
255
256 for (const auto &L : *MLI)
257 scheduleLoop(*L);
258
259 return false;
260}
261
262/// Attempt to perform the SMS algorithm on the specified loop. This function is
263/// the main entry point for the algorithm. The function identifies candidate
264/// loops, calculates the minimum initiation interval, and attempts to schedule
265/// the loop.
266bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
267 bool Changed = false;
268 for (const auto &InnerLoop : L)
269 Changed |= scheduleLoop(*InnerLoop);
270
271#ifndef NDEBUG
272 // Stop trying after reaching the limit (if any).
273 int Limit = SwpLoopLimit;
274 if (Limit >= 0) {
275 if (NumTries >= SwpLoopLimit)
276 return Changed;
277 NumTries++;
278 }
279#endif
280
281 setPragmaPipelineOptions(L);
282 if (!canPipelineLoop(L)) {
283 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
284 ORE->emit([&]() {
285 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
286 L.getStartLoc(), L.getHeader())
287 << "Failed to pipeline loop";
288 });
289
290 LI.LoopPipelinerInfo.reset();
291 return Changed;
292 }
293
294 ++NumTrytoPipeline;
295
296 Changed = swingModuloScheduler(L);
297
298 LI.LoopPipelinerInfo.reset();
299 return Changed;
300}
301
302void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
303 // Reset the pragma for the next loop in iteration.
304 disabledByPragma = false;
305 II_setByPragma = 0;
306
307 MachineBasicBlock *LBLK = L.getTopBlock();
308
309 if (LBLK == nullptr)
310 return;
311
312 const BasicBlock *BBLK = LBLK->getBasicBlock();
313 if (BBLK == nullptr)
314 return;
315
316 const Instruction *TI = BBLK->getTerminator();
317 if (TI == nullptr)
318 return;
319
320 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
321 if (LoopID == nullptr)
322 return;
323
324 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
325 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
326
327 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
328 MDNode *MD = dyn_cast<MDNode>(MDO);
329
330 if (MD == nullptr)
331 continue;
332
333 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
334
335 if (S == nullptr)
336 continue;
337
338 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
339 assert(MD->getNumOperands() == 2 &&
340 "Pipeline initiation interval hint metadata should have two operands.");
342 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
343 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
344 } else if (S->getString() == "llvm.loop.pipeline.disable") {
345 disabledByPragma = true;
346 }
347 }
348}
349
350/// Return true if the loop can be software pipelined. The algorithm is
351/// restricted to loops with a single basic block. Make sure that the
352/// branch in the loop can be analyzed.
353bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
354 if (L.getNumBlocks() != 1) {
355 ORE->emit([&]() {
356 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
357 L.getStartLoc(), L.getHeader())
358 << "Not a single basic block: "
359 << ore::NV("NumBlocks", L.getNumBlocks());
360 });
361 return false;
362 }
363
364 if (disabledByPragma) {
365 ORE->emit([&]() {
366 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
367 L.getStartLoc(), L.getHeader())
368 << "Disabled by Pragma.";
369 });
370 return false;
371 }
372
373 // Check if the branch can't be understood because we can't do pipelining
374 // if that's the case.
375 LI.TBB = nullptr;
376 LI.FBB = nullptr;
377 LI.BrCond.clear();
378 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
379 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
380 NumFailBranch++;
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "The branch can't be understood";
385 });
386 return false;
387 }
388
389 LI.LoopInductionVar = nullptr;
390 LI.LoopCompare = nullptr;
392 if (!LI.LoopPipelinerInfo) {
393 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
394 NumFailLoop++;
395 ORE->emit([&]() {
396 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
397 L.getStartLoc(), L.getHeader())
398 << "The loop structure is not supported";
399 });
400 return false;
401 }
402
403 if (!L.getLoopPreheader()) {
404 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
405 NumFailPreheader++;
406 ORE->emit([&]() {
407 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
408 L.getStartLoc(), L.getHeader())
409 << "No loop preheader found";
410 });
411 return false;
412 }
413
414 // Remove any subregisters from inputs to phi nodes.
415 preprocessPhiNodes(*L.getHeader());
416 return true;
417}
418
419void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
421 SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
422
423 for (MachineInstr &PI : B.phis()) {
424 MachineOperand &DefOp = PI.getOperand(0);
425 assert(DefOp.getSubReg() == 0);
426 auto *RC = MRI.getRegClass(DefOp.getReg());
427
428 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
429 MachineOperand &RegOp = PI.getOperand(i);
430 if (RegOp.getSubReg() == 0)
431 continue;
432
433 // If the operand uses a subregister, replace it with a new register
434 // without subregisters, and generate a copy to the new register.
435 Register NewReg = MRI.createVirtualRegister(RC);
436 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
438 const DebugLoc &DL = PredB.findDebugLoc(At);
439 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
440 .addReg(RegOp.getReg(), getRegState(RegOp),
441 RegOp.getSubReg());
442 Slots.insertMachineInstrInMaps(*Copy);
443 RegOp.setReg(NewReg);
444 RegOp.setSubReg(0);
445 }
446 }
447}
448
449/// The SMS algorithm consists of the following main steps:
450/// 1. Computation and analysis of the dependence graph.
451/// 2. Ordering of the nodes (instructions).
452/// 3. Attempt to Schedule the loop.
453bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
454 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
455
456 SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
458
459 MachineBasicBlock *MBB = L.getHeader();
460 // The kernel should not include any terminator instructions. These
461 // will be added back later.
462 SMS.startBlock(MBB);
463
464 // Compute the number of 'real' instructions in the basic block by
465 // ignoring terminators.
466 unsigned size = MBB->size();
468 E = MBB->instr_end();
469 I != E; ++I, --size)
470 ;
471
472 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
473 SMS.schedule();
474 SMS.exitRegion();
475
476 SMS.finishBlock();
477 return SMS.hasNewSchedule();
478}
479
488}
489
490void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
491 if (SwpForceII > 0)
492 MII = SwpForceII;
493 else if (II_setByPragma > 0)
494 MII = II_setByPragma;
495 else
496 MII = std::max(ResMII, RecMII);
497}
498
499void SwingSchedulerDAG::setMAX_II() {
500 if (SwpForceII > 0)
501 MAX_II = SwpForceII;
502 else if (II_setByPragma > 0)
503 MAX_II = II_setByPragma;
504 else
505 MAX_II = MII + SwpIISearchRange;
506}
507
508/// We override the schedule function in ScheduleDAGInstrs to implement the
509/// scheduling part of the Swing Modulo Scheduling algorithm.
511 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
512 buildSchedGraph(AA);
513 addLoopCarriedDependences(AA);
514 updatePhiDependences();
516 changeDependences();
517 postProcessDAG();
518 LLVM_DEBUG(dump());
519
520 NodeSetType NodeSets;
521 findCircuits(NodeSets);
522 NodeSetType Circuits = NodeSets;
523
524 // Calculate the MII.
525 unsigned ResMII = calculateResMII();
526 unsigned RecMII = calculateRecMII(NodeSets);
527
528 fuseRecs(NodeSets);
529
530 // This flag is used for testing and can cause correctness problems.
531 if (SwpIgnoreRecMII)
532 RecMII = 0;
533
534 setMII(ResMII, RecMII);
535 setMAX_II();
536
537 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
538 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
539
540 // Can't schedule a loop without a valid MII.
541 if (MII == 0) {
542 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
543 NumFailZeroMII++;
544 Pass.ORE->emit([&]() {
546 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
547 << "Invalid Minimal Initiation Interval: 0";
548 });
549 return;
550 }
551
552 // Don't pipeline large loops.
553 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
554 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
555 << ", we don't pipeline large loops\n");
556 NumFailLargeMaxMII++;
557 Pass.ORE->emit([&]() {
559 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
560 << "Minimal Initiation Interval too large: "
561 << ore::NV("MII", (int)MII) << " > "
562 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
563 << "Refer to -pipeliner-max-mii.";
564 });
565 return;
566 }
567
568 computeNodeFunctions(NodeSets);
569
570 registerPressureFilter(NodeSets);
571
572 colocateNodeSets(NodeSets);
573
574 checkNodeSets(NodeSets);
575
576 LLVM_DEBUG({
577 for (auto &I : NodeSets) {
578 dbgs() << " Rec NodeSet ";
579 I.dump();
580 }
581 });
582
583 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
584
585 groupRemainingNodes(NodeSets);
586
587 removeDuplicateNodes(NodeSets);
588
589 LLVM_DEBUG({
590 for (auto &I : NodeSets) {
591 dbgs() << " NodeSet ";
592 I.dump();
593 }
594 });
595
596 computeNodeOrder(NodeSets);
597
598 // check for node order issues
599 checkValidNodeOrder(Circuits);
600
601 SMSchedule Schedule(Pass.MF, this);
602 Scheduled = schedulePipeline(Schedule);
603
604 if (!Scheduled){
605 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
606 NumFailNoSchedule++;
607 Pass.ORE->emit([&]() {
609 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
610 << "Unable to find schedule";
611 });
612 return;
613 }
614
615 unsigned numStages = Schedule.getMaxStageCount();
616 // No need to generate pipeline if there are no overlapped iterations.
617 if (numStages == 0) {
618 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
619 NumFailZeroStage++;
620 Pass.ORE->emit([&]() {
622 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
623 << "No need to pipeline - no overlapped iterations in schedule.";
624 });
625 return;
626 }
627 // Check that the maximum stage count is less than user-defined limit.
628 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
629 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
630 << " : too many stages, abort\n");
631 NumFailLargeMaxStage++;
632 Pass.ORE->emit([&]() {
634 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
635 << "Too many stages in schedule: "
636 << ore::NV("numStages", (int)numStages) << " > "
637 << ore::NV("SwpMaxStages", SwpMaxStages)
638 << ". Refer to -pipeliner-max-stages.";
639 });
640 return;
641 }
642
643 Pass.ORE->emit([&]() {
645 Loop.getHeader())
646 << "Pipelined succesfully!";
647 });
648
649 // Generate the schedule as a ModuloSchedule.
650 DenseMap<MachineInstr *, int> Cycles, Stages;
651 std::vector<MachineInstr *> OrderedInsts;
652 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
653 ++Cycle) {
654 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
655 OrderedInsts.push_back(SU->getInstr());
656 Cycles[SU->getInstr()] = Cycle;
657 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
658 }
659 }
661 for (auto &KV : NewMIs) {
662 Cycles[KV.first] = Cycles[KV.second];
663 Stages[KV.first] = Stages[KV.second];
664 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
665 }
666
667 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
668 std::move(Stages));
670 assert(NewInstrChanges.empty() &&
671 "Cannot serialize a schedule with InstrChanges!");
673 MSTI.annotate();
674 return;
675 }
676 // The experimental code generator can't work if there are InstChanges.
677 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
678 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
679 MSE.expand();
680 } else {
681 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
682 MSE.expand();
683 MSE.cleanup();
684 }
685 ++NumPipelined;
686}
687
688/// Clean up after the software pipeliner runs.
690 for (auto &KV : NewMIs)
691 MF.deleteMachineInstr(KV.second);
692 NewMIs.clear();
693
694 // Call the superclass.
696}
697
698/// Return the register values for the operands of a Phi instruction.
699/// This function assume the instruction is a Phi.
701 unsigned &InitVal, unsigned &LoopVal) {
702 assert(Phi.isPHI() && "Expecting a Phi.");
703
704 InitVal = 0;
705 LoopVal = 0;
706 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
707 if (Phi.getOperand(i + 1).getMBB() != Loop)
708 InitVal = Phi.getOperand(i).getReg();
709 else
710 LoopVal = Phi.getOperand(i).getReg();
711
712 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
713}
714
715/// Return the Phi register value that comes the loop block.
716static unsigned getLoopPhiReg(const MachineInstr &Phi,
717 const MachineBasicBlock *LoopBB) {
718 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
719 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
720 return Phi.getOperand(i).getReg();
721 return 0;
722}
723
724/// Return true if SUb can be reached from SUa following the chain edges.
725static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
728 Worklist.push_back(SUa);
729 while (!Worklist.empty()) {
730 const SUnit *SU = Worklist.pop_back_val();
731 for (const auto &SI : SU->Succs) {
732 SUnit *SuccSU = SI.getSUnit();
733 if (SI.getKind() == SDep::Order) {
734 if (Visited.count(SuccSU))
735 continue;
736 if (SuccSU == SUb)
737 return true;
738 Worklist.push_back(SuccSU);
739 Visited.insert(SuccSU);
740 }
741 }
742 }
743 return false;
744}
745
746/// Return true if the instruction causes a chain between memory
747/// references before and after it.
749 return MI.isCall() || MI.mayRaiseFPException() ||
750 MI.hasUnmodeledSideEffects() ||
751 (MI.hasOrderedMemoryRef() &&
752 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
753}
754
755/// Return the underlying objects for the memory references of an instruction.
756/// This function calls the code in ValueTracking, but first checks that the
757/// instruction has a memory operand.
760 if (!MI->hasOneMemOperand())
761 return;
762 MachineMemOperand *MM = *MI->memoperands_begin();
763 if (!MM->getValue())
764 return;
765 getUnderlyingObjects(MM->getValue(), Objs);
766 for (const Value *V : Objs) {
767 if (!isIdentifiedObject(V)) {
768 Objs.clear();
769 return;
770 }
771 }
772}
773
774/// Add a chain edge between a load and store if the store can be an
775/// alias of the load on a subsequent iteration, i.e., a loop carried
776/// dependence. This code is very similar to the code in ScheduleDAGInstrs
777/// but that code doesn't create loop carried dependences.
778void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
782 for (auto &SU : SUnits) {
783 MachineInstr &MI = *SU.getInstr();
785 PendingLoads.clear();
786 else if (MI.mayLoad()) {
789 if (Objs.empty())
791 for (const auto *V : Objs) {
792 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
793 SUs.push_back(&SU);
794 }
795 } else if (MI.mayStore()) {
798 if (Objs.empty())
800 for (const auto *V : Objs) {
802 PendingLoads.find(V);
803 if (I == PendingLoads.end())
804 continue;
805 for (auto *Load : I->second) {
806 if (isSuccOrder(Load, &SU))
807 continue;
808 MachineInstr &LdMI = *Load->getInstr();
809 // First, perform the cheaper check that compares the base register.
810 // If they are the same and the load offset is less than the store
811 // offset, then mark the dependence as loop carried potentially.
812 const MachineOperand *BaseOp1, *BaseOp2;
813 int64_t Offset1, Offset2;
814 bool Offset1IsScalable, Offset2IsScalable;
815 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
816 Offset1IsScalable, TRI) &&
817 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
818 Offset2IsScalable, TRI)) {
819 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
820 Offset1IsScalable == Offset2IsScalable &&
821 (int)Offset1 < (int)Offset2) {
823 "What happened to the chain edge?");
824 SDep Dep(Load, SDep::Barrier);
825 Dep.setLatency(1);
826 SU.addPred(Dep);
827 continue;
828 }
829 }
830 // Second, the more expensive check that uses alias analysis on the
831 // base registers. If they alias, and the load offset is less than
832 // the store offset, the mark the dependence as loop carried.
833 if (!AA) {
834 SDep Dep(Load, SDep::Barrier);
835 Dep.setLatency(1);
836 SU.addPred(Dep);
837 continue;
838 }
839 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
840 MachineMemOperand *MMO2 = *MI.memoperands_begin();
841 if (!MMO1->getValue() || !MMO2->getValue()) {
842 SDep Dep(Load, SDep::Barrier);
843 Dep.setLatency(1);
844 SU.addPred(Dep);
845 continue;
846 }
847 if (MMO1->getValue() == MMO2->getValue() &&
848 MMO1->getOffset() <= MMO2->getOffset()) {
849 SDep Dep(Load, SDep::Barrier);
850 Dep.setLatency(1);
851 SU.addPred(Dep);
852 continue;
853 }
854 if (!AA->isNoAlias(
857 MMO2->getAAInfo()))) {
858 SDep Dep(Load, SDep::Barrier);
859 Dep.setLatency(1);
860 SU.addPred(Dep);
861 }
862 }
863 }
864 }
865 }
866}
867
868/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
869/// processes dependences for PHIs. This function adds true dependences
870/// from a PHI to a use, and a loop carried dependence from the use to the
871/// PHI. The loop carried dependence is represented as an anti dependence
872/// edge. This function also removes chain dependences between unrelated
873/// PHIs.
874void SwingSchedulerDAG::updatePhiDependences() {
875 SmallVector<SDep, 4> RemoveDeps;
877
878 // Iterate over each DAG node.
879 for (SUnit &I : SUnits) {
880 RemoveDeps.clear();
881 // Set to true if the instruction has an operand defined by a Phi.
882 unsigned HasPhiUse = 0;
883 unsigned HasPhiDef = 0;
884 MachineInstr *MI = I.getInstr();
885 // Iterate over each operand, and we process the definitions.
886 for (const MachineOperand &MO : MI->operands()) {
887 if (!MO.isReg())
888 continue;
889 Register Reg = MO.getReg();
890 if (MO.isDef()) {
891 // If the register is used by a Phi, then create an anti dependence.
893 UI = MRI.use_instr_begin(Reg),
894 UE = MRI.use_instr_end();
895 UI != UE; ++UI) {
896 MachineInstr *UseMI = &*UI;
897 SUnit *SU = getSUnit(UseMI);
898 if (SU != nullptr && UseMI->isPHI()) {
899 if (!MI->isPHI()) {
900 SDep Dep(SU, SDep::Anti, Reg);
901 Dep.setLatency(1);
902 I.addPred(Dep);
903 } else {
904 HasPhiDef = Reg;
905 // Add a chain edge to a dependent Phi that isn't an existing
906 // predecessor.
907 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
908 I.addPred(SDep(SU, SDep::Barrier));
909 }
910 }
911 }
912 } else if (MO.isUse()) {
913 // If the register is defined by a Phi, then create a true dependence.
915 if (DefMI == nullptr)
916 continue;
917 SUnit *SU = getSUnit(DefMI);
918 if (SU != nullptr && DefMI->isPHI()) {
919 if (!MI->isPHI()) {
920 SDep Dep(SU, SDep::Data, Reg);
921 Dep.setLatency(0);
922 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
923 &SchedModel);
924 I.addPred(Dep);
925 } else {
926 HasPhiUse = Reg;
927 // Add a chain edge to a dependent Phi that isn't an existing
928 // predecessor.
929 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
930 I.addPred(SDep(SU, SDep::Barrier));
931 }
932 }
933 }
934 }
935 // Remove order dependences from an unrelated Phi.
936 if (!SwpPruneDeps)
937 continue;
938 for (auto &PI : I.Preds) {
939 MachineInstr *PMI = PI.getSUnit()->getInstr();
940 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
941 if (I.getInstr()->isPHI()) {
942 if (PMI->getOperand(0).getReg() == HasPhiUse)
943 continue;
944 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
945 continue;
946 }
947 RemoveDeps.push_back(PI);
948 }
949 }
950 for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
951 I.removePred(RemoveDeps[i]);
952 }
953}
954
955/// Iterate over each DAG node and see if we can change any dependences
956/// in order to reduce the recurrence MII.
957void SwingSchedulerDAG::changeDependences() {
958 // See if an instruction can use a value from the previous iteration.
959 // If so, we update the base and offset of the instruction and change
960 // the dependences.
961 for (SUnit &I : SUnits) {
962 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
963 int64_t NewOffset = 0;
964 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
965 NewOffset))
966 continue;
967
968 // Get the MI and SUnit for the instruction that defines the original base.
969 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
971 if (!DefMI)
972 continue;
973 SUnit *DefSU = getSUnit(DefMI);
974 if (!DefSU)
975 continue;
976 // Get the MI and SUnit for the instruction that defins the new base.
977 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
978 if (!LastMI)
979 continue;
980 SUnit *LastSU = getSUnit(LastMI);
981 if (!LastSU)
982 continue;
983
984 if (Topo.IsReachable(&I, LastSU))
985 continue;
986
987 // Remove the dependence. The value now depends on a prior iteration.
989 for (const SDep &P : I.Preds)
990 if (P.getSUnit() == DefSU)
991 Deps.push_back(P);
992 for (int i = 0, e = Deps.size(); i != e; i++) {
993 Topo.RemovePred(&I, Deps[i].getSUnit());
994 I.removePred(Deps[i]);
995 }
996 // Remove the chain dependence between the instructions.
997 Deps.clear();
998 for (auto &P : LastSU->Preds)
999 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1000 Deps.push_back(P);
1001 for (int i = 0, e = Deps.size(); i != e; i++) {
1002 Topo.RemovePred(LastSU, Deps[i].getSUnit());
1003 LastSU->removePred(Deps[i]);
1004 }
1005
1006 // Add a dependence between the new instruction and the instruction
1007 // that defines the new base.
1008 SDep Dep(&I, SDep::Anti, NewBase);
1009 Topo.AddPred(LastSU, &I);
1010 LastSU->addPred(Dep);
1011
1012 // Remember the base and offset information so that we can update the
1013 // instruction during code generation.
1014 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1015 }
1016}
1017
1018/// Create an instruction stream that represents a single iteration and stage of
1019/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1020/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1021/// function is an approximation of SMSchedule::finalizeSchedule with all
1022/// non-const operations removed.
1024 SMSchedule &Schedule,
1025 std::vector<MachineInstr *> &OrderedInsts,
1028
1029 // Move all instructions to the first stage from the later stages.
1030 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1031 ++Cycle) {
1032 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1033 Stage <= LastStage; ++Stage) {
1034 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1035 Cycle + Stage * Schedule.getInitiationInterval()))) {
1036 Instrs[Cycle].push_front(SU);
1037 }
1038 }
1039 }
1040
1041 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1042 ++Cycle) {
1043 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1044 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1045 for (SUnit *SU : CycleInstrs) {
1046 MachineInstr *MI = SU->getInstr();
1047 OrderedInsts.push_back(MI);
1048 Stages[MI] = Schedule.stageScheduled(SU);
1049 }
1050 }
1051}
1052
1053namespace {
1054
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  // Itinerary data, used when the target provides non-empty itineraries.
  const InstrItineraryData *InstrItins;
  // Subtarget info, used for the per-resource scheduling model when
  // itineraries are absent.
  const MCSubtargetInfo *STI;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        // Remember the most constrained stage and its unit mask.
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        // The resource with the fewest units is the scarcest choice.
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Count only stages with exactly one functional-unit choice.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    // Break ties by how heavily contended the critical unit is.
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1158
/// Calculate the maximum register pressure of the scheduled instructions stream
class HighRegisterPressureDetector {
  MachineBasicBlock *OrigMBB;
  const MachineFunction &MF;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets for this target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it is
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  // Print a pressure vector as "[p0 p1 ...]" for debug output.
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Print the pressure sets Reg belongs to, for debug output.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to every pressure set that contains it.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from every pressure set that contains it.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is fixed one, for example, stack pointer
  bool isFixedRegister(Register Reg) const {
    return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined in the loop's block.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isFixedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    // Fold every live-in register into the initial pressure.
    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);

    // We assume fixed registers, such as stack pointer, are already in use.
    // Therefore subtracting the weight of the fixed registers from the limit of
    // each pressure set in advance.
    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
      for (const MCPhysReg Reg : *TRC)
        if (isFixedRegister(Reg))
          FixedRegs.insert(Reg);
    }

    LLVM_DEBUG({
      for (auto Reg : FixedRegs) {
        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
        const int *Sets = TRI->getRegUnitPressureSets(Reg);
        for (; *Sets != -1; Sets++) {
          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
        }
        dbgs() << "]\n";
      }
    });

    for (auto Reg : FixedRegs) {
      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
                        << "\n");
      auto PSetIter = MRI.getPressureSets(Reg);
      unsigned Weight = PSetIter.getWeight();
      for (; PSetIter.isValid(); ++PSetIter) {
        unsigned &Limit = PressureSetLimit[*PSetIter];
        assert(Limit >= Weight &&
               "register pressure limit must be greater than or equal weight");
        Limit -= Weight;
        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
                          << " (decreased by " << Weight << ")\n");
      }
    }
  }

  // There are two patterns of last-use.
  // - by an instruction of the current iteration
  // - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  // - next iteration's phi instructions in i-th stage
  // - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    // - live-in one
    // - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    // A phi's use of a loop-carried value belongs to the next iteration,
    // hence the +1 for phi instructions.
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto Ite = LastUseMI.find(Reg);
        if (Ite == LastUseMI.end()) {
          LastUseMI[Reg] = MI;
        } else {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          // Keep the use with the larger score as the register's last use.
          if (InstrScore(Orig) < InstrScore(New))
            LastUseMI[Reg] = New;
        }
      }
    }

    // Invert the map: group the last-used registers by instruction.
    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlapping.
  //
  // An example of unrolled loop where #Stage is 4..
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Mark a defined register live and add its weight to the running pressure.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isFixedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill a register at its last use and subtract its weight again.
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isFixedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            // A phi uses the value produced by the previous iteration.
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      // Reject when the max pressure plus the safety margin exceeds the limit.
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1527
1528} // end anonymous namespace
1529
/// Calculate the resource constrained minimum initiation interval for the
/// specified loop. We use the DFA to model the resources needed for
/// each instruction, and we ignore dependences. A different DFA is created
/// for each cycle that is required. When adding a new instruction, we attempt
/// to add it to each existing DFA, until a legal space is found. If the
/// instruction cannot be reserved in an existing DFA, we create a new one.
unsigned SwingSchedulerDAG::calculateResMII() {
  LLVM_DEBUG(dbgs() << "calculateResMII:\n");
  // Delegate the per-cycle resource reservation bookkeeping to the
  // resource manager.
  return RM.calculateResMII();
}
1541
1542/// Calculate the recurrence-constrainted minimum initiation interval.
1543/// Iterate over each circuit. Compute the delay(c) and distance(c)
1544/// for each circuit. The II needs to satisfy the inequality
1545/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1546/// II that satisfies the inequality, and the RecMII is the maximum
1547/// of those values.
1548unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1549 unsigned RecMII = 0;
1550
1551 for (NodeSet &Nodes : NodeSets) {
1552 if (Nodes.empty())
1553 continue;
1554
1555 unsigned Delay = Nodes.getLatency();
1556 unsigned Distance = 1;
1557
1558 // ii = ceil(delay / distance)
1559 unsigned CurMII = (Delay + Distance - 1) / Distance;
1560 Nodes.setRecMII(CurMII);
1561 if (CurMII > RecMII)
1562 RecMII = CurMII;
1563 }
1564
1565 return RecMII;
1566}
1567
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
/// but we do this to find the circuits, and then change them back.
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
  // Collect every anti dependence first; mutating the predecessor lists while
  // iterating over them would invalidate the iteration.
  for (SUnit &SU : SUnits) {
    for (SDep &Pred : SU.Preds)
      if (Pred.getKind() == SDep::Anti)
        DepsAdded.push_back(std::make_pair(&SU, Pred));
  }
  for (std::pair<SUnit *, SDep> &P : DepsAdded) {
    // Remove this anti dependency and add one in the reverse direction.
    SUnit *SU = P.first;
    SDep &D = P.second;
    SUnit *TargetSU = D.getSUnit();
    unsigned Reg = D.getReg();
    unsigned Lat = D.getLatency();
    SU->removePred(D);
    // Recreate the edge with the same register and latency, but reversed.
    SDep Dep(SU, SDep::Anti, Reg);
    Dep.setLatency(Lat);
    TargetSU->addPred(Dep);
  }
}
1590
1591/// Create the adjacency structure of the nodes in the graph.
1592void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1593 SwingSchedulerDAG *DAG) {
1594 BitVector Added(SUnits.size());
1595 DenseMap<int, int> OutputDeps;
1596 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1597 Added.reset();
1598 // Add any successor to the adjacency matrix and exclude duplicates.
1599 for (auto &SI : SUnits[i].Succs) {
1600 // Only create a back-edge on the first and last nodes of a dependence
1601 // chain. This records any chains and adds them later.
1602 if (SI.getKind() == SDep::Output) {
1603 int N = SI.getSUnit()->NodeNum;
1604 int BackEdge = i;
1605 auto Dep = OutputDeps.find(BackEdge);
1606 if (Dep != OutputDeps.end()) {
1607 BackEdge = Dep->second;
1608 OutputDeps.erase(Dep);
1609 }
1610 OutputDeps[N] = BackEdge;
1611 }
1612 // Do not process a boundary node, an artificial node.
1613 // A back-edge is processed only if it goes to a Phi.
1614 if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1615 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1616 continue;
1617 int N = SI.getSUnit()->NodeNum;
1618 if (!Added.test(N)) {
1619 AdjK[i].push_back(N);
1620 Added.set(N);
1621 }
1622 }
1623 // A chain edge between a store and a load is treated as a back-edge in the
1624 // adjacency matrix.
1625 for (auto &PI : SUnits[i].Preds) {
1626 if (!SUnits[i].getInstr()->mayStore() ||
1627 !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1628 continue;
1629 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1630 int N = PI.getSUnit()->NodeNum;
1631 if (!Added.test(N)) {
1632 AdjK[i].push_back(N);
1633 Added.set(N);
1634 }
1635 }
1636 }
1637 }
1638 // Add back-edges in the adjacency matrix for the output dependences.
1639 for (auto &OD : OutputDeps)
1640 if (!Added.test(OD.second)) {
1641 AdjK[OD.first].push_back(OD.second);
1642 Added.set(OD.second);
1643 }
1644}
1645
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  // Push V onto the current path and block it so it is not revisited while
  // it is on the stack (Johnson's algorithm).
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Bound the amount of work: stop once too many paths were explored.
    if (NumPaths > MaxPaths)
      break;
    // Only consider nodes >= S so each circuit is discovered exactly once.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node. Record the path unless it
      // was reached through a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
      F = true;
      ++NumPaths;
      break;
    } else if (!Blocked.test(W)) {
      // Recurse; an edge that goes backwards in the topological order marks
      // the path as containing a back-edge.
      if (circuit(W, S, NodeSets,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V was found: remember V in the B-sets of its
    // successors so it is unblocked if one of them is unblocked later.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1685
/// Unblock a node in the circuit finding algorithm.
void SwingSchedulerDAG::Circuits::unblock(int U) {
  Blocked.reset(U);
  // Transitively unblock every node that was recorded as waiting on U.
  while (!BU.empty()) {
    assert(SI != BU.end() && "Invalid B set.");
    SUnit *W = *SI;
    BU.erase(W);
    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
  }
}
1699
/// Identify all the elementary circuits in the dependence graph using
/// Johnson's circuit algorithm.
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
  // but we do this to find the circuits, and then change them back.

  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
  Cir.createAdjacencyStructure(this);
  // Run the circuit search once with each node as the start node.
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Cir.reset();
    Cir.circuit(i, i, NodeSets);
  }

  // Change the dependences back so that we've created a DAG again.
}
1718
// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
// is loop-carried to the USE in next iteration. This will help pipeliner avoid
// additional copies that are needed across iterations. An artificial dependence
// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.

// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
// PHI-------True-Dep------> USEOfPhi

// The mutation creates
// USEOfPHI -------Artificial-Dep---> SRCOfCopy

// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
// (since USE is a predecessor), implies, the COPY/REG_SEQUENCE is scheduled
// late to avoid additional copies across iterations. The possible scheduling
// order would be
// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    // The mutation only applies when both a loop-carried PHI and a source
    // instruction were found.
    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          // Chase chained PHIs/REG_SEQUENCEs through to the real uses.
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
1798
1799/// Return true for DAG nodes that we ignore when computing the cost functions.
1800/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1801/// in the calculation of the ASAP, ALAP, etc functions.
1802static bool ignoreDependence(const SDep &D, bool isPred) {
1803 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1804 return true;
1805 return D.getKind() == SDep::Anti && isPred;
1806}
1807
1808/// Compute several functions need to order the nodes for scheduling.
1809/// ASAP - Earliest time to schedule a node.
1810/// ALAP - Latest time to schedule a node.
1811/// MOV - Mobility function, difference between ALAP and ASAP.
1812/// D - Depth of each node.
1813/// H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth.
  // Walk in topological order so each predecessor's values are final before
  // they are read.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &P : SU->Preds) {
      SUnit *pred = P.getSUnit();
      if (P.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
      if (ignoreDependence(P, true))
        continue;
      // Loop-carried edges are discounted by one II per iteration of
      // distance.
      asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
                                  getDistance(pred, SU, P) * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV.
  // Walk in reverse topological order so successor values are final.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &S : SU->Succs) {
      SUnit *succ = S.getSUnit();
      if (succ->isBoundaryNode())
        continue;
      if (S.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
      if (ignoreDependence(S, true))
        continue;
      alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
                                  getDistance(SU, succ, S) * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
1884
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if the resulting set is non-empty.
                   const NodeSet *S = nullptr) {
  Preds.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Pred : SU->Preds) {
      // Optionally restrict the result to members of node-set S.
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Pred, true))
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Preds.insert(Pred.getSUnit());
    }
    // Back-edges are predecessors with an anti-dependence.
    for (const SDep &Succ : SU->Succs) {
      if (Succ.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Preds.insert(Succ.getSUnit());
    }
  }
  return !Preds.empty();
}
1913
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if the resulting set is non-empty.
                   const NodeSet *S = nullptr) {
  Succs.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Succ : SU->Succs) {
      // Optionally restrict the result to members of node-set S.
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Succ, false))
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Succs.insert(Succ.getSUnit());
    }
    // An anti-dependence predecessor is a back-edge, i.e. a successor.
    for (const SDep &Pred : SU->Preds) {
      if (Pred.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Succs.insert(Pred.getSUnit());
    }
  }
  return !Succs.empty();
}
1941
1942/// Return true if there is a path from the specified node to any of the nodes
1943/// in DestNodes. Keep track and return the nodes in any path.
1944static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1945 SetVector<SUnit *> &DestNodes,
1946 SetVector<SUnit *> &Exclude,
1947 SmallPtrSet<SUnit *, 8> &Visited) {
1948 if (Cur->isBoundaryNode())
1949 return false;
1950 if (Exclude.contains(Cur))
1951 return false;
1952 if (DestNodes.contains(Cur))
1953 return true;
1954 if (!Visited.insert(Cur).second)
1955 return Path.contains(Cur);
1956 bool FoundPath = false;
1957 for (auto &SI : Cur->Succs)
1958 if (!ignoreDependence(SI, false))
1959 FoundPath |=
1960 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1961 for (auto &PI : Cur->Preds)
1962 if (PI.getKind() == SDep::Anti)
1963 FoundPath |=
1964 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1965 if (FoundPath)
1966 Path.insert(Cur);
1967 return FoundPath;
1968}
1969
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
                            NodeSet &NS) {
  // Collect every register (virtual registers, or register units of
  // allocatable physical registers) used by a non-phi instruction in the set.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
          Uses.insert(Unit);
    }
  }
  // Any non-dead definition that is never used inside the set is live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.push_back(
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
2009
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Visit the node-set's instructions in decreasing NodeNum order.
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        // Mark the node where the pressure first exceeds the limit and stop.
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
2053
2054/// A heuristic to colocate node sets that have the same set of
2055/// successors.
2056void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
// Each matched pair of node-sets receives the same (non-zero) colocation
// tag; the counter produces a fresh tag per pair.
2057 unsigned Colocate = 0;
2058 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2059 NodeSet &N1 = NodeSets[i];
// NOTE(review): the declarations of 'S1' and 'S2' (the successor sets
// filled in by succ_L) are on lines not visible in this capture.
2061 if (N1.empty() || !succ_L(N1, S1))
2062 continue;
2063 for (int j = i + 1; j < e; ++j) {
2064 NodeSet &N2 = NodeSets[j];
// Only node-sets with equal recurrence MII are candidates for colocation.
2065 if (N1.compareRecMII(N2) != 0)
2066 continue;
2068 if (N2.empty() || !succ_L(N2, S2))
2069 continue;
// Subset test plus equal size means the two successor sets are identical.
2070 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2071 N1.setColocate(++Colocate);
2072 N2.setColocate(Colocate);
2073 break;
2074 }
2075 }
2076 }
2077}
2078
2079/// Check if the existing node-sets are profitable. If not, then ignore the
2080/// recurrent node-sets, and attempt to schedule all nodes together. This is
2081/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2082/// then it's best to try to schedule all instructions together instead of
2083/// starting with the recurrent node-sets.
2084void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2085 // Look for loops with a large MII.
2086 if (MII < 17)
2087 return;
2088 // Check if the node-set contains only a simple add recurrence.
2089 for (auto &NS : NodeSets) {
2090 if (NS.getRecMII() > 2)
2091 return;
2092 if (NS.getMaxDepth() > MII)
2093 return;
2094 }
2095 NodeSets.clear();
2096 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2097}
2098
2099/// Add the nodes that do not belong to a recurrence set into groups
2100/// based upon connected components.
2101void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2102 SetVector<SUnit *> NodesAdded;
// NOTE(review): the declarations of 'Visited', 'Path', and 'N' used below
// are on lines not visible in this capture.
2104 // Add the nodes that are on a path between the previous node sets and
2105 // the current node set.
2106 for (NodeSet &I : NodeSets) {
2108 // Add the nodes from the current node set to the previous node set.
2109 if (succ_L(I, N)) {
2111 for (SUnit *NI : N) {
2112 Visited.clear();
2113 computePath(NI, Path, NodesAdded, I, Visited);
2114 }
2115 if (!Path.empty())
2116 I.insert(Path.begin(), Path.end());
2117 }
2118 // Add the nodes from the previous node set to the current node set.
2119 N.clear();
2120 if (succ_L(NodesAdded, N)) {
2122 for (SUnit *NI : N) {
2123 Visited.clear();
2124 computePath(NI, Path, I, NodesAdded, Visited);
2125 }
2126 if (!Path.empty())
2127 I.insert(Path.begin(), Path.end());
2128 }
// Everything handled so far counts as "previous" for the next node-set.
2129 NodesAdded.insert(I.begin(), I.end());
2130 }
2131
2132 // Create a new node set with the connected nodes of any successor of a node
2133 // in a recurrent set.
2134 NodeSet NewSet;
2136 if (succ_L(NodesAdded, N))
2137 for (SUnit *I : N)
2138 addConnectedNodes(I, NewSet, NodesAdded);
2139 if (!NewSet.empty())
2140 NodeSets.push_back(NewSet);
2141
2142 // Create a new node set with the connected nodes of any predecessor of a node
2143 // in a recurrent set.
2144 NewSet.clear();
2145 if (pred_L(NodesAdded, N))
2146 for (SUnit *I : N)
2147 addConnectedNodes(I, NewSet, NodesAdded);
2148 if (!NewSet.empty())
2149 NodeSets.push_back(NewSet);
2150
2151 // Create new nodes sets with the connected nodes any remaining node that
2152 // has no predecessor.
2153 for (SUnit &SU : SUnits) {
2154 if (NodesAdded.count(&SU) == 0) {
2155 NewSet.clear();
2156 addConnectedNodes(&SU, NewSet, NodesAdded);
2157 if (!NewSet.empty())
2158 NodeSets.push_back(NewSet);
2159 }
2160 }
2161}
2162
2163/// Add the node to the set, and add all of its connected nodes to the set.
2164void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2165 SetVector<SUnit *> &NodesAdded) {
2166 NewSet.insert(SU);
2167 NodesAdded.insert(SU);
2168 for (auto &SI : SU->Succs) {
2169 SUnit *Successor = SI.getSUnit();
2170 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2171 NodesAdded.count(Successor) == 0)
2172 addConnectedNodes(Successor, NewSet, NodesAdded);
2173 }
2174 for (auto &PI : SU->Preds) {
2175 SUnit *Predecessor = PI.getSUnit();
2176 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2177 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2178 }
2179}
2180
2181/// Return true if Set1 contains elements in Set2. The elements in common
2182/// are returned in a different container.
// NOTE(review): the third parameter ('Result', the output container) is
// declared on a continuation line not visible in this capture.
2183static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
// Rebuild 'Result' from scratch with the intersection of the two sets.
2185 Result.clear();
2186 for (SUnit *SU : Set1) {
2187 if (Set2.count(SU) != 0)
2188 Result.insert(SU);
2189 }
2190 return !Result.empty();
2191}
2192
2193/// Merge the recurrence node sets that have the same initial node.
2194void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2195 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2196 ++I) {
2197 NodeSet &NI = *I;
2198 for (NodeSetType::iterator J = I + 1; J != E;) {
2199 NodeSet &NJ = *J;
// Two node-sets are fused when they start at the same node; the fused set
// keeps the larger recurrence MII of the two.
2200 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2201 if (NJ.compareRecMII(NI) > 0)
2202 NI.setRecMII(NJ.getRecMII());
2203 for (SUnit *SU : *J)
2204 I->insert(SU);
// After the erase, J denotes the next element; refresh the end iterator,
// which the erase invalidated. J itself is deliberately not advanced.
2205 NodeSets.erase(J);
2206 E = NodeSets.end();
2207 } else {
2208 ++J;
2209 }
2210 }
2211 }
2212}
2213
2214/// Remove nodes that have been scheduled in previous NodeSets.
2215void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2216 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2217 ++I)
2218 for (NodeSetType::iterator J = I + 1; J != E;) {
2219 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2220
2221 if (J->empty()) {
2222 NodeSets.erase(J);
2223 E = NodeSets.end();
2224 } else {
2225 ++J;
2226 }
2227 }
2228}
2229
2230/// Compute an ordered list of the dependence graph nodes, which
2231/// indicates the order that the nodes will be scheduled. This is a
2232/// two-level algorithm. First, a partial order is created, which
2233/// consists of a list of sets ordered from highest to lowest priority.
2234void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
// NOTE(review): the declarations of the worklist 'R' and the scratch set 'N'
// are on lines not visible in this capture.
2236 NodeOrder.clear();
2237
2238 for (auto &Nodes : NodeSets) {
2239 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2240 OrderKind Order;
// Seed the worklist R and pick a direction based on how this node-set
// relates to what has been ordered so far.
2242 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2243 R.insert(N.begin(), N.end());
2244 Order = BottomUp;
2245 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2246 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2247 R.insert(N.begin(), N.end());
2248 Order = TopDown;
2249 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2250 } else if (isIntersect(N, Nodes, R)) {
2251 // If some of the successors are in the existing node-set, then use the
2252 // top-down ordering.
2253 Order = TopDown;
2254 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2255 } else if (NodeSets.size() == 1) {
// A single node-set: start bottom-up from the nodes with no successors.
2256 for (const auto &N : Nodes)
2257 if (N->Succs.size() == 0)
2258 R.insert(N);
2259 Order = BottomUp;
2260 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2261 } else {
2262 // Find the node with the highest ASAP.
2263 SUnit *maxASAP = nullptr;
2264 for (SUnit *SU : Nodes) {
// Ties on ASAP are broken by the larger NodeNum.
2265 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2266 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2267 maxASAP = SU;
2268 }
2269 R.insert(maxASAP);
2270 Order = BottomUp;
2271 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2272 }
2273
// Alternate between top-down and bottom-up sweeps until the whole node-set
// has been placed into NodeOrder.
2274 while (!R.empty()) {
2275 if (Order == TopDown) {
2276 // Choose the node with the maximum height. If more than one, choose
2277 // the node with the maximum ZeroLatencyHeight. If still more than one,
2278 // choose the node with the lowest MOV.
2279 while (!R.empty()) {
2280 SUnit *maxHeight = nullptr;
2281 for (SUnit *I : R) {
2282 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2283 maxHeight = I;
// NOTE(review): the zero-latency-height comparison operands of these two
// conditions are on continuation lines not visible in this capture.
2284 else if (getHeight(I) == getHeight(maxHeight) &&
2286 maxHeight = I;
2287 else if (getHeight(I) == getHeight(maxHeight) &&
2289 getZeroLatencyHeight(maxHeight) &&
2290 getMOV(I) < getMOV(maxHeight))
2291 maxHeight = I;
2292 }
2293 NodeOrder.insert(maxHeight);
2294 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2295 R.remove(maxHeight);
// Expand the worklist with in-set successors not yet ordered.
2296 for (const auto &I : maxHeight->Succs) {
2297 if (Nodes.count(I.getSUnit()) == 0)
2298 continue;
2299 if (NodeOrder.contains(I.getSUnit()))
2300 continue;
2301 if (ignoreDependence(I, false))
2302 continue;
2303 R.insert(I.getSUnit());
2304 }
2305 // Back-edges are predecessors with an anti-dependence.
2306 for (const auto &I : maxHeight->Preds) {
2307 if (I.getKind() != SDep::Anti)
2308 continue;
2309 if (Nodes.count(I.getSUnit()) == 0)
2310 continue;
2311 if (NodeOrder.contains(I.getSUnit()))
2312 continue;
2313 R.insert(I.getSUnit());
2314 }
2315 }
2316 Order = BottomUp;
2317 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2319 if (pred_L(NodeOrder, N, &Nodes))
2320 R.insert(N.begin(), N.end());
2321 } else {
2322 // Choose the node with the maximum depth. If more than one, choose
2323 // the node with the maximum ZeroLatencyDepth. If still more than one,
2324 // choose the node with the lowest MOV.
2325 while (!R.empty()) {
2326 SUnit *maxDepth = nullptr;
2327 for (SUnit *I : R) {
2328 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2329 maxDepth = I;
// NOTE(review): as above, the zero-latency-depth comparison operands are on
// continuation lines not visible in this capture.
2330 else if (getDepth(I) == getDepth(maxDepth) &&
2332 maxDepth = I;
2333 else if (getDepth(I) == getDepth(maxDepth) &&
2335 getMOV(I) < getMOV(maxDepth))
2336 maxDepth = I;
2337 }
2338 NodeOrder.insert(maxDepth);
2339 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2340 R.remove(maxDepth);
// If this node exceeded register pressure (see registerPressureFilter),
// restart this node-set top-down from its first node.
2341 if (Nodes.isExceedSU(maxDepth)) {
2342 Order = TopDown;
2343 R.clear();
2344 R.insert(Nodes.getNode(0));
2345 break;
2346 }
2347 for (const auto &I : maxDepth->Preds) {
2348 if (Nodes.count(I.getSUnit()) == 0)
2349 continue;
2350 if (NodeOrder.contains(I.getSUnit()))
2351 continue;
2352 R.insert(I.getSUnit());
2353 }
2354 // Back-edges are predecessors with an anti-dependence.
2355 for (const auto &I : maxDepth->Succs) {
2356 if (I.getKind() != SDep::Anti)
2357 continue;
2358 if (Nodes.count(I.getSUnit()) == 0)
2359 continue;
2360 if (NodeOrder.contains(I.getSUnit()))
2361 continue;
2362 R.insert(I.getSUnit());
2363 }
2364 }
2365 Order = TopDown;
2366 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2368 if (succ_L(NodeOrder, N, &Nodes))
2369 R.insert(N.begin(), N.end());
2370 }
2371 }
2372 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2373 }
2374
2375 LLVM_DEBUG({
2376 dbgs() << "Node order: ";
2377 for (SUnit *I : NodeOrder)
2378 dbgs() << " " << I->NodeNum << " ";
2379 dbgs() << "\n";
2380 });
2381}
2382
2383/// Process the nodes in the computed order and create the pipelined schedule
2384/// of the instructions, if possible. Return true if a schedule is found.
2385bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2386
2387 if (NodeOrder.empty()){
2388 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2389 return false;
2390 }
2391
2392 bool scheduleFound = false;
// Optional register-pressure veto, built once and reused for every II tried.
2393 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2394 if (LimitRegPressure) {
2395 HRPDetector =
2396 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2397 HRPDetector->init(RegClassInfo);
2398 }
2399 // Keep increasing II until a valid schedule is found.
2400 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2401 Schedule.reset();
2402 Schedule.setInitiationInterval(II);
2403 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2404
// Place nodes one at a time, in the precomputed node order.
2405 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2406 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2407 do {
2408 SUnit *SU = *NI;
2409
2410 // Compute the schedule time for the instruction, which is based
2411 // upon the scheduled time for any predecessors/successors.
2412 int EarlyStart = INT_MIN;
2413 int LateStart = INT_MAX;
2414 // These values are set when the size of the schedule window is limited
2415 // due to chain dependences.
2416 int SchedEnd = INT_MAX;
2417 int SchedStart = INT_MIN;
2418 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2419 II, this);
2420 LLVM_DEBUG({
2421 dbgs() << "\n";
2422 dbgs() << "Inst (" << SU->NodeNum << ") ";
2423 SU->getInstr()->dump();
2424 dbgs() << "\n";
2425 });
2426 LLVM_DEBUG({
2427 dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2428 LateStart, SchedEnd, SchedStart);
2429 });
2430
// An empty or inverted window means this node cannot be placed at this II.
2431 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2432 SchedStart > LateStart)
2433 scheduleFound = false;
2434 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
// Only a lower bound is known: search forward from EarlyStart.
2435 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2436 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2437 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
// Only an upper bound is known: search backward from LateStart.
2438 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2439 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2440 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2441 SchedEnd =
2442 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2443 // When scheduling a Phi it is better to start at the late cycle and go
2444 // backwards. The default order may insert the Phi too far away from
2445 // its first dependence.
2446 if (SU->getInstr()->isPHI())
2447 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2448 else
2449 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2450 } else {
// No constraints from already-placed nodes: use the node's ASAP slot.
2451 int FirstCycle = Schedule.getFirstCycle();
2452 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2453 FirstCycle + getASAP(SU) + II - 1, II);
2454 }
2455 // Even if we find a schedule, make sure the schedule doesn't exceed the
2456 // allowable number of stages. We keep trying if this happens.
2457 if (scheduleFound)
2458 if (SwpMaxStages > -1 &&
2459 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2460 scheduleFound = false;
2461
2462 LLVM_DEBUG({
2463 if (!scheduleFound)
2464 dbgs() << "\tCan't schedule\n";
2465 });
2466 } while (++NI != NE && scheduleFound);
2467
2468 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2469 if (scheduleFound)
2470 scheduleFound =
2471 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2472
2473 // If a schedule is found, check if it is a valid schedule too.
2474 if (scheduleFound)
2475 scheduleFound = Schedule.isValidSchedule(this);
2476
2477 // If a schedule was found and the option is enabled, check if the schedule
2478 // might generate additional register spills/fills.
2479 if (scheduleFound && LimitRegPressure)
2480 scheduleFound =
2481 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2482 }
2483
2484 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2485 << " (II=" << Schedule.getInitiationInterval()
2486 << ")\n");
2487
// The target gets a final veto over the schedule.
2488 if (scheduleFound) {
2489 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2490 if (!scheduleFound)
2491 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2492 }
2493
2494 if (scheduleFound) {
2495 Schedule.finalizeSchedule(this);
2496 Pass.ORE->emit([&]() {
// NOTE(review): the remark constructor line preceding these stream
// operations is not visible in this capture.
2498 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2499 << "Schedule found with Initiation Interval: "
2500 << ore::NV("II", Schedule.getInitiationInterval())
2501 << ", MaxStageCount: "
2502 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2503 });
2504 } else
2505 Schedule.reset();
2506
// A schedule with no overlap between iterations (max stage 0) is reported
// as not found — pipelining would provide no benefit.
2507 return scheduleFound && Schedule.getMaxStageCount() > 0;
2508}
2509
2510/// Return true if we can compute the amount the instruction changes
2511/// during each iteration. Set Delta to the amount of the change.
2512bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
// NOTE(review): a declaration line (presumably obtaining 'TRI') preceding
// these locals is not visible in this capture.
2514 const MachineOperand *BaseOp;
2515 int64_t Offset;
2516 bool OffsetIsScalable;
2517 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2518 return false;
2519
2520 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2521 if (OffsetIsScalable)
2522 return false;
2523
2524 if (!BaseOp->isReg())
2525 return false;
2526
2527 Register BaseReg = BaseOp->getReg();
2528
2530 // Check if there is a Phi. If so, get the definition in the loop.
2531 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2532 if (BaseDef && BaseDef->isPHI()) {
2533 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2534 BaseDef = MRI.getVRegDef(BaseReg);
2535 }
2536 if (!BaseDef)
2537 return false;
2538
2539 int D = 0;
// NOTE(review): the '&&' here looks suspicious — when getIncrementValue
// fails with D < 0 the function falls through and reports Delta = 0 as a
// success. Confirm the intended condition before relying on this.
2540 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2541 return false;
2542
2543 Delta = D;
2544 return true;
2545}
2546
2547/// Check if we can change the instruction to use an offset value from the
2548/// previous iteration. If so, return true and set the base and offset values
2549/// so that we can rewrite the load, if necessary.
2550/// v1 = Phi(v0, v3)
2551/// v2 = load v1, 0
2552/// v3 = post_store v1, 4, x
2553/// This function enables the load to be rewritten as v2 = load v3, 4.
2554bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2555 unsigned &BasePos,
2556 unsigned &OffsetPos,
2557 unsigned &NewBase,
2558 int64_t &Offset) {
2559 // Get the load instruction.
2560 if (TII->isPostIncrement(*MI))
2561 return false;
2562 unsigned BasePosLd, OffsetPosLd;
2563 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2564 return false;
2565 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2566
2567 // Look for the Phi instruction.
2568 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2569 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2570 if (!Phi || !Phi->isPHI())
2571 return false;
2572 // Get the register defined in the loop block.
2573 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2574 if (!PrevReg)
2575 return false;
2576
2577 // Check for the post-increment load/store instruction.
2578 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2579 if (!PrevDef || PrevDef == MI)
2580 return false;
2581
2582 if (!TII->isPostIncrement(*PrevDef))
2583 return false;
2584
2585 unsigned BasePos1 = 0, OffsetPos1 = 0;
2586 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2587 return false;
2588
2589 // Make sure that the instructions do not access the same memory location in
2590 // the next iteration.
2591 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2592 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
// NOTE(review): the line cloning MI into 'NewMI' is not visible in this
// capture. The clone is given the combined offset, probed for disjointness
// against the post-increment instruction, and then deleted.
2594 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2595 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2596 MF.deleteMachineInstr(NewMI);
2597 if (!Disjoint)
2598 return false;
2599
2600 // Set the return value once we determine that we return true.
2601 BasePos = BasePosLd;
2602 OffsetPos = OffsetPosLd;
2603 NewBase = PrevReg;
2604 Offset = StoreOffset;
2605 return true;
2606}
2607
2608/// Apply changes to the instruction if needed. The changes are needed
2609/// to improve the scheduling and depend upon the final schedule.
// NOTE(review): the function header line (the MachineInstr *MI parameter)
// and the declaration of the map iterator 'It' are not visible in this
// capture.
2611 SMSchedule &Schedule) {
2612 SUnit *SU = getSUnit(MI);
2614 InstrChanges.find(SU);
2615 if (It != InstrChanges.end()) {
// The recorded change is a (replacement base register, per-stage offset
// adjustment) pair computed earlier.
2616 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2617 unsigned BasePos, OffsetPos;
2618 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2619 return;
2620 Register BaseReg = MI->getOperand(BasePos).getReg();
2621 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2622 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2623 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2624 int BaseStageNum = Schedule.stageScheduled(SU);
2625 int BaseCycleNum = Schedule.cycleScheduled(SU);
// Only rewrite when the use was scheduled in an earlier stage than the
// definition of its base register.
2626 if (BaseStageNum < DefStageNum) {
// NOTE(review): the line cloning MI into 'NewMI' is not visible in this
// capture.
2628 int OffsetDiff = DefStageNum - BaseStageNum;
2629 if (DefCycleNum < BaseCycleNum) {
2630 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2631 if (OffsetDiff > 0)
2632 --OffsetDiff;
2633 }
2634 int64_t NewOffset =
2635 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2636 NewMI->getOperand(OffsetPos).setImm(NewOffset);
// Swap the rewritten instruction into the SUnit and remember the mapping
// so the old instruction can be replaced/cleaned up later.
2637 SU->setInstr(NewMI);
2638 MISUnitMap[NewMI] = SU;
2639 NewMIs[MI] = NewMI;
2640 }
2641 }
2642}
2643
2644/// Return the instruction in the loop that defines the register.
2645/// If the definition is a Phi, then follow the Phi operand to
2646/// the instruction in the loop.
2647MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
// NOTE(review): the declaration of 'Visited' (a set guarding against Phi
// cycles) is on a line not visible in this capture.
2649 MachineInstr *Def = MRI.getVRegDef(Reg);
2650 while (Def->isPHI()) {
// Stop if this Phi was seen already (cyclic Phi chain).
2651 if (!Visited.insert(Def).second)
2652 break;
// Phi operands come in (value, predecessor-block) pairs starting at
// operand 1; follow the value coming from the loop block BB.
2653 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2654 if (Def->getOperand(i + 1).getMBB() == BB) {
2655 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2656 break;
2657 }
2658 }
2659 return Def;
2660}
2661
2662/// Return true for an order or output dependence that is loop carried
2663/// potentially. A dependence is loop carried if the destination defines a value
2664/// that may be used or defined by the source in a subsequent iteration.
// NOTE(review): the function header line (Source SUnit and Dep parameters)
// is not visible in this capture.
2666 bool isSucc) {
// Only order/output dependences between real (non-artificial, non-boundary)
// nodes can be loop carried in this sense.
2667 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2668 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2669 return false;
2670
// NOTE(review): the condition guarding this early 'return true' is on a
// line not visible in this capture (presumably an option check disabling
// the pruning below).
2672 return true;
2673
2674 if (Dep.getKind() == SDep::Output)
2675 return true;
2676
2677 MachineInstr *SI = Source->getInstr();
2678 MachineInstr *DI = Dep.getSUnit()->getInstr();
// Normalize so SI is the dependence source and DI the destination.
2679 if (!isSucc)
2680 std::swap(SI, DI);
2681 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2682
2683 // Assume ordered loads and stores may have a loop carried dependence.
2684 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2685 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2686 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2687 return true;
2688
// Non-memory order dependences cannot be loop carried here.
2690 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2691 return false;
2692
2693 // The conservative assumption is that a dependence between memory operations
2694 // may be loop carried. The following code checks when it can be proved that
2695 // there is no loop carried dependence.
2696 unsigned DeltaS, DeltaD;
2697 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2698 return true;
2699
2700 const MachineOperand *BaseOpS, *BaseOpD;
2701 int64_t OffsetS, OffsetD;
2702 bool OffsetSIsScalable, OffsetDIsScalable;
2704 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2705 TRI) ||
2706 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2707 TRI))
2708 return true;
2709
2710 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2711 "Expected offsets to be byte offsets");
2712
// Both base registers must be defined by Phis whose initial (preheader)
// values are provably identical.
2713 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2714 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2715 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2716 return true;
2717
2718 unsigned InitValS = 0;
2719 unsigned LoopValS = 0;
2720 unsigned InitValD = 0;
2721 unsigned LoopValD = 0;
2722 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2723 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2724 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2725 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2726
2727 if (!InitDefS->isIdenticalTo(*InitDefD))
2728 return true;
2729
2730 // Check that the base register is incremented by a constant value for each
2731 // iteration.
2732 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2733 int D = 0;
2734 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2735 return true;
2736
2737 LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2738 LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2739
2740 // This is the main test, which checks the offset values and the loop
2741 // increment value to determine if the accesses may be loop carried.
2742 if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2743 return true;
2744
// Both accesses must advance by the same stride, and the stride must be at
// least as large as each access so iterations cannot overlap.
2745 if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2746 DeltaD < AccessSizeD.getValue())
2747 return true;
2748
2749 return (OffsetS + (int64_t)AccessSizeS.getValue() <
2750 OffsetD + (int64_t)AccessSizeD.getValue());
2751}
2751
2752void SwingSchedulerDAG::postProcessDAG() {
2753 for (auto &M : Mutations)
2754 M->apply(this);
2755}
2756
2757/// Try to schedule the node at the specified StartCycle and continue
2758/// until the node is schedule or the EndCycle is reached. This function
2759/// returns true if the node is scheduled. This routine may search either
2760/// forward or backward for a place to insert the instruction based upon
2761/// the relative values of StartCycle and EndCycle.
2762bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2763 bool forward = true;
2764 LLVM_DEBUG({
2765 dbgs() << "Trying to insert node between " << StartCycle << " and "
2766 << EndCycle << " II: " << II << "\n";
2767 });
2768 if (StartCycle > EndCycle)
2769 forward = false;
2770
2771 // The terminating condition depends on the direction.
2772 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2773 for (int curCycle = StartCycle; curCycle != termCycle;
2774 forward ? ++curCycle : --curCycle) {
2775
2776 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2777 ProcItinResources.canReserveResources(*SU, curCycle)) {
2778 LLVM_DEBUG({
2779 dbgs() << "\tinsert at cycle " << curCycle << " ";
2780 SU->getInstr()->dump();
2781 });
2782
2783 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2784 ProcItinResources.reserveResources(*SU, curCycle);
2785 ScheduledInstrs[curCycle].push_back(SU);
2786 InstrToCycle.insert(std::make_pair(SU, curCycle));
2787 if (curCycle > LastCycle)
2788 LastCycle = curCycle;
2789 if (curCycle < FirstCycle)
2790 FirstCycle = curCycle;
2791 return true;
2792 }
2793 LLVM_DEBUG({
2794 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2795 SU->getInstr()->dump();
2796 });
2797 }
2798 return false;
2799}
2800
2801// Return the cycle of the earliest scheduled instruction in the chain.
// NOTE(review): the function signature line (taking the starting SDep) and
// the declaration of 'Visited' are not visible in this capture.
2804 SmallVector<SDep, 8> Worklist;
2805 Worklist.push_back(Dep);
2806 int EarlyCycle = INT_MAX;
// Walk the chain backwards through Order/Output predecessor edges, taking
// the minimum scheduled cycle over every node reached.
2807 while (!Worklist.empty()) {
2808 const SDep &Cur = Worklist.pop_back_val();
2809 SUnit *PrevSU = Cur.getSUnit();
2810 if (Visited.count(PrevSU))
2811 continue;
// Unscheduled nodes contribute nothing (and their chains are not followed).
2812 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2813 if (it == InstrToCycle.end())
2814 continue;
2815 EarlyCycle = std::min(EarlyCycle, it->second);
2816 for (const auto &PI : PrevSU->Preds)
2817 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2818 Worklist.push_back(PI);
2819 Visited.insert(PrevSU);
2820 }
2821 return EarlyCycle;
2822}
2823
2824// Return the cycle of the latest scheduled instruction in the chain.
// NOTE(review): the function signature line (taking the starting SDep) and
// the declaration of 'Visited' are not visible in this capture.
2827 SmallVector<SDep, 8> Worklist;
2828 Worklist.push_back(Dep);
2829 int LateCycle = INT_MIN;
// Mirror of earliestCycleInChain: walk forward through Order/Output
// successor edges, taking the maximum scheduled cycle reached.
2830 while (!Worklist.empty()) {
2831 const SDep &Cur = Worklist.pop_back_val();
2832 SUnit *SuccSU = Cur.getSUnit();
2833 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2834 continue;
2835 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2836 if (it == InstrToCycle.end())
2837 continue;
2838 LateCycle = std::max(LateCycle, it->second);
2839 for (const auto &SI : SuccSU->Succs)
2840 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2841 Worklist.push_back(SI);
2842 Visited.insert(SuccSU);
2843 }
2844 return LateCycle;
2845}
2846
2847/// If an instruction has a use that spans multiple iterations, then
2848/// return true. These instructions are characterized by having a back-ege
2849/// to a Phi, which contains a reference to another Phi.
// NOTE(review): the function signature line (taking SU and the DAG) is not
// visible in this capture. Despite the "return true" wording above, the
// visible body returns the predecessor SUnit or nullptr.
2851 for (auto &P : SU->Preds)
2852 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2853 for (auto &S : P.getSUnit()->Succs)
2854 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2855 return P.getSUnit();
2856 return nullptr;
2857}
2858
2859/// Compute the scheduling start slot for the instruction. The start slot
2860/// depends on any predecessor or successor nodes scheduled already.
2861void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2862 int *MinEnd, int *MaxStart, int II,
2863 SwingSchedulerDAG *DAG) {
2864 // Iterate over each instruction that has been scheduled already. The start
2865 // slot computation depends on whether the previously scheduled instruction
2866 // is a predecessor or successor of the specified instruction.
2867 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2868
2869 // Iterate over each instruction in the current cycle.
2870 for (SUnit *I : getInstructions(cycle)) {
2871 // Because we're processing a DAG for the dependences, we recognize
2872 // the back-edge in recurrences by anti dependences.
2873 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2874 const SDep &Dep = SU->Preds[i];
2875 if (Dep.getSUnit() == I) {
2876 if (!DAG->isBackedge(SU, Dep)) {
// A forward edge from a scheduled predecessor raises the earliest slot;
// the dependence distance is discounted by whole iterations (II each).
2877 int EarlyStart = cycle + Dep.getLatency() -
2878 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2879 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
// A loop-carried order/output chain additionally caps the window's end.
2880 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2881 int End = earliestCycleInChain(Dep) + (II - 1);
2882 *MinEnd = std::min(*MinEnd, End);
2883 }
2884 } else {
// A back-edge to a scheduled predecessor lowers the latest slot instead.
2885 int LateStart = cycle - Dep.getLatency() +
2886 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2887 *MinLateStart = std::min(*MinLateStart, LateStart);
2888 }
2889 }
2890 // For instruction that requires multiple iterations, make sure that
2891 // the dependent instruction is not scheduled past the definition.
2892 SUnit *BE = multipleIterations(I, DAG);
2893 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2894 !SU->isPred(I))
2895 *MinLateStart = std::min(*MinLateStart, cycle);
2896 }
// Symmetric handling for successor edges: forward edges tighten the latest
// slot, back-edges tighten the earliest slot.
2897 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2898 if (SU->Succs[i].getSUnit() == I) {
2899 const SDep &Dep = SU->Succs[i];
2900 if (!DAG->isBackedge(SU, Dep)) {
2901 int LateStart = cycle - Dep.getLatency() +
2902 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2903 *MinLateStart = std::min(*MinLateStart, LateStart);
2904 if (DAG->isLoopCarriedDep(SU, Dep)) {
2905 int Start = latestCycleInChain(Dep) + 1 - II;
2906 *MaxStart = std::max(*MaxStart, Start);
2907 }
2908 } else {
2909 int EarlyStart = cycle + Dep.getLatency() -
2910 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2911 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2912 }
2913 }
2914 }
2915 }
2916 }
2917}
2918
2919/// Order the instructions within a cycle so that the definitions occur
2920/// before the uses. Returns true if the instruction is added to the start
2921/// of the list, or false if added to the end.
2923 std::deque<SUnit *> &Insts) const {
2924 MachineInstr *MI = SU->getInstr();
2925 bool OrderBeforeUse = false;
2926 bool OrderAfterDef = false;
2927 bool OrderBeforeDef = false;
2928 unsigned MoveDef = 0;
2929 unsigned MoveUse = 0;
2930 int StageInst1 = stageScheduled(SU);
2931
2932 unsigned Pos = 0;
2933 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2934 ++I, ++Pos) {
2935 for (MachineOperand &MO : MI->operands()) {
2936 if (!MO.isReg() || !MO.getReg().isVirtual())
2937 continue;
2938
2939 Register Reg = MO.getReg();
2940 unsigned BasePos, OffsetPos;
2941 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2942 if (MI->getOperand(BasePos).getReg() == Reg)
2943 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2944 Reg = NewReg;
2945 bool Reads, Writes;
2946 std::tie(Reads, Writes) =
2947 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2948 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2949 OrderBeforeUse = true;
2950 if (MoveUse == 0)
2951 MoveUse = Pos;
2952 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2953 // Add the instruction after the scheduled instruction.
2954 OrderAfterDef = true;
2955 MoveDef = Pos;
2956 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2957 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2958 OrderBeforeUse = true;
2959 if (MoveUse == 0)
2960 MoveUse = Pos;
2961 } else {
2962 OrderAfterDef = true;
2963 MoveDef = Pos;
2964 }
2965 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2966 OrderBeforeUse = true;
2967 if (MoveUse == 0)
2968 MoveUse = Pos;
2969 if (MoveUse != 0) {
2970 OrderAfterDef = true;
2971 MoveDef = Pos - 1;
2972 }
2973 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2974 // Add the instruction before the scheduled instruction.
2975 OrderBeforeUse = true;
2976 if (MoveUse == 0)
2977 MoveUse = Pos;
2978 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2979 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2980 if (MoveUse == 0) {
2981 OrderBeforeDef = true;
2982 MoveUse = Pos;
2983 }
2984 }
2985 }
2986 // Check for order dependences between instructions. Make sure the source
2987 // is ordered before the destination.
2988 for (auto &S : SU->Succs) {
2989 if (S.getSUnit() != *I)
2990 continue;
2991 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2992 OrderBeforeUse = true;
2993 if (Pos < MoveUse)
2994 MoveUse = Pos;
2995 }
2996 // We did not handle HW dependences in previous for loop,
2997 // and we normally set Latency = 0 for Anti deps,
2998 // so may have nodes in same cycle with Anti denpendent on HW regs.
2999 else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
3000 OrderBeforeUse = true;
3001 if ((MoveUse == 0) || (Pos < MoveUse))
3002 MoveUse = Pos;
3003 }
3004 }
3005 for (auto &P : SU->Preds) {
3006 if (P.getSUnit() != *I)
3007 continue;
3008 if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3009 OrderAfterDef = true;
3010 MoveDef = Pos;
3011 }
3012 }
3013 }
3014
3015 // A circular dependence.
3016 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3017 OrderBeforeUse = false;
3018
3019 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3020 // to a loop-carried dependence.
3021 if (OrderBeforeDef)
3022 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3023
3024 // The uncommon case when the instruction order needs to be updated because
3025 // there is both a use and def.
3026 if (OrderBeforeUse && OrderAfterDef) {
3027 SUnit *UseSU = Insts.at(MoveUse);
3028 SUnit *DefSU = Insts.at(MoveDef);
3029 if (MoveUse > MoveDef) {
3030 Insts.erase(Insts.begin() + MoveUse);
3031 Insts.erase(Insts.begin() + MoveDef);
3032 } else {
3033 Insts.erase(Insts.begin() + MoveDef);
3034 Insts.erase(Insts.begin() + MoveUse);
3035 }
3036 orderDependence(SSD, UseSU, Insts);
3037 orderDependence(SSD, SU, Insts);
3038 orderDependence(SSD, DefSU, Insts);
3039 return;
3040 }
3041 // Put the new instruction first if there is a use in the list. Otherwise,
3042 // put it at the end of the list.
3043 if (OrderBeforeUse)
3044 Insts.push_front(SU);
3045 else
3046 Insts.push_back(SU);
3047}
3048
3049/// Return true if the scheduled Phi has a loop carried operand.
3051 MachineInstr &Phi) const {
3052 if (!Phi.isPHI())
3053 return false;
3054 assert(Phi.isPHI() && "Expecting a Phi.");
3055 SUnit *DefSU = SSD->getSUnit(&Phi);
3056 unsigned DefCycle = cycleScheduled(DefSU);
3057 int DefStage = stageScheduled(DefSU);
3058
3059 unsigned InitVal = 0;
3060 unsigned LoopVal = 0;
3061 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3062 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3063 if (!UseSU)
3064 return true;
3065 if (UseSU->getInstr()->isPHI())
3066 return true;
3067 unsigned LoopCycle = cycleScheduled(UseSU);
3068 int LoopStage = stageScheduled(UseSU);
3069 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3070}
3071
3072/// Return true if the instruction is a definition that is loop carried
3073/// and defines the use on the next iteration.
3074/// v1 = phi(v2, v3)
3075/// (Def) v3 = op v1
3076/// (MO) = v1
3077/// If MO appears before Def, then v1 and v3 may get assigned to the same
3078/// register.
3080 MachineInstr *Def,
3081 MachineOperand &MO) const {
3082 if (!MO.isReg())
3083 return false;
3084 if (Def->isPHI())
3085 return false;
3086 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3087 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3088 return false;
3089 if (!isLoopCarried(SSD, *Phi))
3090 return false;
3091 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3092 for (MachineOperand &DMO : Def->all_defs()) {
3093 if (DMO.getReg() == LoopReg)
3094 return true;
3095 }
3096 return false;
3097}
3098
3099/// Determine transitive dependences of unpipelineable instructions
3102 SmallSet<SUnit *, 8> DoNotPipeline;
3103 SmallVector<SUnit *, 8> Worklist;
3104
3105 for (auto &SU : SSD->SUnits)
3106 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3107 Worklist.push_back(&SU);
3108
3109 while (!Worklist.empty()) {
3110 auto SU = Worklist.pop_back_val();
3111 if (DoNotPipeline.count(SU))
3112 continue;
3113 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3114 DoNotPipeline.insert(SU);
3115 for (auto &Dep : SU->Preds)
3116 Worklist.push_back(Dep.getSUnit());
3117 if (SU->getInstr()->isPHI())
3118 for (auto &Dep : SU->Succs)
3119 if (Dep.getKind() == SDep::Anti)
3120 Worklist.push_back(Dep.getSUnit());
3121 }
3122 return DoNotPipeline;
3123}
3124
3125// Determine all instructions upon which any unpipelineable instruction depends
3126// and ensure that they are in stage 0. If unable to do so, return false.
3130
3131 int NewLastCycle = INT_MIN;
3132 for (SUnit &SU : SSD->SUnits) {
3133 if (!SU.isInstr())
3134 continue;
3135 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3136 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3137 continue;
3138 }
3139
3140 // Put the non-pipelined instruction as early as possible in the schedule
3141 int NewCycle = getFirstCycle();
3142 for (auto &Dep : SU.Preds)
3143 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3144
3145 int OldCycle = InstrToCycle[&SU];
3146 if (OldCycle != NewCycle) {
3147 InstrToCycle[&SU] = NewCycle;
3148 auto &OldS = getInstructions(OldCycle);
3149 llvm::erase(OldS, &SU);
3150 getInstructions(NewCycle).emplace_back(&SU);
3151 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3152 << ") is not pipelined; moving from cycle " << OldCycle
3153 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3154 }
3155 NewLastCycle = std::max(NewLastCycle, NewCycle);
3156 }
3157 LastCycle = NewLastCycle;
3158 return true;
3159}
3160
3161// Check if the generated schedule is valid. This function checks if
3162// an instruction that uses a physical register is scheduled in a
3163// different stage than the definition. The pipeliner does not handle
3164// physical register values that may cross a basic block boundary.
3165// Furthermore, if a physical def/use pair is assigned to the same
3166// cycle, orderDependence does not guarantee def/use ordering, so that
3167// case should be considered invalid. (The test checks for both
3168// earlier and same-cycle use to be more robust.)
3170 for (SUnit &SU : SSD->SUnits) {
3171 if (!SU.hasPhysRegDefs)
3172 continue;
3173 int StageDef = stageScheduled(&SU);
3174 int CycleDef = InstrToCycle[&SU];
3175 assert(StageDef != -1 && "Instruction should have been scheduled.");
3176 for (auto &SI : SU.Succs)
3177 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3178 if (Register::isPhysicalRegister(SI.getReg())) {
3179 if (stageScheduled(SI.getSUnit()) != StageDef)
3180 return false;
3181 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3182 return false;
3183 }
3184 }
3185 return true;
3186}
3187
3188/// A property of the node order in swing-modulo-scheduling is
3189/// that for nodes outside circuits the following holds:
3190/// none of them is scheduled after both a successor and a
3191/// predecessor.
3192/// The method below checks whether the property is met.
3193/// If not, debug information is printed and statistics information updated.
3194/// Note that we do not use an assert statement.
3195/// The reason is that although an invalid node oder may prevent
3196/// the pipeliner from finding a pipelined schedule for arbitrary II,
3197/// it does not lead to the generation of incorrect code.
3198void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3199
3200 // a sorted vector that maps each SUnit to its index in the NodeOrder
3201 typedef std::pair<SUnit *, unsigned> UnitIndex;
3202 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3203
3204 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3205 Indices.push_back(std::make_pair(NodeOrder[i], i));
3206
3207 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3208 return std::get<0>(i1) < std::get<0>(i2);
3209 };
3210
3211 // sort, so that we can perform a binary search
3212 llvm::sort(Indices, CompareKey);
3213
3214 bool Valid = true;
3215 (void)Valid;
3216 // for each SUnit in the NodeOrder, check whether
3217 // it appears after both a successor and a predecessor
3218 // of the SUnit. If this is the case, and the SUnit
3219 // is not part of circuit, then the NodeOrder is not
3220 // valid.
3221 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3222 SUnit *SU = NodeOrder[i];
3223 unsigned Index = i;
3224
3225 bool PredBefore = false;
3226 bool SuccBefore = false;
3227
3228 SUnit *Succ;
3229 SUnit *Pred;
3230 (void)Succ;
3231 (void)Pred;
3232
3233 for (SDep &PredEdge : SU->Preds) {
3234 SUnit *PredSU = PredEdge.getSUnit();
3235 unsigned PredIndex = std::get<1>(
3236 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3237 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3238 PredBefore = true;
3239 Pred = PredSU;
3240 break;
3241 }
3242 }
3243
3244 for (SDep &SuccEdge : SU->Succs) {
3245 SUnit *SuccSU = SuccEdge.getSUnit();
3246 // Do not process a boundary node, it was not included in NodeOrder,
3247 // hence not in Indices either, call to std::lower_bound() below will
3248 // return Indices.end().
3249 if (SuccSU->isBoundaryNode())
3250 continue;
3251 unsigned SuccIndex = std::get<1>(
3252 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3253 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3254 SuccBefore = true;
3255 Succ = SuccSU;
3256 break;
3257 }
3258 }
3259
3260 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3261 // instructions in circuits are allowed to be scheduled
3262 // after both a successor and predecessor.
3263 bool InCircuit = llvm::any_of(
3264 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3265 if (InCircuit)
3266 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3267 else {
3268 Valid = false;
3269 NumNodeOrderIssues++;
3270 LLVM_DEBUG(dbgs() << "Predecessor ";);
3271 }
3272 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3273 << " are scheduled before node " << SU->NodeNum
3274 << "\n";);
3275 }
3276 }
3277
3278 LLVM_DEBUG({
3279 if (!Valid)
3280 dbgs() << "Invalid node order found!\n";
3281 });
3282}
3283
3284/// Attempt to fix the degenerate cases when the instruction serialization
3285/// causes the register lifetimes to overlap. For example,
3286/// p' = store_pi(p, b)
3287/// = load p, offset
3288/// In this case p and p' overlap, which means that two registers are needed.
3289/// Instead, this function changes the load to use p' and updates the offset.
3290void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3291 unsigned OverlapReg = 0;
3292 unsigned NewBaseReg = 0;
3293 for (SUnit *SU : Instrs) {
3294 MachineInstr *MI = SU->getInstr();
3295 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3296 const MachineOperand &MO = MI->getOperand(i);
3297 // Look for an instruction that uses p. The instruction occurs in the
3298 // same cycle but occurs later in the serialized order.
3299 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3300 // Check that the instruction appears in the InstrChanges structure,
3301 // which contains instructions that can have the offset updated.
3303 InstrChanges.find(SU);
3304 if (It != InstrChanges.end()) {
3305 unsigned BasePos, OffsetPos;
3306 // Update the base register and adjust the offset.
3307 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3309 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3310 int64_t NewOffset =
3311 MI->getOperand(OffsetPos).getImm() - It->second.second;
3312 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3313 SU->setInstr(NewMI);
3314 MISUnitMap[NewMI] = SU;
3315 NewMIs[MI] = NewMI;
3316 }
3317 }
3318 OverlapReg = 0;
3319 NewBaseReg = 0;
3320 break;
3321 }
3322 // Look for an instruction of the form p' = op(p), which uses and defines
3323 // two virtual registers that get allocated to the same physical register.
3324 unsigned TiedUseIdx = 0;
3325 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3326 // OverlapReg is p in the example above.
3327 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3328 // NewBaseReg is p' in the example above.
3329 NewBaseReg = MI->getOperand(i).getReg();
3330 break;
3331 }
3332 }
3333 }
3334}
3335
3336std::deque<SUnit *>
3338 const std::deque<SUnit *> &Instrs) const {
3339 std::deque<SUnit *> NewOrderPhi;
3340 for (SUnit *SU : Instrs) {
3341 if (SU->getInstr()->isPHI())
3342 NewOrderPhi.push_back(SU);
3343 }
3344 std::deque<SUnit *> NewOrderI;
3345 for (SUnit *SU : Instrs) {
3346 if (!SU->getInstr()->isPHI())
3347 orderDependence(SSD, SU, NewOrderI);
3348 }
3349 llvm::append_range(NewOrderPhi, NewOrderI);
3350 return NewOrderPhi;
3351}
3352
3353/// After the schedule has been formed, call this function to combine
3354/// the instructions from the different stages/cycles. That is, this
3355/// function creates a schedule that represents a single iteration.
3357 // Move all instructions to the first stage from later stages.
3358 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3359 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3360 ++stage) {
3361 std::deque<SUnit *> &cycleInstrs =
3362 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3363 for (SUnit *SU : llvm::reverse(cycleInstrs))
3364 ScheduledInstrs[cycle].push_front(SU);
3365 }
3366 }
3367
3368 // Erase all the elements in the later stages. Only one iteration should
3369 // remain in the scheduled list, and it contains all the instructions.
3370 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3371 ScheduledInstrs.erase(cycle);
3372
3373 // Change the registers in instruction as specified in the InstrChanges
3374 // map. We need to use the new registers to create the correct order.
3375 for (const SUnit &SU : SSD->SUnits)
3376 SSD->applyInstrChange(SU.getInstr(), *this);
3377
3378 // Reorder the instructions in each cycle to fix and improve the
3379 // generated code.
3380 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3381 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3382 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3383 SSD->fixupRegisterOverlaps(cycleInstrs);
3384 }
3385
3386 LLVM_DEBUG(dump(););
3387}
3388
3390 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3391 << " depth " << MaxDepth << " col " << Colocate << "\n";
3392 for (const auto &I : Nodes)
3393 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3394 os << "\n";
3395}
3396
3397#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3398/// Print the schedule information to the given output.
3400 // Iterate over each cycle.
3401 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3402 // Iterate over each instruction in the cycle.
3403 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3404 for (SUnit *CI : cycleInstrs->second) {
3405 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3406 os << "(" << CI->NodeNum << ") ";
3407 CI->getInstr()->print(os);
3408 os << "\n";
3409 }
3410 }
3411}
3412
3413/// Utility function used for debugging to print the schedule.
3416
3417void ResourceManager::dumpMRT() const {
3418 LLVM_DEBUG({
3419 if (UseDFA)
3420 return;
3421 std::stringstream SS;
3422 SS << "MRT:\n";
3423 SS << std::setw(4) << "Slot";
3424 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3425 SS << std::setw(3) << I;
3426 SS << std::setw(7) << "#Mops"
3427 << "\n";
3428 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3429 SS << std::setw(4) << Slot;
3430 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3431 SS << std::setw(3) << MRT[Slot][I];
3432 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3433 }
3434 dbgs() << SS.str();
3435 });
3436}
3437#endif
3438
3440 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3441 unsigned ProcResourceID = 0;
3442
3443 // We currently limit the resource kinds to 64 and below so that we can use
3444 // uint64_t for Masks
3445 assert(SM.getNumProcResourceKinds() < 64 &&
3446 "Too many kinds of resources, unsupported");
3447 // Create a unique bitmask for every processor resource unit.
3448 // Skip resource at index 0, since it always references 'InvalidUnit'.
3449 Masks.resize(SM.getNumProcResourceKinds());
3450 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3452 if (Desc.SubUnitsIdxBegin)
3453 continue;
3454 Masks[I] = 1ULL << ProcResourceID;
3455 ProcResourceID++;
3456 }
3457 // Create a unique bitmask for every processor resource group.
3458 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3460 if (!Desc.SubUnitsIdxBegin)
3461 continue;
3462 Masks[I] = 1ULL << ProcResourceID;
3463 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3464 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3465 ProcResourceID++;
3466 }
3467 LLVM_DEBUG({
3468 if (SwpShowResMask) {
3469 dbgs() << "ProcResourceDesc:\n";
3470 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3471 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3472 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3473 ProcResource->Name, I, Masks[I],
3474 ProcResource->NumUnits);
3475 }
3476 dbgs() << " -----------------\n";
3477 }
3478 });
3479}
3480
3482 LLVM_DEBUG({
3483 if (SwpDebugResource)
3484 dbgs() << "canReserveResources:\n";
3485 });
3486 if (UseDFA)
3487 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3488 ->canReserveResources(&SU.getInstr()->getDesc());
3489
3490 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3491 if (!SCDesc->isValid()) {
3492 LLVM_DEBUG({
3493 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3494 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3495 });
3496 return true;
3497 }
3498
3499 reserveResources(SCDesc, Cycle);
3500 bool Result = !isOverbooked();
3501 unreserveResources(SCDesc, Cycle);
3502
3503 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3504 return Result;
3505}
3506
3507void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3508 LLVM_DEBUG({
3509 if (SwpDebugResource)
3510 dbgs() << "reserveResources:\n";
3511 });
3512 if (UseDFA)
3513 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3514 ->reserveResources(&SU.getInstr()->getDesc());
3515
3516 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3517 if (!SCDesc->isValid()) {
3518 LLVM_DEBUG({
3519 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3520 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3521 });
3522 return;
3523 }
3524
3525 reserveResources(SCDesc, Cycle);
3526
3527 LLVM_DEBUG({
3528 if (SwpDebugResource) {
3529 dumpMRT();
3530 dbgs() << "reserveResources: done!\n\n";
3531 }
3532 });
3533}
3534
3535void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3536 int Cycle) {
3537 assert(!UseDFA);
3538 for (const MCWriteProcResEntry &PRE : make_range(
3539 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3540 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3541 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3542
3543 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3544 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3545}
3546
3547void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3548 int Cycle) {
3549 assert(!UseDFA);
3550 for (const MCWriteProcResEntry &PRE : make_range(
3551 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3552 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3553 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3554
3555 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3556 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3557}
3558
3559bool ResourceManager::isOverbooked() const {
3560 assert(!UseDFA);
3561 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3562 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3564 if (MRT[Slot][I] > Desc->NumUnits)
3565 return true;
3566 }
3567 if (NumScheduledMops[Slot] > IssueWidth)
3568 return true;
3569 }
3570 return false;
3571}
3572
3573int ResourceManager::calculateResMIIDFA() const {
3574 assert(UseDFA);
3575
3576 // Sort the instructions by the number of available choices for scheduling,
3577 // least to most. Use the number of critical resources as the tie breaker.
3578 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3579 for (SUnit &SU : DAG->SUnits)
3580 FUS.calcCriticalResources(*SU.getInstr());
3582 FuncUnitOrder(FUS);
3583
3584 for (SUnit &SU : DAG->SUnits)
3585 FuncUnitOrder.push(SU.getInstr());
3586
3588 Resources.push_back(
3589 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3590
3591 while (!FuncUnitOrder.empty()) {
3592 MachineInstr *MI = FuncUnitOrder.top();
3593 FuncUnitOrder.pop();
3594 if (TII->isZeroCost(MI->getOpcode()))
3595 continue;
3596
3597 // Attempt to reserve the instruction in an existing DFA. At least one
3598 // DFA is needed for each cycle.
3599 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3600 unsigned ReservedCycles = 0;
3601 auto *RI = Resources.begin();
3602 auto *RE = Resources.end();
3603 LLVM_DEBUG({
3604 dbgs() << "Trying to reserve resource for " << NumCycles
3605 << " cycles for \n";
3606 MI->dump();
3607 });
3608 for (unsigned C = 0; C < NumCycles; ++C)
3609 while (RI != RE) {
3610 if ((*RI)->canReserveResources(*MI)) {
3611 (*RI)->reserveResources(*MI);
3612 ++ReservedCycles;
3613 break;
3614 }
3615 RI++;
3616 }
3617 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3618 << ", NumCycles:" << NumCycles << "\n");
3619 // Add new DFAs, if needed, to reserve resources.
3620 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3622 << "NewResource created to reserve resources"
3623 << "\n");
3624 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3625 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3626 NewResource->reserveResources(*MI);
3627 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3628 }
3629 }
3630
3631 int Resmii = Resources.size();
3632 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3633 return Resmii;
3634}
3635
3637 if (UseDFA)
3638 return calculateResMIIDFA();
3639
3640 // Count each resource consumption and divide it by the number of units.
3641 // ResMII is the max value among them.
3642
3643 int NumMops = 0;
3645 for (SUnit &SU : DAG->SUnits) {
3646 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3647 continue;
3648
3649 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3650 if (!SCDesc->isValid())
3651 continue;
3652
3653 LLVM_DEBUG({
3654 if (SwpDebugResource) {
3655 DAG->dumpNode(SU);
3656 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3657 << " WriteProcRes: ";
3658 }
3659 });
3660 NumMops += SCDesc->NumMicroOps;
3661 for (const MCWriteProcResEntry &PRE :
3662 make_range(STI->getWriteProcResBegin(SCDesc),
3663 STI->getWriteProcResEnd(SCDesc))) {
3664 LLVM_DEBUG({
3665 if (SwpDebugResource) {
3666 const MCProcResourceDesc *Desc =
3667 SM.getProcResource(PRE.ProcResourceIdx);
3668 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3669 }
3670 });
3671 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3672 }
3673 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3674 }
3675
3676 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3677 LLVM_DEBUG({
3678 if (SwpDebugResource)
3679 dbgs() << "#Mops: " << NumMops << ", "
3680 << "IssueWidth: " << IssueWidth << ", "
3681 << "Cycles: " << Result << "\n";
3682 });
3683
3684 LLVM_DEBUG({
3685 if (SwpDebugResource) {
3686 std::stringstream SS;
3687 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3688 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3689 << "\n";
3690 dbgs() << SS.str();
3691 }
3692 });
3693 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3695 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3696 LLVM_DEBUG({
3697 if (SwpDebugResource) {
3698 std::stringstream SS;
3699 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3700 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3701 << std::setw(10) << Cycles << "\n";
3702 dbgs() << SS.str();
3703 }
3704 });
3705 if (Cycles > Result)
3706 Result = Cycles;
3707 }
3708 return Result;
3709}
3710
3712 InitiationInterval = II;
3713 DFAResources.clear();
3714 DFAResources.resize(II);
3715 for (auto &I : DFAResources)
3716 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3717 MRT.clear();
3719 NumScheduledMops.clear();
3720 NumScheduledMops.resize(II);
3721}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const LLT S1
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:359
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:610
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:543
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:777
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:898
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:733
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:266
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:366
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:431
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:273
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:344
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:280
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:560
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which is the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction may be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should be used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
bool isValid() const
Definition: MCSchedule.h:136
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:360
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:349
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:334
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:353
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.