LLVM 17.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
69#include "llvm/Config/llvm-config.h"
70#include "llvm/IR/Attributes.h"
71#include "llvm/IR/Function.h"
72#include "llvm/MC/LaneBitmask.h"
73#include "llvm/MC/MCInstrDesc.h"
76#include "llvm/Pass.h"
79#include "llvm/Support/Debug.h"
82#include <algorithm>
83#include <cassert>
84#include <climits>
85#include <cstdint>
86#include <deque>
87#include <functional>
88#include <iomanip>
89#include <iterator>
90#include <map>
91#include <memory>
92#include <sstream>
93#include <tuple>
94#include <utility>
95#include <vector>
96
97using namespace llvm;
98
99#define DEBUG_TYPE "pipeliner"
100
101STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
102STATISTIC(NumPipelined, "Number of loops software pipelined");
103STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
104STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
105STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
106STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
107STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
108STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
109STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
110STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
111STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
112
113/// A command line option to turn software pipelining on or off.
114static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
115 cl::desc("Enable Software Pipelining"));
116
117/// A command line option to enable SWP at -Os.
118static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
119 cl::desc("Enable SWP at Os."), cl::Hidden,
120 cl::init(false));
121
122/// A command line argument to limit minimum initial interval for pipelining.
123static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
124 cl::desc("Size limit for the MII."),
125 cl::Hidden, cl::init(27));
126
127/// A command line argument to force pipeliner to use specified initial
128/// interval.
129static cl::opt<int> SwpForceII("pipeliner-force-ii",
130 cl::desc("Force pipeliner to use specified II."),
131 cl::Hidden, cl::init(-1));
132
133/// A command line argument to limit the number of stages in the pipeline.
134static cl::opt<int>
135 SwpMaxStages("pipeliner-max-stages",
136 cl::desc("Maximum stages allowed in the generated scheduled."),
137 cl::Hidden, cl::init(3));
138
139/// A command line option to disable the pruning of chain dependences due to
140/// an unrelated Phi.
141static cl::opt<bool>
142 SwpPruneDeps("pipeliner-prune-deps",
143 cl::desc("Prune dependences between unrelated Phi nodes."),
144 cl::Hidden, cl::init(true));
145
146/// A command line option to disable the pruning of loop carried order
147/// dependences.
148static cl::opt<bool>
149 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
150 cl::desc("Prune loop carried order dependences."),
151 cl::Hidden, cl::init(true));
152
153#ifndef NDEBUG
154static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
155#endif
156
157static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
159 cl::desc("Ignore RecMII"));
160
161static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
162 cl::init(false));
163static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
164 cl::init(false));
165
167 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
168 cl::desc("Instead of emitting the pipelined code, annotate instructions "
169 "with the generated schedule for feeding into the "
170 "-modulo-schedule-test pass"));
171
173 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
174 cl::desc(
175 "Use the experimental peeling code generator for software pipelining"));
176
177namespace llvm {
178
179// A command line option to enable the CopyToPhi DAG mutation.
181 cl::init(true),
182 cl::desc("Enable CopyToPhi DAG Mutation"));
183
184/// A command line argument to force pipeliner to use specified issue
185/// width.
187 "pipeliner-force-issue-width",
188 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
189 cl::init(-1));
190
191} // end namespace llvm
192
193unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
194char MachinePipeliner::ID = 0;
195#ifndef NDEBUG
197#endif
199
201 "Modulo Software Pipelining", false, false)
207 "Modulo Software Pipelining", false, false)
208
/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
  // Respect -opt-bisect-limit / "optnone" style skipping.
  if (skipFunction(mf.getFunction()))
    return false;

  // Pipelining is globally gated by -enable-pipeliner.
  if (!EnableSWP)
    return false;

  // NOTE(review): the source numbering jumps 217 -> 219 here, so the second
  // half of this condition (presumably the EnableSWPOptSize check that allows
  // pipelining at -Os) is missing from this listing -- confirm upstream.
  if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
    return false;

  // The target must explicitly opt in to machine pipelining.
  if (!mf.getSubtarget().enableMachinePipeliner())
    return false;

  // Cannot pipeline loops without instruction itineraries if we are using
  // DFA for the pipeliner.
  if (mf.getSubtarget().useDFAforSMS() &&
      (!mf.getSubtarget().getInstrItineraryData() ||
       mf.getSubtarget().getInstrItineraryData()->isEmpty()))
    return false;

  // Cache the analyses and target hooks used throughout the pass.
  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfo>();
  MDT = &getAnalysis<MachineDominatorTree>();
  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
  TII = MF->getSubtarget().getInstrInfo();
  RegClassInfo.runOnMachineFunction(*MF);

  // Visit each top-level loop; scheduleLoop recurses into nested loops.
  for (const auto &L : *MLI)
    scheduleLoop(*L);

  // Always reports "no change" to the pass manager here, even though
  // scheduleLoop may have rewritten loops via the expander.
  return false;
}
243
244/// Attempt to perform the SMS algorithm on the specified loop. This function is
245/// the main entry point for the algorithm. The function identifies candidate
246/// loops, calculates the minimum initiation interval, and attempts to schedule
247/// the loop.
248bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
249 bool Changed = false;
250 for (const auto &InnerLoop : L)
251 Changed |= scheduleLoop(*InnerLoop);
252
253#ifndef NDEBUG
254 // Stop trying after reaching the limit (if any).
255 int Limit = SwpLoopLimit;
256 if (Limit >= 0) {
257 if (NumTries >= SwpLoopLimit)
258 return Changed;
259 NumTries++;
260 }
261#endif
262
263 setPragmaPipelineOptions(L);
264 if (!canPipelineLoop(L)) {
265 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
266 ORE->emit([&]() {
267 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
268 L.getStartLoc(), L.getHeader())
269 << "Failed to pipeline loop";
270 });
271
272 LI.LoopPipelinerInfo.reset();
273 return Changed;
274 }
275
276 ++NumTrytoPipeline;
277
278 Changed = swingModuloScheduler(L);
279
280 LI.LoopPipelinerInfo.reset();
281 return Changed;
282}
283
// Read the "#pragma clang loop pipeline..." hints off the loop metadata and
// cache them in the pass state for the current loop.
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  // Walk from the machine block back to the IR block to reach the loop
  // metadata attached to the terminator.
  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  // A loop-id node's first operand is a self reference.
  assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  // Scan the remaining operands for the pipeline pragmas.
  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));

    if (MD == nullptr)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
        "Pipeline initiation interval hint metadata should have two operands.");
      // NOTE(review): the source numbering jumps 322 -> 324; the assignment
      // target of this expression (presumably "II_setByPragma =") is missing
      // from this listing -- confirm against upstream.
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}
331
/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  // Only single-block loops are supported.
  if (L.getNumBlocks() != 1) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Not a single basic block: "
             << ore::NV("NumBlocks", L.getNumBlocks());
    });
    return false;
  }

  // Honor "#pragma clang loop pipeline(disable)" recorded by
  // setPragmaPipelineOptions.
  if (disabledByPragma) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Disabled by Pragma.";
    });
    return false;
  }

  // Check if the branch can't be understood because we can't do pipelining
  // if that's the case.
  LI.TBB = nullptr;
  LI.FBB = nullptr;
  LI.BrCond.clear();
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
    NumFailBranch++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The branch can't be understood";
    });
    return false;
  }

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  // NOTE(review): the source numbering jumps 372 -> 374; the statement that
  // populates LI.LoopPipelinerInfo (presumably the target analyzeLoop hook)
  // is missing from this listing -- confirm against upstream.
  if (!LI.LoopPipelinerInfo) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
    NumFailLoop++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The loop structure is not supported";
    });
    return false;
  }

  // A preheader is required to host the prolog blocks.
  if (!L.getLoopPreheader()) {
    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
    NumFailPreheader++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "No loop preheader found";
    });
    return false;
  }

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
  return true;
}
400
// Rewrite Phi inputs that carry a subregister index: each such input is
// replaced by a fresh full register defined by a COPY in the predecessor.
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  // NOTE(review): the source numbering jumps 401 -> 403; the declaration of
  // MRI (used below) is missing from this listing -- confirm upstream.
  SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    // Operand 0 is the Phi def; it must not carry a subregister index.
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    // Phi uses come in (register, predecessor-block) pairs starting at 1.
    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
      // NOTE(review): source line 419 (the declaration of the insertion
      // point "At" in PredB) is missing from this listing.
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                    .addReg(RegOp.getReg(), getRegState(RegOp),
                            RegOp.getSubReg());
      // Keep the slot indexes consistent with the newly inserted copy.
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}
430
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to Schedule the loop.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  // NOTE(review): source lines 439 and 441 are missing from this listing
  // (the trailing constructor argument(s) and, presumably, the declaration
  // of MBB used below) -- confirm against upstream.
  SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,

  // The kernel should not include any terminator instructions. These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  // NOTE(review): source line 449 (the iterator initialization of this
  // counting loop, presumably starting at the first terminator) is missing
  // from this listing.
       E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  // Schedule only the region up to the first terminator.
  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}
461
470}
471
472void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
473 if (SwpForceII > 0)
474 MII = SwpForceII;
475 else if (II_setByPragma > 0)
476 MII = II_setByPragma;
477 else
478 MII = std::max(ResMII, RecMII);
479}
480
481void SwingSchedulerDAG::setMAX_II() {
482 if (SwpForceII > 0)
483 MAX_II = SwpForceII;
484 else if (II_setByPragma > 0)
485 MAX_II = II_setByPragma;
486 else
487 MAX_II = MII + 10;
488}
489
/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
// NOTE(review): the source numbering jumps 491 -> 493, so the function
// signature (presumably "void SwingSchedulerDAG::schedule() {") is missing
// from this listing; several single lines are missing below as well
// (497, 527, 540, 590, 603, 615, 626, 642, 651, 654) -- confirm upstream.
  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
  // Step 1: build the dependence graph, then add the dependences that
  // ScheduleDAGInstrs does not model (loop-carried memory and Phi edges).
  buildSchedGraph(AA);
  addLoopCarriedDependences(AA);
  updatePhiDependences();
  changeDependences();
  postprocessDAG();
  LLVM_DEBUG(dump());

  // Step 2: find the recurrences (circuits); they bound RecMII. Keep a copy
  // for the node-order sanity check at the end.
  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;

  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  fuseRecs(NodeSets);

  // This flag is used for testing and can cause correctness problems.
  if (SwpIgnoreRecMII)
    RecMII = 0;

  setMII(ResMII, RecMII);
  setMAX_II();

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.
  if (MII == 0) {
    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
    NumFailZeroMII++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Invalid Minimal Initiation Interval: 0";
    });
    return;
  }

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                      << ", we don't pipeline large loops\n");
    NumFailLargeMaxMII++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Minimal Initiation Interval too large: "
             << ore::NV("MII", (int)MII) << " > "
             << ore::NV("SwpMaxMii", SwpMaxMii) << "."
             << "Refer to -pipeliner-max-mii.";
    });
    return;
  }

  // Step 3: compute the per-node scheduling functions (ASAP/ALAP etc.) and
  // refine the node sets before ordering.
  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  Rec NodeSet ";
      I.dump();
    }
  });

  // Process the node sets in decreasing priority order.
  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << "  NodeSet ";
      I.dump();
    }
  });

  // Step 4: compute the final node order and attempt the schedule.
  computeNodeOrder(NodeSets);

  // check for node order issues
  checkValidNodeOrder(Circuits);

  SMSchedule Schedule(Pass.MF, this);
  Scheduled = schedulePipeline(Schedule);

  if (!Scheduled){
    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
    NumFailNoSchedule++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Unable to find schedule";
    });
    return;
  }

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate pipeline if there are no overlapped iterations.
  if (numStages == 0) {
    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
    NumFailZeroStage++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "No need to pipeline - no overlapped iterations in schedule.";
    });
    return;
  }
  // Check that the maximum stage count is less than user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                      << " : too many stages, abort\n");
    NumFailLargeMaxStage++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Too many stages in schedule: "
             << ore::NV("numStages", (int)numStages) << " > "
             << ore::NV("SwpMaxStages", SwpMaxStages)
             << ". Refer to -pipeliner-max-stages.";
    });
    return;
  }

  Pass.ORE->emit([&]() {
                                     Loop.getHeader())
           << "Pipelined succesfully!";
  });

  // Generate the schedule as a ModuloSchedule.
  DenseMap<MachineInstr *, int> Cycles, Stages;
  std::vector<MachineInstr *> OrderedInsts;
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
      OrderedInsts.push_back(SU->getInstr());
      Cycles[SU->getInstr()] = Cycle;
      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
    }
  }
  // Map the base/offset rewrites recorded for scratch instructions back to
  // the original instructions. NOTE(review): line 642 (the declaration of
  // NewInstrChanges) is missing from this listing.
  for (auto &KV : NewMIs) {
    Cycles[KV.first] = Cycles[KV.second];
    Stages[KV.first] = Stages[KV.second];
    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
  }

  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                    std::move(Stages));
    assert(NewInstrChanges.empty() &&
           "Cannot serialize a schedule with InstrChanges!");
    MSTI.annotate();
    return;
  }
  // The experimental code generator can't work if there are InstChanges.
  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
    MSE.expand();
  } else {
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
    MSE.expand();
    MSE.cleanup();
  }
  ++NumPipelined;
}
669
/// Clean up after the software pipeliner runs.
// NOTE(review): the source numbering jumps 670 -> 672, so the function
// signature (finishBlock) is missing from this listing, as is line 677
// (the superclass call referenced by the comment below) -- confirm upstream.
  // Delete the scratch instructions created while probing for a schedule.
  for (auto &KV : NewMIs)
    MF.deleteMachineInstr(KV.second);
  NewMIs.clear();

  // Call the superclass.
}
679
/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
// NOTE(review): the source numbering jumps 681 -> 683, so the first line of
// the signature (presumably "static void getPhiRegs(MachineInstr &Phi,
// MachineBasicBlock *Loop,") is missing from this listing.
                       unsigned &InitVal, unsigned &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = 0;
  LoopVal = 0;
  // Phi operands come in (value, predecessor-block) pairs starting at 1:
  // the value arriving from outside the loop block is the initial value,
  // the one arriving from the loop block is the loop-carried value.
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
}
696
697/// Return the Phi register value that comes the loop block.
698static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
699 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
700 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
701 return Phi.getOperand(i).getReg();
702 return 0;
703}
704
/// Return true if SUb can be reached from SUa following the chain edges.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  // NOTE(review): source lines 707-708 (the declarations of the Worklist
  // and Visited containers used below) are missing from this listing --
  // confirm against upstream.
  Worklist.push_back(SUa);
  // Depth-first search restricted to order (chain) successor edges.
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}
726
/// Return true if the instruction causes a chain between memory
/// references before and after it.
// NOTE(review): the source numbering jumps 728 -> 730, so the function
// signature (presumably "static bool isDependenceBarrier(MachineInstr &MI)")
// is missing from this listing -- confirm against upstream.
  // Calls, FP-exception-raising instructions, unmodeled side effects, and
  // ordered memory references (other than invariant loads) all act as
  // barriers for memory reordering.
  return MI.isCall() || MI.mayRaiseFPException() ||
         MI.hasUnmodeledSideEffects() ||
         (MI.hasOrderedMemoryRef() &&
          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
}
735
/// Return the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
// NOTE(review): the source numbering jumps 738 -> 741, so the function
// signature (taking the MachineInstr and the output Objs vector) is missing
// from this listing -- confirm against upstream.
  if (!MI->hasOneMemOperand())
    return;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return;
  getUnderlyingObjects(MM->getValue(), Objs);
  // Only identified objects are usable; otherwise report no objects at all.
  for (const Value *V : Objs) {
    if (!isIdentifiedObject(V)) {
      Objs.clear();
      return;
    }
    // NOTE(review): appending to Objs while range-iterating over it both
    // duplicates entries and risks iterator invalidation if the vector
    // reallocates -- confirm against upstream whether this push_back
    // belongs here.
    Objs.push_back(V);
  }
}
755
/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
// NOTE(review): several source lines are missing from this listing
// (761-763, 766, 769-770, 772, 778-779, 781, 783, 804, 837-838) -- among
// them the PendingLoads map declaration, the dependence-barrier check, the
// getUnderlyingObjects calls, and the MemoryLocation arguments of the final
// alias query. Confirm against upstream before editing.
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
  for (auto &SU : SUnits) {
    MachineInstr &MI = *SU.getInstr();
      PendingLoads.clear();
    else if (MI.mayLoad()) {
      // Record this load under each of its underlying objects so later
      // stores can look it up.
      if (Objs.empty())
      for (const auto *V : Objs) {
        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
        SUs.push_back(&SU);
      }
    } else if (MI.mayStore()) {
      // For a store, check every earlier load on the same underlying object.
      if (Objs.empty())
      for (const auto *V : Objs) {
          PendingLoads.find(V);
        if (I == PendingLoads.end())
          continue;
        for (auto *Load : I->second) {
          // An intra-iteration chain edge already orders the pair.
          if (isSuccOrder(Load, &SU))
            continue;
          MachineInstr &LdMI = *Load->getInstr();
          // First, perform the cheaper check that compares the base register.
          // If they are the same and the load offset is less than the store
          // offset, then mark the dependence as loop carried potentially.
          const MachineOperand *BaseOp1, *BaseOp2;
          int64_t Offset1, Offset2;
          bool Offset1IsScalable, Offset2IsScalable;
          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
                                           Offset1IsScalable, TRI) &&
              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
                                           Offset2IsScalable, TRI)) {
            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
                Offset1IsScalable == Offset2IsScalable &&
                (int)Offset1 < (int)Offset2) {
                     "What happened to the chain edge?");
              SDep Dep(Load, SDep::Barrier);
              Dep.setLatency(1);
              SU.addPred(Dep);
              continue;
            }
          }
          // Second, the more expensive check that uses alias analysis on the
          // base registers. If they alias, and the load offset is less than
          // the store offset, then mark the dependence as loop carried.
          if (!AA) {
            // No alias analysis available: conservatively assume a
            // loop-carried dependence.
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
            continue;
          }
          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
          MachineMemOperand *MMO2 = *MI.memoperands_begin();
          if (!MMO1->getValue() || !MMO2->getValue()) {
            // Without IR values we cannot query AA; be conservative.
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
            continue;
          }
          if (MMO1->getValue() == MMO2->getValue() &&
              MMO1->getOffset() <= MMO2->getOffset()) {
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
            continue;
          }
          if (!AA->isNoAlias(
                  MMO2->getAAInfo()))) {
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
          }
        }
      }
    }
  }
}
849
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
// NOTE(review): source lines 858, 876-877 and 898 are missing from this
// listing (the subtarget reference ST, the use-iterator loop header, and the
// DefMI lookup, respectively) -- confirm against upstream before editing.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Set to the relevant register if the instruction uses/defines a value
    // that also flows through a Phi (0 means "none seen yet").
    unsigned HasPhiUse = 0;
    unsigned HasPhiDef = 0;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
                                    MOE = MI->operands_end();
         MOI != MOE; ++MOI) {
      if (!MOI->isReg())
        continue;
      Register Reg = MOI->getReg();
      if (MOI->isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
             UI = MRI.use_instr_begin(Reg),
             UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              // Loop-carried edge: the Phi reads this def on the next
              // iteration, modeled as an anti dependence with latency 1.
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MOI->isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            // Zero-latency data edge from the Phi to its use; let the
            // target adjust the latency if needed.
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            ST.adjustSchedDependency(SU, 0, &I, MI->getOperandNo(MOI), Dep);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          // Keep the edge when the Phis are actually related through the
          // registers recorded above.
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
      I.removePred(RemoveDeps[i]);
  }
}
937
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
// NOTE(review): source lines 953 and 971 are missing from this listing (the
// DefMI lookup for OrigBase and the declaration of the Deps vector) --
// confirm against upstream before editing.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    // Rewriting would create a cycle if the new base's def is already
    // reachable from this node.
    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (int i = 0, e = Deps.size(); i != e; i++) {
      Topo.RemovePred(&I, Deps[i].getSUnit());
      I.removePred(Deps[i]);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (int i = 0, e = Deps.size(); i != e; i++) {
      Topo.RemovePred(LastSU, Deps[i].getSUnit());
      LastSU->removePred(Deps[i]);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
1000
namespace {

// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // NOTE(review): source line 1008 is missing from this listing (presumably
  // the declaration of the Resources map used by calcCriticalResources and
  // operator()) -- confirm against upstream.

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Prefer the itinerary-based model when present.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // Otherwise fall back to the per-operand machine model.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.Cycles)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only count stages with exactly one functional unit choice.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.Cycles)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    // Ties on unit count are broken by how contended the unit is.
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};

} // end anonymous namespace
1108
1109/// Calculate the resource constrained minimum initiation interval for the
1110/// specified loop. We use the DFA to model the resources needed for
1111/// each instruction, and we ignore dependences. A different DFA is created
1112/// for each cycle that is required. When adding a new instruction, we attempt
1113/// to add it to each existing DFA, until a legal space is found. If the
1114/// instruction cannot be reserved in an existing DFA, we create a new one.
1115unsigned SwingSchedulerDAG::calculateResMII() {
1116 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1118 return RM.calculateResMII();
1119}
1120
/// Calculate the recurrence-constrained minimum initiation interval.
1122/// Iterate over each circuit. Compute the delay(c) and distance(c)
1123/// for each circuit. The II needs to satisfy the inequality
1124/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1125/// II that satisfies the inequality, and the RecMII is the maximum
1126/// of those values.
1127unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1128 unsigned RecMII = 0;
1129
1130 for (NodeSet &Nodes : NodeSets) {
1131 if (Nodes.empty())
1132 continue;
1133
1134 unsigned Delay = Nodes.getLatency();
1135 unsigned Distance = 1;
1136
1137 // ii = ceil(delay / distance)
1138 unsigned CurMII = (Delay + Distance - 1) / Distance;
1139 Nodes.setRecMII(CurMII);
1140 if (CurMII > RecMII)
1141 RecMII = CurMII;
1142 }
1143
1144 return RecMII;
1145}
1146
1147/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1148/// but we do this to find the circuits, and then change them back.
1149static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1151 for (SUnit &SU : SUnits) {
1152 for (SDep &Pred : SU.Preds)
1153 if (Pred.getKind() == SDep::Anti)
1154 DepsAdded.push_back(std::make_pair(&SU, Pred));
1155 }
1156 for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1157 // Remove this anti dependency and add one in the reverse direction.
1158 SUnit *SU = P.first;
1159 SDep &D = P.second;
1160 SUnit *TargetSU = D.getSUnit();
1161 unsigned Reg = D.getReg();
1162 unsigned Lat = D.getLatency();
1163 SU->removePred(D);
1164 SDep Dep(SU, SDep::Anti, Reg);
1165 Dep.setLatency(Lat);
1166 TargetSU->addPred(Dep);
1167 }
1168}
1169
1170/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  // Maps the last node seen so far in an output-dependence chain to the
  // first node of that chain, so only one back-edge is added per chain.
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &SI : SUnits[i].Succs) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (SI.getKind() == SDep::Output) {
        int N = SI.getSUnit()->NodeNum;
        int BackEdge = i;
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          // Extending an existing chain: carry its head forward and drop
          // the stale tail entry.
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      // A back-edge is processed only if it goes to a Phi.
      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
        continue;
      int N = SI.getSUnit()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &PI : SUnits[i].Preds) {
      if (!SUnits[i].getInstr()->mayStore() ||
          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
        continue;
      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
        int N = PI.getSUnit()->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1224
1225/// Identify an elementary circuit in the dependence graph starting at the
1226/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Bound the search to keep compile time under control.
    if (NumPaths > MaxPaths)
      break;
    // Only explore nodes at or after the start node S.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node: the nodes currently on the
      // stack form a recurrence (recorded only when no back-edge was taken).
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
      F = true;
      ++NumPaths;
      break;
    } else if (!Blocked.test(W)) {
      if (circuit(W, S, NodeSets,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V was found: remember V in B[W] so that V becomes
    // unblocked again whenever W does.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1264
1265/// Unblock a node in the circuit finding algorithm.
1266void SwingSchedulerDAG::Circuits::unblock(int U) {
1267 Blocked.reset(U);
1268 SmallPtrSet<SUnit *, 4> &BU = B[U];
1269 while (!BU.empty()) {
1271 assert(SI != BU.end() && "Invalid B set.");
1272 SUnit *W = *SI;
1273 BU.erase(W);
1274 if (Blocked.test(W->NodeNum))
1275 unblock(W->NodeNum);
1276 }
1277}
1278
1279/// Identify all the elementary circuits in the dependence graph using
1280/// Johnson's circuit algorithm.
1281void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1282 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1283 // but we do this to find the circuits, and then change them back.
1285
1286 Circuits Cir(SUnits, Topo);
1287 // Create the adjacency structure.
1288 Cir.createAdjacencyStructure(this);
1289 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1290 Cir.reset();
1291 Cir.circuit(i, i, NodeSets);
1292 }
1293
1294 // Change the dependences back so that we've created a DAG again.
1296}
1297
1298// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1299// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1300// additional copies that are needed across iterations. An artificial dependence
1301// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1302
1303// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1304// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1305// PHI-------True-Dep------> USEOfPhi
1306
1307// The mutation creates
1308// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1309
1310// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1311// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1312// late to avoid additional copies across iterations. The possible scheduling
1313// order would be
1314// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1315
1316void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1317 for (SUnit &SU : DAG->SUnits) {
1318 // Find the COPY/REG_SEQUENCE instruction.
1319 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1320 continue;
1321
1322 // Record the loop carried PHIs.
1324 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1326
1327 for (auto &Dep : SU.Preds) {
1328 SUnit *TmpSU = Dep.getSUnit();
1329 MachineInstr *TmpMI = TmpSU->getInstr();
1330 SDep::Kind DepKind = Dep.getKind();
1331 // Save the loop carried PHI.
1332 if (DepKind == SDep::Anti && TmpMI->isPHI())
1333 PHISUs.push_back(TmpSU);
1334 // Save the source of COPY/REG_SEQUENCE.
1335 // If the source has no pre-decessors, we will end up creating cycles.
1336 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1337 SrcSUs.push_back(TmpSU);
1338 }
1339
1340 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1341 continue;
1342
1343 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1344 // SUnit to the container.
1346 // Do not use iterator based loop here as we are updating the container.
1347 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1348 for (auto &Dep : PHISUs[Index]->Succs) {
1349 if (Dep.getKind() != SDep::Data)
1350 continue;
1351
1352 SUnit *TmpSU = Dep.getSUnit();
1353 MachineInstr *TmpMI = TmpSU->getInstr();
1354 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1355 PHISUs.push_back(TmpSU);
1356 continue;
1357 }
1358 UseSUs.push_back(TmpSU);
1359 }
1360 }
1361
1362 if (UseSUs.size() == 0)
1363 continue;
1364
1365 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1366 // Add the artificial dependencies if it does not form a cycle.
1367 for (auto *I : UseSUs) {
1368 for (auto *Src : SrcSUs) {
1369 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1370 Src->addPred(SDep(I, SDep::Artificial));
1371 SDAG->Topo.AddPred(Src, I);
1372 }
1373 }
1374 }
1375 }
1376}
1377
1378/// Return true for DAG nodes that we ignore when computing the cost functions.
1379/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1380/// in the calculation of the ASAP, ALAP, etc functions.
1381static bool ignoreDependence(const SDep &D, bool isPred) {
1382 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1383 return true;
1384 return D.getKind() == SDep::Anti && isPred;
1385}
1386
1387/// Compute several functions need to order the nodes for scheduling.
1388/// ASAP - Earliest time to schedule a node.
1389/// ALAP - Latest time to schedule a node.
1390/// MOV - Mobility function, difference between ALAP and ASAP.
1391/// D - Depth of each node.
1392/// H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth.
  // Walk in topological order so each predecessor is processed first.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &P : SU->Preds) {
      SUnit *pred = P.getSUnit();
      if (P.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
      if (ignoreDependence(P, true))
        continue;
      // A loop-carried edge (distance > 0) contributes distance * MII fewer
      // cycles, since the predecessor executes in an earlier iteration.
      asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
                                  getDistance(pred, SU, P) * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV.
  // Walk in reverse topological order so each successor is processed first.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &S : SU->Succs) {
      SUnit *succ = S.getSUnit();
      if (succ->isBoundaryNode())
        continue;
      if (S.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
      if (ignoreDependence(S, true))
        continue;
      alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
                                  getDistance(SU, succ, S) * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
1463
1464/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1465/// as the predecessors of the elements of NodeOrder that are not also in
1466/// NodeOrder.
1469 const NodeSet *S = nullptr) {
1470 Preds.clear();
1471 for (const SUnit *SU : NodeOrder) {
1472 for (const SDep &Pred : SU->Preds) {
1473 if (S && S->count(Pred.getSUnit()) == 0)
1474 continue;
1475 if (ignoreDependence(Pred, true))
1476 continue;
1477 if (NodeOrder.count(Pred.getSUnit()) == 0)
1478 Preds.insert(Pred.getSUnit());
1479 }
1480 // Back-edges are predecessors with an anti-dependence.
1481 for (const SDep &Succ : SU->Succs) {
1482 if (Succ.getKind() != SDep::Anti)
1483 continue;
1484 if (S && S->count(Succ.getSUnit()) == 0)
1485 continue;
1486 if (NodeOrder.count(Succ.getSUnit()) == 0)
1487 Preds.insert(Succ.getSUnit());
1488 }
1489 }
1490 return !Preds.empty();
1491}
1492
1493/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1494/// as the successors of the elements of NodeOrder that are not also in
1495/// NodeOrder.
1498 const NodeSet *S = nullptr) {
1499 Succs.clear();
1500 for (const SUnit *SU : NodeOrder) {
1501 for (const SDep &Succ : SU->Succs) {
1502 if (S && S->count(Succ.getSUnit()) == 0)
1503 continue;
1504 if (ignoreDependence(Succ, false))
1505 continue;
1506 if (NodeOrder.count(Succ.getSUnit()) == 0)
1507 Succs.insert(Succ.getSUnit());
1508 }
1509 for (const SDep &Pred : SU->Preds) {
1510 if (Pred.getKind() != SDep::Anti)
1511 continue;
1512 if (S && S->count(Pred.getSUnit()) == 0)
1513 continue;
1514 if (NodeOrder.count(Pred.getSUnit()) == 0)
1515 Succs.insert(Pred.getSUnit());
1516 }
1517 }
1518 return !Succs.empty();
1519}
1520
1521/// Return true if there is a path from the specified node to any of the nodes
1522/// in DestNodes. Keep track and return the nodes in any path.
static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
                        SetVector<SUnit *> &DestNodes,
                        SetVector<SUnit *> &Exclude,
                        SmallPtrSet<SUnit *, 8> &Visited) {
  if (Cur->isBoundaryNode())
    return false;
  if (Exclude.contains(Cur))
    return false;
  if (DestNodes.contains(Cur))
    return true;
  // Already visited: a path through Cur exists only if an earlier call
  // placed Cur on the path.
  if (!Visited.insert(Cur).second)
    return Path.contains(Cur);
  bool FoundPath = false;
  // Follow forward edges, and anti edges backwards (recurrence back-edges),
  // accumulating every node that can reach DestNodes.
  for (auto &SI : Cur->Succs)
    if (!ignoreDependence(SI, false))
      FoundPath |=
          computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
  for (auto &PI : Cur->Preds)
    if (PI.getKind() == SDep::Anti)
      FoundPath |=
          computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
  if (FoundPath)
    Path.insert(Cur);
  return FoundPath;
}
1548
1549/// Compute the live-out registers for the instructions in a node-set.
1550/// The live-out registers are those that are defined in the node-set,
1551/// but not used. Except for use operands of Phis.
1553 NodeSet &NS) {
1558 for (SUnit *SU : NS) {
1559 const MachineInstr *MI = SU->getInstr();
1560 if (MI->isPHI())
1561 continue;
1562 for (const MachineOperand &MO : MI->operands())
1563 if (MO.isReg() && MO.isUse()) {
1564 Register Reg = MO.getReg();
1565 if (Reg.isVirtual())
1566 Uses.insert(Reg);
1567 else if (MRI.isAllocatable(Reg))
1568 for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid();
1569 ++Units)
1570 Uses.insert(*Units);
1571 }
1572 }
1573 for (SUnit *SU : NS)
1574 for (const MachineOperand &MO : SU->getInstr()->operands())
1575 if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1576 Register Reg = MO.getReg();
1577 if (Reg.isVirtual()) {
1578 if (!Uses.count(Reg))
1579 LiveOutRegs.push_back(RegisterMaskPair(Reg,
1581 } else if (MRI.isAllocatable(Reg)) {
1582 for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid();
1583 ++Units)
1584 if (!Uses.count(*Units))
1585 LiveOutRegs.push_back(RegisterMaskPair(*Units,
1587 }
1588 }
1589 RPTracker.addLiveRegs(LiveOutRegs);
1590}
1591
1592/// A heuristic to filter nodes in recurrent node-sets if the register
1593/// pressure of a set is too high.
1594void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1595 for (auto &NS : NodeSets) {
1596 // Skip small node-sets since they won't cause register pressure problems.
1597 if (NS.size() <= 2)
1598 continue;
1599 IntervalPressure RecRegPressure;
1600 RegPressureTracker RecRPTracker(RecRegPressure);
1601 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1602 computeLiveOuts(MF, RecRPTracker, NS);
1603 RecRPTracker.closeBottom();
1604
1605 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1606 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1607 return A->NodeNum > B->NodeNum;
1608 });
1609
1610 for (auto &SU : SUnits) {
1611 // Since we're computing the register pressure for a subset of the
1612 // instructions in a block, we need to set the tracker for each
1613 // instruction in the node-set. The tracker is set to the instruction
1614 // just after the one we're interested in.
1616 RecRPTracker.setPos(std::next(CurInstI));
1617
1618 RegPressureDelta RPDelta;
1619 ArrayRef<PressureChange> CriticalPSets;
1620 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1621 CriticalPSets,
1622 RecRegPressure.MaxSetPressure);
1623 if (RPDelta.Excess.isValid()) {
1624 LLVM_DEBUG(
1625 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1627 << ":" << RPDelta.Excess.getUnitInc() << "\n");
1628 NS.setExceedPressure(SU);
1629 break;
1630 }
1631 RecRPTracker.recede();
1632 }
1633 }
1634}
1635
1636/// A heuristic to colocate node sets that have the same set of
1637/// successors.
1638void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1639 unsigned Colocate = 0;
1640 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1641 NodeSet &N1 = NodeSets[i];
1643 if (N1.empty() || !succ_L(N1, S1))
1644 continue;
1645 for (int j = i + 1; j < e; ++j) {
1646 NodeSet &N2 = NodeSets[j];
1647 if (N1.compareRecMII(N2) != 0)
1648 continue;
1650 if (N2.empty() || !succ_L(N2, S2))
1651 continue;
1652 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
1653 N1.setColocate(++Colocate);
1654 N2.setColocate(Colocate);
1655 break;
1656 }
1657 }
1658 }
1659}
1660
1661/// Check if the existing node-sets are profitable. If not, then ignore the
1662/// recurrent node-sets, and attempt to schedule all nodes together. This is
1663/// a heuristic. If the MII is large and all the recurrent node-sets are small,
1664/// then it's best to try to schedule all instructions together instead of
1665/// starting with the recurrent node-sets.
1666void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1667 // Look for loops with a large MII.
1668 if (MII < 17)
1669 return;
1670 // Check if the node-set contains only a simple add recurrence.
1671 for (auto &NS : NodeSets) {
1672 if (NS.getRecMII() > 2)
1673 return;
1674 if (NS.getMaxDepth() > MII)
1675 return;
1676 }
1677 NodeSets.clear();
1678 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1679}
1680
1681/// Add the nodes that do not belong to a recurrence set into groups
1682/// based upon connected components.
1683void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1684 SetVector<SUnit *> NodesAdded;
1686 // Add the nodes that are on a path between the previous node sets and
1687 // the current node set.
1688 for (NodeSet &I : NodeSets) {
1690 // Add the nodes from the current node set to the previous node set.
1691 if (succ_L(I, N)) {
1693 for (SUnit *NI : N) {
1694 Visited.clear();
1695 computePath(NI, Path, NodesAdded, I, Visited);
1696 }
1697 if (!Path.empty())
1698 I.insert(Path.begin(), Path.end());
1699 }
1700 // Add the nodes from the previous node set to the current node set.
1701 N.clear();
1702 if (succ_L(NodesAdded, N)) {
1704 for (SUnit *NI : N) {
1705 Visited.clear();
1706 computePath(NI, Path, I, NodesAdded, Visited);
1707 }
1708 if (!Path.empty())
1709 I.insert(Path.begin(), Path.end());
1710 }
1711 NodesAdded.insert(I.begin(), I.end());
1712 }
1713
1714 // Create a new node set with the connected nodes of any successor of a node
1715 // in a recurrent set.
1716 NodeSet NewSet;
1718 if (succ_L(NodesAdded, N))
1719 for (SUnit *I : N)
1720 addConnectedNodes(I, NewSet, NodesAdded);
1721 if (!NewSet.empty())
1722 NodeSets.push_back(NewSet);
1723
1724 // Create a new node set with the connected nodes of any predecessor of a node
1725 // in a recurrent set.
1726 NewSet.clear();
1727 if (pred_L(NodesAdded, N))
1728 for (SUnit *I : N)
1729 addConnectedNodes(I, NewSet, NodesAdded);
1730 if (!NewSet.empty())
1731 NodeSets.push_back(NewSet);
1732
1733 // Create new nodes sets with the connected nodes any remaining node that
1734 // has no predecessor.
1735 for (SUnit &SU : SUnits) {
1736 if (NodesAdded.count(&SU) == 0) {
1737 NewSet.clear();
1738 addConnectedNodes(&SU, NewSet, NodesAdded);
1739 if (!NewSet.empty())
1740 NodeSets.push_back(NewSet);
1741 }
1742 }
1743}
1744
1745/// Add the node to the set, and add all of its connected nodes to the set.
1746void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1747 SetVector<SUnit *> &NodesAdded) {
1748 NewSet.insert(SU);
1749 NodesAdded.insert(SU);
1750 for (auto &SI : SU->Succs) {
1751 SUnit *Successor = SI.getSUnit();
1752 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
1753 NodesAdded.count(Successor) == 0)
1754 addConnectedNodes(Successor, NewSet, NodesAdded);
1755 }
1756 for (auto &PI : SU->Preds) {
1757 SUnit *Predecessor = PI.getSUnit();
1758 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1759 addConnectedNodes(Predecessor, NewSet, NodesAdded);
1760 }
1761}
1762
1763/// Return true if Set1 contains elements in Set2. The elements in common
1764/// are returned in a different container.
1765static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1767 Result.clear();
1768 for (SUnit *SU : Set1) {
1769 if (Set2.count(SU) != 0)
1770 Result.insert(SU);
1771 }
1772 return !Result.empty();
1773}
1774
1775/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Two node-sets with the same initial node belong to one recurrence.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger RecMII of the two sets in the merged set.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // Erase shifts the remaining sets down, so J already refers to the
        // next element and is not incremented; refresh the end iterator.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
1795
1796/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from the later set J any node already present in set I.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // Erase shifts the remaining sets down, so J already refers to the
        // next element and is not incremented; refresh the end iterator.
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
1811
1812/// Compute an ordered list of the dependence graph nodes, which
1813/// indicates the order that the nodes will be scheduled. This is a
1814/// two-level algorithm. First, a partial order is created, which
1815/// consists of a list of sets ordered from highest to lowest priority.
1816void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1818 NodeOrder.clear();
1819
1820 for (auto &Nodes : NodeSets) {
1821 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1822 OrderKind Order;
1824 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
1825 R.insert(N.begin(), N.end());
1826 Order = BottomUp;
1827 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
1828 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
1829 R.insert(N.begin(), N.end());
1830 Order = TopDown;
1831 LLVM_DEBUG(dbgs() << " Top down (succs) ");
1832 } else if (isIntersect(N, Nodes, R)) {
1833 // If some of the successors are in the existing node-set, then use the
1834 // top-down ordering.
1835 Order = TopDown;
1836 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
1837 } else if (NodeSets.size() == 1) {
1838 for (const auto &N : Nodes)
1839 if (N->Succs.size() == 0)
1840 R.insert(N);
1841 Order = BottomUp;
1842 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
1843 } else {
1844 // Find the node with the highest ASAP.
1845 SUnit *maxASAP = nullptr;
1846 for (SUnit *SU : Nodes) {
1847 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
1848 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
1849 maxASAP = SU;
1850 }
1851 R.insert(maxASAP);
1852 Order = BottomUp;
1853 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
1854 }
1855
1856 while (!R.empty()) {
1857 if (Order == TopDown) {
1858 // Choose the node with the maximum height. If more than one, choose
1859 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
1860 // choose the node with the lowest MOV.
1861 while (!R.empty()) {
1862 SUnit *maxHeight = nullptr;
1863 for (SUnit *I : R) {
1864 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
1865 maxHeight = I;
1866 else if (getHeight(I) == getHeight(maxHeight) &&
1868 maxHeight = I;
1869 else if (getHeight(I) == getHeight(maxHeight) &&
1871 getZeroLatencyHeight(maxHeight) &&
1872 getMOV(I) < getMOV(maxHeight))
1873 maxHeight = I;
1874 }
1875 NodeOrder.insert(maxHeight);
1876 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
1877 R.remove(maxHeight);
1878 for (const auto &I : maxHeight->Succs) {
1879 if (Nodes.count(I.getSUnit()) == 0)
1880 continue;
1881 if (NodeOrder.contains(I.getSUnit()))
1882 continue;
1883 if (ignoreDependence(I, false))
1884 continue;
1885 R.insert(I.getSUnit());
1886 }
1887 // Back-edges are predecessors with an anti-dependence.
1888 for (const auto &I : maxHeight->Preds) {
1889 if (I.getKind() != SDep::Anti)
1890 continue;
1891 if (Nodes.count(I.getSUnit()) == 0)
1892 continue;
1893 if (NodeOrder.contains(I.getSUnit()))
1894 continue;
1895 R.insert(I.getSUnit());
1896 }
1897 }
1898 Order = BottomUp;
1899 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
1901 if (pred_L(NodeOrder, N, &Nodes))
1902 R.insert(N.begin(), N.end());
1903 } else {
1904 // Choose the node with the maximum depth. If more than one, choose
1905 // the node with the maximum ZeroLatencyDepth. If still more than one,
1906 // choose the node with the lowest MOV.
1907 while (!R.empty()) {
1908 SUnit *maxDepth = nullptr;
1909 for (SUnit *I : R) {
1910 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
1911 maxDepth = I;
1912 else if (getDepth(I) == getDepth(maxDepth) &&
1914 maxDepth = I;
1915 else if (getDepth(I) == getDepth(maxDepth) &&
1917 getMOV(I) < getMOV(maxDepth))
1918 maxDepth = I;
1919 }
1920 NodeOrder.insert(maxDepth);
1921 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
1922 R.remove(maxDepth);
1923 if (Nodes.isExceedSU(maxDepth)) {
1924 Order = TopDown;
1925 R.clear();
1926 R.insert(Nodes.getNode(0));
1927 break;
1928 }
1929 for (const auto &I : maxDepth->Preds) {
1930 if (Nodes.count(I.getSUnit()) == 0)
1931 continue;
1932 if (NodeOrder.contains(I.getSUnit()))
1933 continue;
1934 R.insert(I.getSUnit());
1935 }
1936 // Back-edges are predecessors with an anti-dependence.
1937 for (const auto &I : maxDepth->Succs) {
1938 if (I.getKind() != SDep::Anti)
1939 continue;
1940 if (Nodes.count(I.getSUnit()) == 0)
1941 continue;
1942 if (NodeOrder.contains(I.getSUnit()))
1943 continue;
1944 R.insert(I.getSUnit());
1945 }
1946 }
1947 Order = TopDown;
1948 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
1950 if (succ_L(NodeOrder, N, &Nodes))
1951 R.insert(N.begin(), N.end());
1952 }
1953 }
1954 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
1955 }
1956
1957 LLVM_DEBUG({
1958 dbgs() << "Node order: ";
1959 for (SUnit *I : NodeOrder)
1960 dbgs() << " " << I->NodeNum << " ";
1961 dbgs() << "\n";
1962 });
1963}
1964
1965/// Process the nodes in the computed order and create the pipelined schedule
1966/// of the instructions, if possible. Return true if a schedule is found.
1967bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
1968
1969 if (NodeOrder.empty()){
1970 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
1971 return false;
1972 }
1973
1974 bool scheduleFound = false;
1975 // Keep increasing II until a valid schedule is found.
1976 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
1977 Schedule.reset();
1978 Schedule.setInitiationInterval(II);
1979 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
1980
1981 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
1982 SetVector<SUnit *>::iterator NE = NodeOrder.end();
1983 do {
1984 SUnit *SU = *NI;
1985
1986 // Compute the schedule time for the instruction, which is based
1987 // upon the scheduled time for any predecessors/successors.
1988 int EarlyStart = INT_MIN;
1989 int LateStart = INT_MAX;
1990 // These values are set when the size of the schedule window is limited
1991 // due to chain dependences.
1992 int SchedEnd = INT_MAX;
1993 int SchedStart = INT_MIN;
1994 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
1995 II, this);
1996 LLVM_DEBUG({
1997 dbgs() << "\n";
1998 dbgs() << "Inst (" << SU->NodeNum << ") ";
1999 SU->getInstr()->dump();
2000 dbgs() << "\n";
2001 });
2002 LLVM_DEBUG({
2003 dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2004 LateStart, SchedEnd, SchedStart);
2005 });
2006
2007 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2008 SchedStart > LateStart)
2009 scheduleFound = false;
2010 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2011 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2012 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2013 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2014 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2015 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2016 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2017 SchedEnd =
2018 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2019 // When scheduling a Phi it is better to start at the late cycle and go
2020 // backwards. The default order may insert the Phi too far away from
2021 // its first dependence.
2022 if (SU->getInstr()->isPHI())
2023 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2024 else
2025 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2026 } else {
2027 int FirstCycle = Schedule.getFirstCycle();
2028 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2029 FirstCycle + getASAP(SU) + II - 1, II);
2030 }
2031 // Even if we find a schedule, make sure the schedule doesn't exceed the
2032 // allowable number of stages. We keep trying if this happens.
2033 if (scheduleFound)
2034 if (SwpMaxStages > -1 &&
2035 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2036 scheduleFound = false;
2037
2038 LLVM_DEBUG({
2039 if (!scheduleFound)
2040 dbgs() << "\tCan't schedule\n";
2041 });
2042 } while (++NI != NE && scheduleFound);
2043
2044 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2045 if (scheduleFound)
2046 scheduleFound =
2047 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2048
2049 // If a schedule is found, check if it is a valid schedule too.
2050 if (scheduleFound)
2051 scheduleFound = Schedule.isValidSchedule(this);
2052 }
2053
2054 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2055 << " (II=" << Schedule.getInitiationInterval()
2056 << ")\n");
2057
2058 if (scheduleFound) {
2059 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2060 if (!scheduleFound)
2061 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2062 }
2063
2064 if (scheduleFound) {
2065 Schedule.finalizeSchedule(this);
2066 Pass.ORE->emit([&]() {
2068 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2069 << "Schedule found with Initiation Interval: "
2070 << ore::NV("II", Schedule.getInitiationInterval())
2071 << ", MaxStageCount: "
2072 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2073 });
2074 } else
2075 Schedule.reset();
2076
2077 return scheduleFound && Schedule.getMaxStageCount() > 0;
2078}
2079
2080/// Return true if we can compute the amount the instruction changes
2081/// during each iteration. Set Delta to the amount of the change.
2082bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2084 const MachineOperand *BaseOp;
2085 int64_t Offset;
2086 bool OffsetIsScalable;
2087 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2088 return false;
2089
2090 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2091 if (OffsetIsScalable)
2092 return false;
2093
2094 if (!BaseOp->isReg())
2095 return false;
2096
2097 Register BaseReg = BaseOp->getReg();
2098
2100 // Check if there is a Phi. If so, get the definition in the loop.
2101 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2102 if (BaseDef && BaseDef->isPHI()) {
2103 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2104 BaseDef = MRI.getVRegDef(BaseReg);
2105 }
2106 if (!BaseDef)
2107 return false;
2108
2109 int D = 0;
2110 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2111 return false;
2112
2113 Delta = D;
2114 return true;
2115}
2116
2117/// Check if we can change the instruction to use an offset value from the
2118/// previous iteration. If so, return true and set the base and offset values
2119/// so that we can rewrite the load, if necessary.
2120/// v1 = Phi(v0, v3)
2121/// v2 = load v1, 0
2122/// v3 = post_store v1, 4, x
2123/// This function enables the load to be rewritten as v2 = load v3, 4.
2124bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2125 unsigned &BasePos,
2126 unsigned &OffsetPos,
2127 unsigned &NewBase,
2128 int64_t &Offset) {
2129 // Get the load instruction.
2130 if (TII->isPostIncrement(*MI))
2131 return false;
2132 unsigned BasePosLd, OffsetPosLd;
2133 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2134 return false;
2135 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2136
2137 // Look for the Phi instruction.
2138 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2139 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2140 if (!Phi || !Phi->isPHI())
2141 return false;
2142 // Get the register defined in the loop block.
2143 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2144 if (!PrevReg)
2145 return false;
2146
2147 // Check for the post-increment load/store instruction.
2148 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2149 if (!PrevDef || PrevDef == MI)
2150 return false;
2151
2152 if (!TII->isPostIncrement(*PrevDef))
2153 return false;
2154
2155 unsigned BasePos1 = 0, OffsetPos1 = 0;
2156 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2157 return false;
2158
2159 // Make sure that the instructions do not access the same memory location in
2160 // the next iteration.
2161 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2162 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2164 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2165 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2166 MF.deleteMachineInstr(NewMI);
2167 if (!Disjoint)
2168 return false;
2169
2170 // Set the return value once we determine that we return true.
2171 BasePos = BasePosLd;
2172 OffsetPos = OffsetPosLd;
2173 NewBase = PrevReg;
2174 Offset = StoreOffset;
2175 return true;
2176}
2177
2178/// Apply changes to the instruction if needed. The changes are need
2179/// to improve the scheduling and depend up on the final schedule.
2181 SMSchedule &Schedule) {
2182 SUnit *SU = getSUnit(MI);
2184 InstrChanges.find(SU);
2185 if (It != InstrChanges.end()) {
2186 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2187 unsigned BasePos, OffsetPos;
2188 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2189 return;
2190 Register BaseReg = MI->getOperand(BasePos).getReg();
2191 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2192 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2193 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2194 int BaseStageNum = Schedule.stageScheduled(SU);
2195 int BaseCycleNum = Schedule.cycleScheduled(SU);
2196 if (BaseStageNum < DefStageNum) {
2198 int OffsetDiff = DefStageNum - BaseStageNum;
2199 if (DefCycleNum < BaseCycleNum) {
2200 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2201 if (OffsetDiff > 0)
2202 --OffsetDiff;
2203 }
2204 int64_t NewOffset =
2205 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2206 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2207 SU->setInstr(NewMI);
2208 MISUnitMap[NewMI] = SU;
2209 NewMIs[MI] = NewMI;
2210 }
2211 }
2212}
2213
2214/// Return the instruction in the loop that defines the register.
2215/// If the definition is a Phi, then follow the Phi operand to
2216/// the instruction in the loop.
2217MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2219 MachineInstr *Def = MRI.getVRegDef(Reg);
2220 while (Def->isPHI()) {
2221 if (!Visited.insert(Def).second)
2222 break;
2223 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2224 if (Def->getOperand(i + 1).getMBB() == BB) {
2225 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2226 break;
2227 }
2228 }
2229 return Def;
2230}
2231
2232/// Return true for an order or output dependence that is loop carried
2233/// potentially. A dependence is loop carried if the destination defines a valu
2234/// that may be used or defined by the source in a subsequent iteration.
2236 bool isSucc) {
2237 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2238 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2239 return false;
2240
2242 return true;
2243
2244 if (Dep.getKind() == SDep::Output)
2245 return true;
2246
2247 MachineInstr *SI = Source->getInstr();
2248 MachineInstr *DI = Dep.getSUnit()->getInstr();
2249 if (!isSucc)
2250 std::swap(SI, DI);
2251 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2252
2253 // Assume ordered loads and stores may have a loop carried dependence.
2254 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2255 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2256 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2257 return true;
2258
2259 // Only chain dependences between a load and store can be loop carried.
2260 if (!DI->mayStore() || !SI->mayLoad())
2261 return false;
2262
2263 unsigned DeltaS, DeltaD;
2264 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2265 return true;
2266
2267 const MachineOperand *BaseOpS, *BaseOpD;
2268 int64_t OffsetS, OffsetD;
2269 bool OffsetSIsScalable, OffsetDIsScalable;
2271 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2272 TRI) ||
2273 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2274 TRI))
2275 return true;
2276
2277 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2278 "Expected offsets to be byte offsets");
2279
2280 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2281 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2282 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2283 return true;
2284
2285 unsigned InitValS = 0;
2286 unsigned LoopValS = 0;
2287 unsigned InitValD = 0;
2288 unsigned LoopValD = 0;
2289 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2290 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2291 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2292 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2293
2294 if (!InitDefS->isIdenticalTo(*InitDefD))
2295 return true;
2296
2297 // Check that the base register is incremented by a constant value for each
2298 // iteration.
2299 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2300 int D = 0;
2301 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2302 return true;
2303
2304 uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
2305 uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
2306
2307 // This is the main test, which checks the offset values and the loop
2308 // increment value to determine if the accesses may be loop carried.
2309 if (AccessSizeS == MemoryLocation::UnknownSize ||
2310 AccessSizeD == MemoryLocation::UnknownSize)
2311 return true;
2312
2313 if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
2314 return true;
2315
2316 return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
2317}
2318
2319void SwingSchedulerDAG::postprocessDAG() {
2320 for (auto &M : Mutations)
2321 M->apply(this);
2322}
2323
2324/// Try to schedule the node at the specified StartCycle and continue
2325/// until the node is schedule or the EndCycle is reached. This function
2326/// returns true if the node is scheduled. This routine may search either
2327/// forward or backward for a place to insert the instruction based upon
2328/// the relative values of StartCycle and EndCycle.
2329bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2330 bool forward = true;
2331 LLVM_DEBUG({
2332 dbgs() << "Trying to insert node between " << StartCycle << " and "
2333 << EndCycle << " II: " << II << "\n";
2334 });
2335 if (StartCycle > EndCycle)
2336 forward = false;
2337
2338 // The terminating condition depends on the direction.
2339 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2340 for (int curCycle = StartCycle; curCycle != termCycle;
2341 forward ? ++curCycle : --curCycle) {
2342
2343 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2344 ProcItinResources.canReserveResources(*SU, curCycle)) {
2345 LLVM_DEBUG({
2346 dbgs() << "\tinsert at cycle " << curCycle << " ";
2347 SU->getInstr()->dump();
2348 });
2349
2350 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2351 ProcItinResources.reserveResources(*SU, curCycle);
2352 ScheduledInstrs[curCycle].push_back(SU);
2353 InstrToCycle.insert(std::make_pair(SU, curCycle));
2354 if (curCycle > LastCycle)
2355 LastCycle = curCycle;
2356 if (curCycle < FirstCycle)
2357 FirstCycle = curCycle;
2358 return true;
2359 }
2360 LLVM_DEBUG({
2361 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2362 SU->getInstr()->dump();
2363 });
2364 }
2365 return false;
2366}
2367
2368// Return the cycle of the earliest scheduled instruction in the chain.
2371 SmallVector<SDep, 8> Worklist;
2372 Worklist.push_back(Dep);
2373 int EarlyCycle = INT_MAX;
2374 while (!Worklist.empty()) {
2375 const SDep &Cur = Worklist.pop_back_val();
2376 SUnit *PrevSU = Cur.getSUnit();
2377 if (Visited.count(PrevSU))
2378 continue;
2379 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2380 if (it == InstrToCycle.end())
2381 continue;
2382 EarlyCycle = std::min(EarlyCycle, it->second);
2383 for (const auto &PI : PrevSU->Preds)
2384 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2385 Worklist.push_back(PI);
2386 Visited.insert(PrevSU);
2387 }
2388 return EarlyCycle;
2389}
2390
2391// Return the cycle of the latest scheduled instruction in the chain.
2394 SmallVector<SDep, 8> Worklist;
2395 Worklist.push_back(Dep);
2396 int LateCycle = INT_MIN;
2397 while (!Worklist.empty()) {
2398 const SDep &Cur = Worklist.pop_back_val();
2399 SUnit *SuccSU = Cur.getSUnit();
2400 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2401 continue;
2402 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2403 if (it == InstrToCycle.end())
2404 continue;
2405 LateCycle = std::max(LateCycle, it->second);
2406 for (const auto &SI : SuccSU->Succs)
2407 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2408 Worklist.push_back(SI);
2409 Visited.insert(SuccSU);
2410 }
2411 return LateCycle;
2412}
2413
2414/// If an instruction has a use that spans multiple iterations, then
2415/// return true. These instructions are characterized by having a back-ege
2416/// to a Phi, which contains a reference to another Phi.
2418 for (auto &P : SU->Preds)
2419 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2420 for (auto &S : P.getSUnit()->Succs)
2421 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2422 return P.getSUnit();
2423 return nullptr;
2424}
2425
2426/// Compute the scheduling start slot for the instruction. The start slot
2427/// depends on any predecessor or successor nodes scheduled already.
2428void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2429 int *MinEnd, int *MaxStart, int II,
2430 SwingSchedulerDAG *DAG) {
2431 // Iterate over each instruction that has been scheduled already. The start
2432 // slot computation depends on whether the previously scheduled instruction
2433 // is a predecessor or successor of the specified instruction.
2434 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2435
2436 // Iterate over each instruction in the current cycle.
2437 for (SUnit *I : getInstructions(cycle)) {
2438 // Because we're processing a DAG for the dependences, we recognize
2439 // the back-edge in recurrences by anti dependences.
2440 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2441 const SDep &Dep = SU->Preds[i];
2442 if (Dep.getSUnit() == I) {
2443 if (!DAG->isBackedge(SU, Dep)) {
2444 int EarlyStart = cycle + Dep.getLatency() -
2445 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2446 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2447 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2448 int End = earliestCycleInChain(Dep) + (II - 1);
2449 *MinEnd = std::min(*MinEnd, End);
2450 }
2451 } else {
2452 int LateStart = cycle - Dep.getLatency() +
2453 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2454 *MinLateStart = std::min(*MinLateStart, LateStart);
2455 }
2456 }
2457 // For instruction that requires multiple iterations, make sure that
2458 // the dependent instruction is not scheduled past the definition.
2459 SUnit *BE = multipleIterations(I, DAG);
2460 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2461 !SU->isPred(I))
2462 *MinLateStart = std::min(*MinLateStart, cycle);
2463 }
2464 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2465 if (SU->Succs[i].getSUnit() == I) {
2466 const SDep &Dep = SU->Succs[i];
2467 if (!DAG->isBackedge(SU, Dep)) {
2468 int LateStart = cycle - Dep.getLatency() +
2469 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2470 *MinLateStart = std::min(*MinLateStart, LateStart);
2471 if (DAG->isLoopCarriedDep(SU, Dep)) {
2472 int Start = latestCycleInChain(Dep) + 1 - II;
2473 *MaxStart = std::max(*MaxStart, Start);
2474 }
2475 } else {
2476 int EarlyStart = cycle + Dep.getLatency() -
2477 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2478 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2479 }
2480 }
2481 }
2482 }
2483 }
2484}
2485
2486/// Order the instructions within a cycle so that the definitions occur
2487/// before the uses. Returns true if the instruction is added to the start
2488/// of the list, or false if added to the end.
2490 std::deque<SUnit *> &Insts) {
2491 MachineInstr *MI = SU->getInstr();
2492 bool OrderBeforeUse = false;
2493 bool OrderAfterDef = false;
2494 bool OrderBeforeDef = false;
2495 unsigned MoveDef = 0;
2496 unsigned MoveUse = 0;
2497 int StageInst1 = stageScheduled(SU);
2498
2499 unsigned Pos = 0;
2500 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2501 ++I, ++Pos) {
2502 for (MachineOperand &MO : MI->operands()) {
2503 if (!MO.isReg() || !MO.getReg().isVirtual())
2504 continue;
2505
2506 Register Reg = MO.getReg();
2507 unsigned BasePos, OffsetPos;
2508 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2509 if (MI->getOperand(BasePos).getReg() == Reg)
2510 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2511 Reg = NewReg;
2512 bool Reads, Writes;
2513 std::tie(Reads, Writes) =
2514 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2515 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2516 OrderBeforeUse = true;
2517 if (MoveUse == 0)
2518 MoveUse = Pos;
2519 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2520 // Add the instruction after the scheduled instruction.
2521 OrderAfterDef = true;
2522 MoveDef = Pos;
2523 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2524 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2525 OrderBeforeUse = true;
2526 if (MoveUse == 0)
2527 MoveUse = Pos;
2528 } else {
2529 OrderAfterDef = true;
2530 MoveDef = Pos;
2531 }
2532 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2533 OrderBeforeUse = true;
2534 if (MoveUse == 0)
2535 MoveUse = Pos;
2536 if (MoveUse != 0) {
2537 OrderAfterDef = true;
2538 MoveDef = Pos - 1;
2539 }
2540 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2541 // Add the instruction before the scheduled instruction.
2542 OrderBeforeUse = true;
2543 if (MoveUse == 0)
2544 MoveUse = Pos;
2545 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2546 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2547 if (MoveUse == 0) {
2548 OrderBeforeDef = true;
2549 MoveUse = Pos;
2550 }
2551 }
2552 }
2553 // Check for order dependences between instructions. Make sure the source
2554 // is ordered before the destination.
2555 for (auto &S : SU->Succs) {
2556 if (S.getSUnit() != *I)
2557 continue;
2558 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2559 OrderBeforeUse = true;
2560 if (Pos < MoveUse)
2561 MoveUse = Pos;
2562 }
2563 // We did not handle HW dependences in previous for loop,
2564 // and we normally set Latency = 0 for Anti deps,
2565 // so may have nodes in same cycle with Anti denpendent on HW regs.
2566 else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2567 OrderBeforeUse = true;
2568 if ((MoveUse == 0) || (Pos < MoveUse))
2569 MoveUse = Pos;
2570 }
2571 }
2572 for (auto &P : SU->Preds) {
2573 if (P.getSUnit() != *I)
2574 continue;
2575 if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2576 OrderAfterDef = true;
2577 MoveDef = Pos;
2578 }
2579 }
2580 }
2581
2582 // A circular dependence.
2583 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
2584 OrderBeforeUse = false;
2585
2586 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
2587 // to a loop-carried dependence.
2588 if (OrderBeforeDef)
2589 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
2590
2591 // The uncommon case when the instruction order needs to be updated because
2592 // there is both a use and def.
2593 if (OrderBeforeUse && OrderAfterDef) {
2594 SUnit *UseSU = Insts.at(MoveUse);
2595 SUnit *DefSU = Insts.at(MoveDef);
2596 if (MoveUse > MoveDef) {
2597 Insts.erase(Insts.begin() + MoveUse);
2598 Insts.erase(Insts.begin() + MoveDef);
2599 } else {
2600 Insts.erase(Insts.begin() + MoveDef);
2601 Insts.erase(Insts.begin() + MoveUse);
2602 }
2603 orderDependence(SSD, UseSU, Insts);
2604 orderDependence(SSD, SU, Insts);
2605 orderDependence(SSD, DefSU, Insts);
2606 return;
2607 }
2608 // Put the new instruction first if there is a use in the list. Otherwise,
2609 // put it at the end of the list.
2610 if (OrderBeforeUse)
2611 Insts.push_front(SU);
2612 else
2613 Insts.push_back(SU);
2614}
2615
2616/// Return true if the scheduled Phi has a loop carried operand.
2618 if (!Phi.isPHI())
2619 return false;
2620 assert(Phi.isPHI() && "Expecting a Phi.");
2621 SUnit *DefSU = SSD->getSUnit(&Phi);
2622 unsigned DefCycle = cycleScheduled(DefSU);
2623 int DefStage = stageScheduled(DefSU);
2624
2625 unsigned InitVal = 0;
2626 unsigned LoopVal = 0;
2627 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
2628 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
2629 if (!UseSU)
2630 return true;
2631 if (UseSU->getInstr()->isPHI())
2632 return true;
2633 unsigned LoopCycle = cycleScheduled(UseSU);
2634 int LoopStage = stageScheduled(UseSU);
2635 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
2636}
2637
2638/// Return true if the instruction is a definition that is loop carried
2639/// and defines the use on the next iteration.
2640/// v1 = phi(v2, v3)
2641/// (Def) v3 = op v1
2642/// (MO) = v1
2643/// If MO appears before Def, then then v1 and v3 may get assigned to the same
2644/// register.
2646 MachineInstr *Def, MachineOperand &MO) {
2647 if (!MO.isReg())
2648 return false;
2649 if (Def->isPHI())
2650 return false;
2651 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
2652 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
2653 return false;
2654 if (!isLoopCarried(SSD, *Phi))
2655 return false;
2656 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
2657 for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
2658 MachineOperand &DMO = Def->getOperand(i);
2659 if (!DMO.isReg() || !DMO.isDef())
2660 continue;
2661 if (DMO.getReg() == LoopReg)
2662 return true;
2663 }
2664 return false;
2665}
2666
2667/// Determine transitive dependences of unpipelineable instructions
2670 SmallSet<SUnit *, 8> DoNotPipeline;
2671 SmallVector<SUnit *, 8> Worklist;
2672
2673 for (auto &SU : SSD->SUnits)
2674 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
2675 Worklist.push_back(&SU);
2676
2677 while (!Worklist.empty()) {
2678 auto SU = Worklist.pop_back_val();
2679 if (DoNotPipeline.count(SU))
2680 continue;
2681 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
2682 DoNotPipeline.insert(SU);
2683 for (auto &Dep : SU->Preds)
2684 Worklist.push_back(Dep.getSUnit());
2685 if (SU->getInstr()->isPHI())
2686 for (auto &Dep : SU->Succs)
2687 if (Dep.getKind() == SDep::Anti)
2688 Worklist.push_back(Dep.getSUnit());
2689 }
2690 return DoNotPipeline;
2691}
2692
2693// Determine all instructions upon which any unpipelineable instruction depends
2694// and ensure that they are in stage 0. If unable to do so, return false.
2698
2699 int NewLastCycle = INT_MIN;
2700 for (SUnit &SU : SSD->SUnits) {
2701 if (!SU.isInstr())
2702 continue;
2703 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
2704 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
2705 continue;
2706 }
2707
2708 // Put the non-pipelined instruction as early as possible in the schedule
2709 int NewCycle = getFirstCycle();
2710 for (auto &Dep : SU.Preds)
2711 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
2712
2713 int OldCycle = InstrToCycle[&SU];
2714 if (OldCycle != NewCycle) {
2715 InstrToCycle[&SU] = NewCycle;
2716 auto &OldS = getInstructions(OldCycle);
2717 llvm::erase_value(OldS, &SU);
2718 getInstructions(NewCycle).emplace_back(&SU);
2719 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
2720 << ") is not pipelined; moving from cycle " << OldCycle
2721 << " to " << NewCycle << " Instr:" << *SU.getInstr());
2722 }
2723 NewLastCycle = std::max(NewLastCycle, NewCycle);
2724 }
2725 LastCycle = NewLastCycle;
2726 return true;
2727}
2728
2729// Check if the generated schedule is valid. This function checks if
2730// an instruction that uses a physical register is scheduled in a
2731// different stage than the definition. The pipeliner does not handle
2732// physical register values that may cross a basic block boundary.
2733// Furthermore, if a physical def/use pair is assigned to the same
2734// cycle, orderDependence does not guarantee def/use ordering, so that
2735// case should be considered invalid. (The test checks for both
2736// earlier and same-cycle use to be more robust.)
2738 for (SUnit &SU : SSD->SUnits) {
2739 if (!SU.hasPhysRegDefs)
2740 continue;
2741 int StageDef = stageScheduled(&SU);
2742 int CycleDef = InstrToCycle[&SU];
2743 assert(StageDef != -1 && "Instruction should have been scheduled.");
2744 for (auto &SI : SU.Succs)
2745 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
2746 if (Register::isPhysicalRegister(SI.getReg())) {
2747 if (stageScheduled(SI.getSUnit()) != StageDef)
2748 return false;
2749 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
2750 return false;
2751 }
2752 }
2753 return true;
2754}
2755
2756/// A property of the node order in swing-modulo-scheduling is
2757/// that for nodes outside circuits the following holds:
2758/// none of them is scheduled after both a successor and a
2759/// predecessor.
2760/// The method below checks whether the property is met.
2761/// If not, debug information is printed and statistics information updated.
2762/// Note that we do not use an assert statement.
2763/// The reason is that although an invalid node oder may prevent
2764/// the pipeliner from finding a pipelined schedule for arbitrary II,
2765/// it does not lead to the generation of incorrect code.
2766void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
2767
2768 // a sorted vector that maps each SUnit to its index in the NodeOrder
2769 typedef std::pair<SUnit *, unsigned> UnitIndex;
2770 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
2771
2772 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
2773 Indices.push_back(std::make_pair(NodeOrder[i], i));
2774
2775 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
2776 return std::get<0>(i1) < std::get<0>(i2);
2777 };
2778
2779 // sort, so that we can perform a binary search
2780 llvm::sort(Indices, CompareKey);
2781
2782 bool Valid = true;
2783 (void)Valid;
2784 // for each SUnit in the NodeOrder, check whether
2785 // it appears after both a successor and a predecessor
2786 // of the SUnit. If this is the case, and the SUnit
2787 // is not part of circuit, then the NodeOrder is not
2788 // valid.
2789 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
2790 SUnit *SU = NodeOrder[i];
2791 unsigned Index = i;
2792
2793 bool PredBefore = false;
2794 bool SuccBefore = false;
2795
2796 SUnit *Succ;
2797 SUnit *Pred;
2798 (void)Succ;
2799 (void)Pred;
2800
2801 for (SDep &PredEdge : SU->Preds) {
2802 SUnit *PredSU = PredEdge.getSUnit();
2803 unsigned PredIndex = std::get<1>(
2804 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
2805 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
2806 PredBefore = true;
2807 Pred = PredSU;
2808 break;
2809 }
2810 }
2811
2812 for (SDep &SuccEdge : SU->Succs) {
2813 SUnit *SuccSU = SuccEdge.getSUnit();
2814 // Do not process a boundary node, it was not included in NodeOrder,
2815 // hence not in Indices either, call to std::lower_bound() below will
2816 // return Indices.end().
2817 if (SuccSU->isBoundaryNode())
2818 continue;
2819 unsigned SuccIndex = std::get<1>(
2820 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
2821 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
2822 SuccBefore = true;
2823 Succ = SuccSU;
2824 break;
2825 }
2826 }
2827
2828 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
2829 // instructions in circuits are allowed to be scheduled
2830 // after both a successor and predecessor.
2831 bool InCircuit = llvm::any_of(
2832 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
2833 if (InCircuit)
2834 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
2835 else {
2836 Valid = false;
2837 NumNodeOrderIssues++;
2838 LLVM_DEBUG(dbgs() << "Predecessor ";);
2839 }
2840 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
2841 << " are scheduled before node " << SU->NodeNum
2842 << "\n";);
2843 }
2844 }
2845
2846 LLVM_DEBUG({
2847 if (!Valid)
2848 dbgs() << "Invalid node order found!\n";
2849 });
2850}
2851
2852/// Attempt to fix the degenerate cases when the instruction serialization
2853/// causes the register lifetimes to overlap. For example,
2854/// p' = store_pi(p, b)
2855/// = load p, offset
2856/// In this case p and p' overlap, which means that two registers are needed.
2857/// Instead, this function changes the load to use p' and updates the offset.
2858void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
2859 unsigned OverlapReg = 0;
2860 unsigned NewBaseReg = 0;
2861 for (SUnit *SU : Instrs) {
2862 MachineInstr *MI = SU->getInstr();
2863 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
2864 const MachineOperand &MO = MI->getOperand(i);
2865 // Look for an instruction that uses p. The instruction occurs in the
2866 // same cycle but occurs later in the serialized order.
2867 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
2868 // Check that the instruction appears in the InstrChanges structure,
2869 // which contains instructions that can have the offset updated.
2871 InstrChanges.find(SU);
2872 if (It != InstrChanges.end()) {
2873 unsigned BasePos, OffsetPos;
2874 // Update the base register and adjust the offset.
2875 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
2877 NewMI->getOperand(BasePos).setReg(NewBaseReg);
2878 int64_t NewOffset =
2879 MI->getOperand(OffsetPos).getImm() - It->second.second;
2880 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2881 SU->setInstr(NewMI);
2882 MISUnitMap[NewMI] = SU;
2883 NewMIs[MI] = NewMI;
2884 }
2885 }
2886 OverlapReg = 0;
2887 NewBaseReg = 0;
2888 break;
2889 }
2890 // Look for an instruction of the form p' = op(p), which uses and defines
2891 // two virtual registers that get allocated to the same physical register.
2892 unsigned TiedUseIdx = 0;
2893 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
2894 // OverlapReg is p in the example above.
2895 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
2896 // NewBaseReg is p' in the example above.
2897 NewBaseReg = MI->getOperand(i).getReg();
2898 break;
2899 }
2900 }
2901 }
2902}
2903
2904/// After the schedule has been formed, call this function to combine
2905/// the instructions from the different stages/cycles. That is, this
2906/// function creates a schedule that represents a single iteration.
// NOTE(review): the defining line of SMSchedule::finalizeSchedule(
// SwingSchedulerDAG *SSD) (original line 2907) is missing from this
// extract; the statements below are its body.
2908 // Move all instructions to the first stage from later stages.
2909 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2910 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
2911 ++stage) {
2912 std::deque<SUnit *> &cycleInstrs =
2913 ScheduledInstrs[cycle + (stage * InitiationInterval)];
// Iterating the stage's list in reverse and pushing to the front keeps the
// stage's instructions in their original relative order once merged.
2914 for (SUnit *SU : llvm::reverse(cycleInstrs))
2915 ScheduledInstrs[cycle].push_front(SU);
2916 }
2917 }
2918
2919 // Erase all the elements in the later stages. Only one iteration should
2920 // remain in the scheduled list, and it contains all the instructions.
2921 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
2922 ScheduledInstrs.erase(cycle);
2923
2924 // Change the registers in instruction as specified in the InstrChanges
2925 // map. We need to use the new registers to create the correct order.
2926 for (const SUnit &SU : SSD->SUnits)
2927 SSD->applyInstrChange(SU.getInstr(), *this);
2928
2929 // Reorder the instructions in each cycle to fix and improve the
2930 // generated code.
2931 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
2932 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
// PHIs are collected separately so they end up at the front of the cycle.
2933 std::deque<SUnit *> newOrderPhi;
2934 for (SUnit *SU : cycleInstrs) {
2935 if (SU->getInstr()->isPHI())
2936 newOrderPhi.push_back(SU);
2937 }
// Non-PHIs are inserted via orderDependence so that definitions occur
// before their uses within the cycle.
2938 std::deque<SUnit *> newOrderI;
2939 for (SUnit *SU : cycleInstrs) {
2940 if (!SU->getInstr()->isPHI())
2941 orderDependence(SSD, SU, newOrderI);
2942 }
2943 // Replace the old order with the new order.
2944 cycleInstrs.swap(newOrderPhi);
2945 llvm::append_range(cycleInstrs, newOrderI);
2946 SSD->fixupRegisterOverlaps(cycleInstrs);
2947 }
2948
2949 LLVM_DEBUG(dump(););
2950}
2951
// NOTE(review): the signature line for NodeSet::print(raw_ostream &os)
// (original line 2952) is missing from this extract.  Prints the node-set
// summary (size, RecMII, max mobility/depth, colocation) and each member.
2953 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
2954 << " depth " << MaxDepth << " col " << Colocate << "\n";
// One line per SUnit: its node number followed by its machine instruction.
2955 for (const auto &I : Nodes)
2956 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
2957 os << "\n";
2958}
2959
2960#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2961/// Print the schedule information to the given output.
// NOTE(review): the signature line for SMSchedule::print(raw_ostream &os)
// (original line 2962) is missing from this extract.
2963 // Iterate over each cycle.
2964 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2965 // Iterate over each instruction in the cycle.
// NOTE(review): no end() check on the find result here — presumably every
// cycle in [first, final] has an entry in ScheduledInstrs; confirm.
2966 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
2967 for (SUnit *CI : cycleInstrs->second) {
// Each output line: cycle, stage, SUnit number, then the instruction text.
2968 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
2969 os << "(" << CI->NodeNum << ") ";
2970 CI->getInstr()->print(os);
2971 os << "\n";
2972 }
2973 }
2974}
2975
2976/// Utility function used for debugging to print the schedule.
2979
2980void ResourceManager::dumpMRT() const {
2981 LLVM_DEBUG({
2982 if (UseDFA)
2983 return;
2984 std::stringstream SS;
2985 SS << "MRT:\n";
2986 SS << std::setw(4) << "Slot";
2987 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
2988 SS << std::setw(3) << I;
2989 SS << std::setw(7) << "#Mops"
2990 << "\n";
2991 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
2992 SS << std::setw(4) << Slot;
2993 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
2994 SS << std::setw(3) << MRT[Slot][I];
2995 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
2996 }
2997 dbgs() << SS.str();
2998 });
2999}
3000#endif
3001
// NOTE(review): the first line of this definition,
// "void ResourceManager::initProcResourceVectors(" (original line 3002),
// is missing from this extract.  The function builds one bitmask per
// processor-resource kind in Masks.
3003 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3004 unsigned ProcResourceID = 0;
3005
3006 // We currently limit the resource kinds to 64 and below so that we can use
3007 // uint64_t for Masks
3008 assert(SM.getNumProcResourceKinds() < 64 &&
3009 "Too many kinds of resources, unsupported");
3010 // Create a unique bitmask for every processor resource unit.
3011 // Skip resource at index 0, since it always references 'InvalidUnit'.
3012 Masks.resize(SM.getNumProcResourceKinds());
// First pass: assign one distinct bit to every simple resource (those with
// no SubUnitsIdxBegin, i.e. not a group).
3013 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3014 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3015 if (Desc.SubUnitsIdxBegin)
3016 continue;
3017 Masks[I] = 1ULL << ProcResourceID;
3018 ProcResourceID++;
3019 }
3020 // Create a unique bitmask for every processor resource group.
// Second pass: a group gets its own fresh bit OR'ed with the bits of all of
// its sub-units (assigned in the first pass).
3021 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3022 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3023 if (!Desc.SubUnitsIdxBegin)
3024 continue;
3025 Masks[I] = 1ULL << ProcResourceID;
3026 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3027 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3028 ProcResourceID++;
3029 }
// Optional debug dump of the computed masks, gated by -pipeliner-show-mask.
3030 LLVM_DEBUG({
3031 if (SwpShowResMask) {
3032 dbgs() << "ProcResourceDesc:\n";
3033 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3034 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3035 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3036 ProcResource->Name, I, Masks[I],
3037 ProcResource->NumUnits);
3038 }
3039 dbgs() << " -----------------\n";
3040 }
3041 });
3042}
3043
// NOTE(review): the signature line,
// "bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {"
// (original line 3044), is missing from this extract.  The function checks
// whether the resources occupied by SU are available at Cycle.
3045 LLVM_DEBUG({
3046 if (SwpDebugResource)
3047 dbgs() << "canReserveResources:\n";
3048 });
// DFA-based model: query the packetizer state for this cycle's slot.
3049 if (UseDFA)
3050 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3051 ->canReserveResources(&SU.getInstr()->getDesc());
3052
3053 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3054 if (!SCDesc->isValid()) {
3055 LLVM_DEBUG({
3056 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3057 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3058 });
// No scheduling information: optimistically report resources as available.
3059 return true;
3060 }
3061
// Tentatively reserve, test for overbooking, then undo the reservation so
// this query leaves the MRT unchanged.
3062 reserveResources(SCDesc, Cycle);
3063 bool Result = !isOverbooked();
3064 unreserveResources(SCDesc, Cycle);
3065
3066 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3067 return Result;
3068}
3069
3070void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3071 LLVM_DEBUG({
3072 if (SwpDebugResource)
3073 dbgs() << "reserveResources:\n";
3074 });
3075 if (UseDFA)
3076 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3077 ->reserveResources(&SU.getInstr()->getDesc());
3078
3079 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3080 if (!SCDesc->isValid()) {
3081 LLVM_DEBUG({
3082 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3083 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3084 });
3085 return;
3086 }
3087
3088 reserveResources(SCDesc, Cycle);
3089
3090 LLVM_DEBUG({
3091 if (SwpDebugResource) {
3092 dumpMRT();
3093 dbgs() << "reserveResources: done!\n\n";
3094 }
3095 });
3096}
3097
3098void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3099 int Cycle) {
3100 assert(!UseDFA);
3101 for (const MCWriteProcResEntry &PRE : make_range(
3102 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3103 for (int C = Cycle; C < Cycle + PRE.Cycles; ++C)
3104 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3105
3106 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3107 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3108}
3109
3110void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3111 int Cycle) {
3112 assert(!UseDFA);
3113 for (const MCWriteProcResEntry &PRE : make_range(
3114 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3115 for (int C = Cycle; C < Cycle + PRE.Cycles; ++C)
3116 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3117
3118 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3119 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3120}
3121
3122bool ResourceManager::isOverbooked() const {
3123 assert(!UseDFA);
3124 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3125 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3126 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3127 if (MRT[Slot][I] > Desc->NumUnits)
3128 return true;
3129 }
3130 if (NumScheduledMops[Slot] > IssueWidth)
3131 return true;
3132 }
3133 return false;
3134}
3135
3136int ResourceManager::calculateResMIIDFA() const {
3137 assert(UseDFA);
3138
3139 // Sort the instructions by the number of available choices for scheduling,
3140 // least to most. Use the number of critical resources as the tie breaker.
3141 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3142 for (SUnit &SU : DAG->SUnits)
3143 FUS.calcCriticalResources(*SU.getInstr());
// NOTE(review): the declaration line of the FuncUnitOrder priority queue
// (original line 3144) is missing from this extract.
3145 FuncUnitOrder(FUS);
3146
3147 for (SUnit &SU : DAG->SUnits)
3148 FuncUnitOrder.push(SU.getInstr());
3149
// NOTE(review): the declaration of the Resources container (original line
// 3150) is missing from this extract; per line 3152 it holds
// std::unique_ptr<DFAPacketizer> elements.
3151 Resources.push_back(
3152 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3153
// Greedily place each instruction — hardest to schedule first — into
// existing DFAs, creating new DFAs when none can accept it.
3154 while (!FuncUnitOrder.empty()) {
3155 MachineInstr *MI = FuncUnitOrder.top();
3156 FuncUnitOrder.pop();
// Zero-cost instructions consume no functional units.
3157 if (TII->isZeroCost(MI->getOpcode()))
3158 continue;
3159
3160 // Attempt to reserve the instruction in an existing DFA. At least one
3161 // DFA is needed for each cycle.
3162 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3163 unsigned ReservedCycles = 0;
3164 auto *RI = Resources.begin();
3165 auto *RE = Resources.end();
3166 LLVM_DEBUG({
3167 dbgs() << "Trying to reserve resource for " << NumCycles
3168 << " cycles for \n";
3169 MI->dump();
3170 });
// RI advances monotonically across iterations of the outer loop, so each
// of the NumCycles reservations lands in a distinct DFA.
3171 for (unsigned C = 0; C < NumCycles; ++C)
3172 while (RI != RE) {
3173 if ((*RI)->canReserveResources(*MI)) {
3174 (*RI)->reserveResources(*MI);
3175 ++ReservedCycles;
3176 break;
3177 }
3178 RI++;
3179 }
3180 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3181 << ", NumCycles:" << NumCycles << "\n");
3182 // Add new DFAs, if needed, to reserve resources.
3183 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
// NOTE(review): the opening of this LLVM_DEBUG statement (original line
// 3184) is missing from this extract.
3185 << "NewResource created to reserve resources"
3186 << "\n");
3187 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3188 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3189 NewResource->reserveResources(*MI);
3190 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3191 }
3192 }
3193
// The resource-constrained MII is the number of DFAs (cycles) required to
// hold all the instructions.
3194 int Resmii = Resources.size();
3195 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3196 return Resmii;
3197}
3198
// NOTE(review): the signature line for ResourceManager::calculateResMII()
// (original line 3199) is missing from this extract.  It computes the
// resource-constrained minimum initiation interval.
3200 if (UseDFA)
3201 return calculateResMIIDFA();
3202
3203 // Count each resource consumption and divide it by the number of units.
3204 // ResMII is the max value among them.
3205
3206 int NumMops = 0;
// NOTE(review): the declaration of ResourceCount (original line 3207) is
// missing from this extract; per its uses below it tallies consumed cycles
// per processor-resource kind.
3208 for (SUnit &SU : DAG->SUnits) {
// Zero-cost instructions consume no issue slots or resources.
3209 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3210 continue;
3211
3212 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3213 if (!SCDesc->isValid())
3214 continue;
3215
3216 LLVM_DEBUG({
3217 if (SwpDebugResource) {
3218 DAG->dumpNode(SU);
3219 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3220 << " WriteProcRes: ";
3221 }
3222 });
3223 NumMops += SCDesc->NumMicroOps;
// Accumulate the cycles each write-proc-res entry consumes per resource.
3224 for (const MCWriteProcResEntry &PRE :
3225 make_range(STI->getWriteProcResBegin(SCDesc),
3226 STI->getWriteProcResEnd(SCDesc))) {
3227 LLVM_DEBUG({
3228 if (SwpDebugResource) {
3229 const MCProcResourceDesc *Desc =
3230 SM.getProcResource(PRE.ProcResourceIdx);
3231 dbgs() << Desc->Name << ": " << PRE.Cycles << ", ";
3232 }
3233 });
3234 ResourceCount[PRE.ProcResourceIdx] += PRE.Cycles;
3235 }
3236 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3237 }
3238
// Lower bound from issue width: total micro-ops divided by the number of
// micro-ops issued per cycle, rounded up.
3239 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3240 LLVM_DEBUG({
3241 if (SwpDebugResource)
3242 dbgs() << "#Mops: " << NumMops << ", "
3243 << "IssueWidth: " << IssueWidth << ", "
3244 << "Cycles: " << Result << "\n";
3245 });
3246
3247 LLVM_DEBUG({
3248 if (SwpDebugResource) {
3249 std::stringstream SS;
3250 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3251 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3252 << "\n";
3253 dbgs() << SS.str();
3254 }
3255 });
// Raise the bound to the most heavily used resource: consumed cycles per
// kind divided by that kind's unit count, rounded up.
3256 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3257 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3258 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3259 LLVM_DEBUG({
3260 if (SwpDebugResource) {
3261 std::stringstream SS;
3262 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3263 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3264 << std::setw(10) << Cycles << "\n";
3265 dbgs() << SS.str();
3266 }
3267 });
3268 if (Cycles > Result)
3269 Result = Cycles;
3270 }
3271 return Result;
3272}
3273
// NOTE(review): the signature line for ResourceManager::init(int II)
// (original line 3274) is missing from this extract.  It resets all
// resource-tracking state for a new initiation interval II.
3275 InitiationInterval = II;
// One DFA packetizer state per slot of the initiation interval.
3276 DFAResources.clear();
3277 DFAResources.resize(II);
3278 for (auto &I : DFAResources)
3279 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3280 MRT.clear();
// NOTE(review): the MRT.resize(...) statement (original line 3281) is
// missing from this extract.
3282 NumScheduledMops.clear();
3283 NumScheduledMops.resize(II);
3284}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
SmallPtrSet< MachineInstr *, 2 > Uses
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:492
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB)
Return the Phi register value that comes from the loop block.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
return ToRemove size() > 0
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:197
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
bool erase(const KeyT &Val)
Definition: DenseMap.h:302
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:315
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:275
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
BlockT * getHeader() const
Definition: LoopInfo.h:105
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:596
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:943
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1291
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1297
A single uniqued string.
Definition: Metadata.h:611
StringRef getString() const
Definition: Metadata.cpp:507
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE and DBG_LABEL instructions.
instr_iterator instr_end()
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:516
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:313
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:519
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:513
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:641
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:731
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:852
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:526
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
MachineBasicBlock * getTopBlock()
Return the "top" block in the loop, which is the first block in the linear layout,...
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
iterator end()
Definition: MapVector.h:72
iterator find(const KeyT &Key)
Definition: MapVector.h:148
void clear()
Definition: MapVector.h:89
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO)
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi)
Return true if the scheduled Phi has a loop carried operand.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts)
Order the instructions within a cycle so that the definitions occur before the uses.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:266
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:366
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:431
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:273
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:344
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:280
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N fr...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:560
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
A vector that has set insertion semantics.
Definition: SetVector.h:40
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:208
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:202
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
void clear()
Completely clear the SetVector.
Definition: SetVector.h:213
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:72
typename vector_type::const_iterator iterator
Definition: SetVector.h:48
SlotIndexes pass.
Definition: SlotIndexes.h:319
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:540
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
iterator end() const
Definition: SmallPtrSet.h:408
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
iterator begin() const
Definition: SmallPtrSet.h:403
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:301
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:234
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:177
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
unsigned getInstrBaseReg(SUnit *SU)
Return the new base register that was stored away for the changed instruction.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which is the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction may be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should be used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1740
LLVM Value Representation.
Definition: Value.h:74
unsigned getPosition() const
Definition: CommandLine.h:304
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:406
void stable_sort(R &&Range)
Definition: STLExtras.h:1948
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:349
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
Definition: SetOperations.h:72
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:2014
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:484
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1683
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1923
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2006
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleAnalysis.h:28
cl::opt< bool > SwpEnableCopyToPhi
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
const unsigned * SubUnitsIdxBegin
Definition: MCSchedule.h:53
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:109
bool isValid() const
Definition: MCSchedule.h:127
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:244
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:346
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:335
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:320
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:339
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.