LLVM 18.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and to check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
69#include "llvm/Config/llvm-config.h"
70#include "llvm/IR/Attributes.h"
71#include "llvm/IR/Function.h"
72#include "llvm/MC/LaneBitmask.h"
73#include "llvm/MC/MCInstrDesc.h"
76#include "llvm/Pass.h"
79#include "llvm/Support/Debug.h"
82#include <algorithm>
83#include <cassert>
84#include <climits>
85#include <cstdint>
86#include <deque>
87#include <functional>
88#include <iomanip>
89#include <iterator>
90#include <map>
91#include <memory>
92#include <sstream>
93#include <tuple>
94#include <utility>
95#include <vector>
96
using namespace llvm;

#define DEBUG_TYPE "pipeliner"

// Pass-wide statistics: attempt/success counters plus one counter per
// distinct reason the pipeliner gives up on a loop (reported with -stats).
STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
112
/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument to limit the minimum initiation interval for
/// pipelining (loops whose MII exceeds this are not pipelined).
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."),
                              cl::Hidden, cl::init(27));

/// A command line argument to force pipeliner to use specified initial
/// interval.
static cl::opt<int> SwpForceII("pipeliner-force-ii",
                               cl::desc("Force pipeliner to use specified II."),
                               cl::Hidden, cl::init(-1));

/// A command line argument to limit the number of stages in the pipeline.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated scheduled."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

// Debug-only cap on how many loops the pass attempts; -1 means unlimited.
#ifndef NDEBUG
static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
#endif
156
157static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
159 cl::desc("Ignore RecMII"));
160
// Debug knobs for the resource model: dump reserved resource masks and
// extra resource-tracking traces, respectively.
static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
                                    cl::init(false));
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
                                      cl::init(false));
165
167 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
168 cl::desc("Instead of emitting the pipelined code, annotate instructions "
169 "with the generated schedule for feeding into the "
170 "-modulo-schedule-test pass"));
171
173 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
174 cl::desc(
175 "Use the experimental peeling code generator for software pipelining"));
176
177namespace llvm {
178
179// A command line option to enable the CopyToPhi DAG mutation.
181 cl::init(true),
182 cl::desc("Enable CopyToPhi DAG Mutation"));
183
184/// A command line argument to force pipeliner to use specified issue
185/// width.
187 "pipeliner-force-issue-width",
188 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
189 cl::init(-1));
190
191} // end namespace llvm
192
// Out-of-line definition of the static cap used while enumerating circuit
// paths (presumably bounds the circuit search; value is 5 here).
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
// Unique address used for pass identification/registration.
char MachinePipeliner::ID = 0;
195#ifndef NDEBUG
197#endif
199
201 "Modulo Software Pipelining", false, false)
207 "Modulo Software Pipelining", false, false)
208
209/// The "main" function for implementing Swing Modulo Scheduling.
210bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
211 if (skipFunction(mf.getFunction()))
212 return false;
213
214 if (!EnableSWP)
215 return false;
216
217 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
218 !EnableSWPOptSize.getPosition())
219 return false;
220
221 if (!mf.getSubtarget().enableMachinePipeliner())
222 return false;
223
224 // Cannot pipeline loops without instruction itineraries if we are using
225 // DFA for the pipeliner.
226 if (mf.getSubtarget().useDFAforSMS() &&
227 (!mf.getSubtarget().getInstrItineraryData() ||
228 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
229 return false;
230
231 MF = &mf;
232 MLI = &getAnalysis<MachineLoopInfo>();
233 MDT = &getAnalysis<MachineDominatorTree>();
234 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
235 TII = MF->getSubtarget().getInstrInfo();
236 RegClassInfo.runOnMachineFunction(*MF);
237
238 for (const auto &L : *MLI)
239 scheduleLoop(*L);
240
241 return false;
242}
243
244/// Attempt to perform the SMS algorithm on the specified loop. This function is
245/// the main entry point for the algorithm. The function identifies candidate
246/// loops, calculates the minimum initiation interval, and attempts to schedule
247/// the loop.
248bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
249 bool Changed = false;
250 for (const auto &InnerLoop : L)
251 Changed |= scheduleLoop(*InnerLoop);
252
253#ifndef NDEBUG
254 // Stop trying after reaching the limit (if any).
255 int Limit = SwpLoopLimit;
256 if (Limit >= 0) {
257 if (NumTries >= SwpLoopLimit)
258 return Changed;
259 NumTries++;
260 }
261#endif
262
263 setPragmaPipelineOptions(L);
264 if (!canPipelineLoop(L)) {
265 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
266 ORE->emit([&]() {
267 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
268 L.getStartLoc(), L.getHeader())
269 << "Failed to pipeline loop";
270 });
271
272 LI.LoopPipelinerInfo.reset();
273 return Changed;
274 }
275
276 ++NumTrytoPipeline;
277
278 Changed = swingModuloScheduler(L);
279
280 LI.LoopPipelinerInfo.reset();
281 return Changed;
282}
283
/// Read the pipelining pragmas attached to the loop's terminator metadata and
/// cache them in the pass members: an initiation-interval hint
/// (llvm.loop.pipeline.initiationinterval) and a disable flag
/// (llvm.loop.pipeline.disable).
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  // Reset the pragma for the next loop in iteration.
  disabledByPragma = false;
  II_setByPragma = 0;

  MachineBasicBlock *LBLK = L.getTopBlock();

  if (LBLK == nullptr)
    return;

  const BasicBlock *BBLK = LBLK->getBasicBlock();
  if (BBLK == nullptr)
    return;

  const Instruction *TI = BBLK->getTerminator();
  if (TI == nullptr)
    return;

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  // Loop metadata's first operand is the self-referential node.
  assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));

    if (MD == nullptr)
      continue;

    // Each hint is an MDNode whose first operand names the hint.
    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S == nullptr)
      continue;

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two operands.");
      // NOTE(review): the assignment target for the next expression
      // (presumably `II_setByPragma =`) is missing from this extraction.
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
    }
  }
}
331
/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
///
/// On each rejection path a missed-optimization remark is emitted and the
/// corresponding NumFail* statistic is bumped.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  if (L.getNumBlocks() != 1) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Not a single basic block: "
             << ore::NV("NumBlocks", L.getNumBlocks());
    });
    return false;
  }

  if (disabledByPragma) {
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "Disabled by Pragma.";
    });
    return false;
  }

  // Check if the branch can't be understood because we can't do pipelining
  // if that's the case.
  LI.TBB = nullptr;
  LI.FBB = nullptr;
  LI.BrCond.clear();
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
    NumFailBranch++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The branch can't be understood";
    });
    return false;
  }

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  // NOTE(review): extraction gap - the line that initializes
  // LI.LoopPipelinerInfo (presumably a TII->analyzeLoop* call) is missing here.
  if (!LI.LoopPipelinerInfo) {
    LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
    NumFailLoop++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "The loop structure is not supported";
    });
    return false;
  }

  if (!L.getLoopPreheader()) {
    LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
    NumFailPreheader++;
    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
                                               L.getStartLoc(), L.getHeader())
             << "No loop preheader found";
    });
    return false;
  }

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
  return true;
}
400
/// Rewrite every PHI input that carries a subregister index: a new full
/// register is created, a COPY from the subregister is inserted in the
/// predecessor block, and the PHI operand is redirected to the new register.
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  // NOTE(review): extraction gap - the declaration of `MRI` (presumably
  // MF->getRegInfo()) is missing from this view.
  SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();

  for (MachineInstr &PI : B.phis()) {
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    // PHI operands come in (value, block) pairs starting at index 1.
    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      Register NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
      // NOTE(review): extraction gap - the line defining the insertion
      // point `At` in PredB is missing here.
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                    .addReg(RegOp.getReg(), getRegState(RegOp),
                            RegOp.getSubReg());
      // Keep the slot indexes in sync with the newly inserted COPY.
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
      RegOp.setSubReg(0);
    }
  }
}
430
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to Schedule the loop.
///
/// Drives a SwingSchedulerDAG over the single-block loop body (terminators
/// excluded) and reports whether a new schedule was produced.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  // NOTE(review): extraction gap - the continuation of this constructor call
  // (its trailing argument(s)) is missing from this view.
  SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions.  These
  // will be added back later.
  SMS.startBlock(MBB);

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  // NOTE(review): extraction gap - the loop header counting back over the
  // terminators is missing here; only its body/continuation remains.
       E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  // Schedule only the region up to the first terminator.
  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
  SMS.schedule();
  SMS.exitRegion();

  SMS.finishBlock();
  return SMS.hasNewSchedule();
}
461
470}
471
472void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
473 if (SwpForceII > 0)
474 MII = SwpForceII;
475 else if (II_setByPragma > 0)
476 MII = II_setByPragma;
477 else
478 MII = std::max(ResMII, RecMII);
479}
480
481void SwingSchedulerDAG::setMAX_II() {
482 if (SwpForceII > 0)
483 MAX_II = SwpForceII;
484 else if (II_setByPragma > 0)
485 MAX_II = II_setByPragma;
486 else
487 MAX_II = MII + 10;
488}
489
/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
// NOTE(review): the function header line (presumably
// `void SwingSchedulerDAG::schedule() {`) and several interior lines are
// missing from this extraction; gaps are marked below. In particular the
// `return MachineOptimizationRemark...(` opener inside most ORE->emit
// callbacks is missing.
  // Phase 1: build the dependence graph and augment it with loop-carried
  // and PHI dependences that ScheduleDAGInstrs does not model.
  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
  buildSchedGraph(AA);
  addLoopCarriedDependences(AA);
  updatePhiDependences();
  // NOTE(review): extraction gap - one line missing here.
  changeDependences();
  postProcessDAG();
  LLVM_DEBUG(dump());

  // Phase 2: find recurrences (circuits) and compute the MII bounds.
  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;

  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  fuseRecs(NodeSets);

  // This flag is used for testing and can cause correctness problems.
  if (SwpIgnoreRecMII)
    RecMII = 0;

  setMII(ResMII, RecMII);
  setMAX_II();

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.
  if (MII == 0) {
    LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
    NumFailZeroMII++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Invalid Minimal Initiation Interval: 0";
    });
    return;
  }

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
    LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
                      << ", we don't pipeline large loops\n");
    NumFailLargeMaxMII++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Minimal Initiation Interval too large: "
             << ore::NV("MII", (int)MII) << " > "
             << ore::NV("SwpMaxMii", SwpMaxMii) << "."
             << "Refer to -pipeliner-max-mii.";
    });
    return;
  }

  // Phase 3: analyze, filter, and order the node sets.
  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << " Rec NodeSet ";
      I.dump();
    }
  });

  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  LLVM_DEBUG({
    for (auto &I : NodeSets) {
      dbgs() << " NodeSet ";
      I.dump();
    }
  });

  computeNodeOrder(NodeSets);

  // check for node order issues
  checkValidNodeOrder(Circuits);

  // Phase 4: attempt to place instructions into the modulo schedule.
  SMSchedule Schedule(Pass.MF, this);
  Scheduled = schedulePipeline(Schedule);

  if (!Scheduled){
    LLVM_DEBUG(dbgs() << "No schedule found, return\n");
    NumFailNoSchedule++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Unable to find schedule";
    });
    return;
  }

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate pipeline if there are no overlapped iterations.
  if (numStages == 0) {
    LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
    NumFailZeroStage++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "No need to pipeline - no overlapped iterations in schedule.";
    });
    return;
  }
  // Check that the maximum stage count is less than user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
    LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
                      << " : too many stages, abort\n");
    NumFailLargeMaxStage++;
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Too many stages in schedule: "
             << ore::NV("numStages", (int)numStages) << " > "
             << ore::NV("SwpMaxStages", SwpMaxStages)
             << ". Refer to -pipeliner-max-stages.";
    });
    return;
  }

  Pass.ORE->emit([&]() {
                                     Loop.getHeader())
           << "Pipelined succesfully!";
  });

  // Generate the schedule as a ModuloSchedule.
  DenseMap<MachineInstr *, int> Cycles, Stages;
  std::vector<MachineInstr *> OrderedInsts;
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (SUnit *SU : Schedule.getInstructions(Cycle)) {
      OrderedInsts.push_back(SU->getInstr());
      Cycles[SU->getInstr()] = Cycle;
      Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
    }
  }
  // NOTE(review): extraction gap - the declaration of NewInstrChanges is
  // missing here. Propagate cycle/stage info to instructions that were
  // cloned during scheduling.
  for (auto &KV : NewMIs) {
    Cycles[KV.first] = Cycles[KV.second];
    Stages[KV.first] = Stages[KV.second];
    NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
  }

  ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                    std::move(Stages));
  // NOTE(review): extraction gap - the guard opening this annotate-only
  // branch and the MSTI declaration are missing here.
    assert(NewInstrChanges.empty() &&
           "Cannot serialize a schedule with InstrChanges!");
    MSTI.annotate();
    return;
  }
  // The experimental code generator can't work if there are InstChanges.
  if (ExperimentalCodeGen && NewInstrChanges.empty()) {
    PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
    MSE.expand();
  } else {
    ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
    MSE.expand();
    MSE.cleanup();
  }
  ++NumPipelined;
}
669
/// Clean up after the software pipeliner runs.
// NOTE(review): the function header line is missing from this extraction;
// the body deletes any cloned instructions recorded in NewMIs.
  for (auto &KV : NewMIs)
    MF.deleteMachineInstr(KV.second);
  NewMIs.clear();

  // Call the superclass.
  // NOTE(review): extraction gap - the superclass call itself is missing.
}
679
/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
///
/// \p InitVal receives the value incoming from outside the loop and
/// \p LoopVal the value incoming from the loop block; both must exist.
// NOTE(review): the function signature line is missing from this extraction;
// only the parameter continuation below remains.
                       unsigned &InitVal, unsigned &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  InitVal = 0;
  LoopVal = 0;
  // PHI operands come in (value, block) pairs starting at index 1.
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
}
696
697/// Return the Phi register value that comes the loop block.
698static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
699 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
700 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
701 return Phi.getOperand(i).getReg();
702 return 0;
703}
704
/// Return true if SUb can be reached from SUa following the chain edges.
///
/// Iterative DFS over SDep::Order successor edges only; Visited guards
/// against cycles in the dependence graph.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  // NOTE(review): extraction gap - the declarations of `Worklist` and
  // `Visited` (presumably SmallVector/SmallPtrSet) are missing here.
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (const auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      // Only chain (order) edges are followed.
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;
        if (SuccSU == SUb)
          return true;
        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
      }
    }
  }
  return false;
}
726
/// Return true if the instruction causes a chain between memory
/// references before and after it.
// NOTE(review): the function signature line is missing from this extraction.
// Barriers are calls, FP-exception-raising instructions, instructions with
// unmodeled side effects, and ordered memory references that are not
// dereferenceable invariant loads.
  return MI.isCall() || MI.mayRaiseFPException() ||
         MI.hasUnmodeledSideEffects() ||
         (MI.hasOrderedMemoryRef() &&
          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
}
735
/// Return the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
// NOTE(review): the function signature line is missing from this extraction.
// The result is produced only when MI has exactly one memory operand with an
// IR value, and only when every underlying object is an identified object;
// otherwise Objs is left/made empty.
  if (!MI->hasOneMemOperand())
    return;
  MachineMemOperand *MM = *MI->memoperands_begin();
  if (!MM->getValue())
    return;
  getUnderlyingObjects(MM->getValue(), Objs);
  for (const Value *V : Objs) {
    if (!isIdentifiedObject(V)) {
      Objs.clear();
      return;
    }
    // NOTE(review): this appends to Objs while range-iterating it, which
    // both duplicates entries and risks iterator invalidation on growth -
    // looks suspicious; verify against upstream.
    Objs.push_back(V);
  }
}
755
/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
// NOTE(review): several lines are missing from this extraction (the
// PendingLoads map declaration, the dependence-barrier guard, the
// getUnderlyingObjects calls, and parts of two statements); gaps marked below.
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
  for (auto &SU : SUnits) {
    MachineInstr &MI = *SU.getInstr();
    // NOTE(review): extraction gap - the barrier check guarding this clear.
      PendingLoads.clear();
    else if (MI.mayLoad()) {
      // NOTE(review): extraction gap - Objs declaration/population missing.
      if (Objs.empty())
      // Record each load keyed by its underlying objects for later pairing
      // with stores.
      for (const auto *V : Objs) {
        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
        SUs.push_back(&SU);
      }
    } else if (MI.mayStore()) {
      // NOTE(review): extraction gap - Objs declaration/population missing.
      if (Objs.empty())
      for (const auto *V : Objs) {
        // NOTE(review): extraction gap - the iterator declaration is missing.
          PendingLoads.find(V);
        if (I == PendingLoads.end())
          continue;
        for (auto *Load : I->second) {
          // Already ordered by an in-iteration chain: no extra edge needed.
          if (isSuccOrder(Load, &SU))
            continue;
          MachineInstr &LdMI = *Load->getInstr();
          // First, perform the cheaper check that compares the base register.
          // If they are the same and the load offset is less than the store
          // offset, then mark the dependence as loop carried potentially.
          const MachineOperand *BaseOp1, *BaseOp2;
          int64_t Offset1, Offset2;
          bool Offset1IsScalable, Offset2IsScalable;
          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
                                           Offset1IsScalable, TRI) &&
              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
                                           Offset2IsScalable, TRI)) {
            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
                Offset1IsScalable == Offset2IsScalable &&
                (int)Offset1 < (int)Offset2) {
              // NOTE(review): extraction gap - the assert condition opening
              // this statement is missing.
                     "What happened to the chain edge?");
              SDep Dep(Load, SDep::Barrier);
              Dep.setLatency(1);
              SU.addPred(Dep);
              continue;
            }
          }
          // Second, the more expensive check that uses alias analysis on the
          // base registers. If they alias, and the load offset is less than
          // the store offset, then mark the dependence as loop carried.
          if (!AA) {
            // No AA available: conservatively assume a loop-carried alias.
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
            continue;
          }
          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
          MachineMemOperand *MMO2 = *MI.memoperands_begin();
          if (!MMO1->getValue() || !MMO2->getValue()) {
            // Unknown underlying values: conservatively add the edge.
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
            continue;
          }
          if (MMO1->getValue() == MMO2->getValue() &&
              MMO1->getOffset() <= MMO2->getOffset()) {
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
            continue;
          }
          // NOTE(review): extraction gap - the MemoryLocation arguments of
          // this isNoAlias call are partially missing.
          if (!AA->isNoAlias(
                  MMO2->getAAInfo()))) {
            SDep Dep(Load, SDep::Barrier);
            Dep.setLatency(1);
            SU.addPred(Dep);
          }
        }
      }
    }
  }
}
849
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// PHIs.
// NOTE(review): a few lines are missing from this extraction (a subtarget
// reference, a use-iterator loop header, and the DefMI lookup); marked below.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    RemoveDeps.clear();
    // Set to true if the instruction has an operand defined by a Phi.
    unsigned HasPhiUse = 0;
    unsigned HasPhiDef = 0;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (const MachineOperand &MO : MI->operands()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (MO.isDef()) {
        // If the register is used by a Phi, then create an anti dependence.
        // NOTE(review): extraction gap - the for-loop header over the uses
        // of Reg is missing; only its continuation remains.
             UI = MRI.use_instr_begin(Reg),
             UE = MRI.use_instr_end();
             UI != UE; ++UI) {
          MachineInstr *UseMI = &*UI;
          SUnit *SU = getSUnit(UseMI);
          if (SU != nullptr && UseMI->isPHI()) {
            if (!MI->isPHI()) {
              // Non-PHI def feeding a PHI: loop-carried anti edge.
              SDep Dep(SU, SDep::Anti, Reg);
              Dep.setLatency(1);
              I.addPred(Dep);
            } else {
              HasPhiDef = Reg;
              // Add a chain edge to a dependent Phi that isn't an existing
              // predecessor.
              if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
                I.addPred(SDep(SU, SDep::Barrier));
            }
          }
        }
      } else if (MO.isUse()) {
        // If the register is defined by a Phi, then create a true dependence.
        // NOTE(review): extraction gap - the DefMI lookup line is missing.
        if (DefMI == nullptr)
          continue;
        SUnit *SU = getSUnit(DefMI);
        if (SU != nullptr && DefMI->isPHI()) {
          if (!MI->isPHI()) {
            SDep Dep(SU, SDep::Data, Reg);
            Dep.setLatency(0);
            // Let the target adjust the latency of the PHI->use edge.
            ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
            I.addPred(Dep);
          } else {
            HasPhiUse = Reg;
            // Add a chain edge to a dependent Phi that isn't an existing
            // predecessor.
            if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
              I.addPred(SDep(SU, SDep::Barrier));
          }
        }
      }
    }
    // Remove order dependences from an unrelated Phi.
    if (!SwpPruneDeps)
      continue;
    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          // Keep edges from the PHIs this node actually depends on.
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);
      }
    }
    for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
      I.removePred(RemoveDeps[i]);
  }
}
935
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    // NOTE(review): extraction gap - the DefMI lookup line is missing here.
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    // Adding the new edge would create a cycle: skip this candidate.
    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    // NOTE(review): extraction gap - the Deps container declaration is
    // missing here.
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (int i = 0, e = Deps.size(); i != e; i++) {
      Topo.RemovePred(&I, Deps[i].getSUnit());
      I.removePred(Deps[i]);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (int i = 0, e = Deps.size(); i != e; i++) {
      Topo.RemovePred(LastSU, Deps[i].getSUnit());
      LastSU->removePred(Deps[i]);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
998
namespace {

// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices. Works with either instruction
// itineraries or the MC scheduling model, whichever the subtarget provides.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // NOTE(review): extraction gap - the declaration of the `Resources` map
  // used below (functional unit/resource -> use count) is missing here.

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Itinerary-based path: count set bits in each stage's unit mask.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // Scheduling-model path: use the number of units of each written
    // processor resource.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Only single-choice units contribute to the criticality count.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    // Ties broken by how heavily contended the unit is.
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};

} // end anonymous namespace
1106
1107/// Calculate the resource constrained minimum initiation interval for the
1108/// specified loop. We use the DFA to model the resources needed for
1109/// each instruction, and we ignore dependences. A different DFA is created
1110/// for each cycle that is required. When adding a new instruction, we attempt
1111/// to add it to each existing DFA, until a legal space is found. If the
1112/// instruction cannot be reserved in an existing DFA, we create a new one.
unsigned SwingSchedulerDAG::calculateResMII() {
  LLVM_DEBUG(dbgs() << "calculateResMII:\n");
  // Delegate to the resource manager RM (its declaration is elided in this
  // excerpt), which performs the actual resource-based MII computation.
  return RM.calculateResMII();
}
1118
1119/// Calculate the recurrence-constrainted minimum initiation interval.
1120/// Iterate over each circuit. Compute the delay(c) and distance(c)
1121/// for each circuit. The II needs to satisfy the inequality
1122/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1123/// II that satisfies the inequality, and the RecMII is the maximum
1124/// of those values.
1125unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1126 unsigned RecMII = 0;
1127
1128 for (NodeSet &Nodes : NodeSets) {
1129 if (Nodes.empty())
1130 continue;
1131
1132 unsigned Delay = Nodes.getLatency();
1133 unsigned Distance = 1;
1134
1135 // ii = ceil(delay / distance)
1136 unsigned CurMII = (Delay + Distance - 1) / Distance;
1137 Nodes.setRecMII(CurMII);
1138 if (CurMII > RecMII)
1139 RecMII = CurMII;
1140 }
1141
1142 return RecMII;
1143}
1144
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
/// but we do this to find the circuits, and then change them back.
/// (The declaration of the DepsAdded work list is elided in this excerpt.)
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
  // Collect every anti edge first so the predecessor lists are not mutated
  // while they are being iterated.
  for (SUnit &SU : SUnits) {
    for (SDep &Pred : SU.Preds)
      if (Pred.getKind() == SDep::Anti)
        DepsAdded.push_back(std::make_pair(&SU, Pred));
  }
  for (std::pair<SUnit *, SDep> &P : DepsAdded) {
    // Remove this anti dependency and add one in the reverse direction.
    SUnit *SU = P.first;
    SDep &D = P.second;
    SUnit *TargetSU = D.getSUnit();
    unsigned Reg = D.getReg();
    unsigned Lat = D.getLatency();
    SU->removePred(D);
    // Rebuild an anti edge anchored at the old sink, preserving the register
    // and latency, so the dependence now points the opposite way.
    SDep Dep(SU, SDep::Anti, Reg);
    Dep.setLatency(Lat);
    TargetSU->addPred(Dep);
  }
}
1167
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &SI : SUnits[i].Succs) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (SI.getKind() == SDep::Output) {
        int N = SI.getSUnit()->NodeNum;
        int BackEdge = i;
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          // This node extends an existing output-dependence chain: inherit
          // the chain's head as the back-edge target and drop the old entry,
          // so only head->tail survives in OutputDeps.
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      // A back-edge is processed only if it goes to a Phi.
      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
        continue;
      int N = SI.getSUnit()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &PI : SUnits[i].Preds) {
      // Only loop-carried order dependences from a load into this store
      // contribute a back-edge.
      if (!SUnits[i].getInstr()->mayStore() ||
          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
        continue;
      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
        int N = PI.getSUnit()->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1222
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node (part of Johnson's circuit-finding algorithm). Returns
/// true if some circuit through V back to the start node S was found.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Cap the amount of work; no circuits are recorded past MaxPaths.
    if (NumPaths > MaxPaths)
      break;
    // Only consider nodes >= S so each circuit is discovered exactly once.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node: record the nodes currently
      // on the stack, unless the circuit went through a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
      F = true;
      ++NumPaths;
      break;
    } else if (!Blocked.test(W)) {
      // Recurse; the edge is a back-edge when W precedes V in the
      // topological order.
      if (circuit(W, S, NodeSets,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V: leave V blocked and record it in B[W] so it is
    // unblocked only when a successor later joins a circuit.
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1262
/// Unblock a node in the circuit finding algorithm. Recursively unblocks
/// every node that was recorded as waiting on U.
/// (The declarations binding BU to B[U] and SI to its first element are
/// elided in this excerpt.)
void SwingSchedulerDAG::Circuits::unblock(int U) {
  Blocked.reset(U);
  // Drain B[U], unblocking each member that is still blocked.
  while (!BU.empty()) {
    assert(SI != BU.end() && "Invalid B set.");
    SUnit *W = *SI;
    BU.erase(W);
    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
  }
}
1276
/// Identify all the elementary circuits in the dependence graph using
/// Johnson's circuit algorithm.
/// (The two swapAntiDependences calls that temporarily reverse the anti
/// edges before and after the search are elided in this excerpt.)
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
  // but we do this to find the circuits, and then change them back.

  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
  Cir.createAdjacencyStructure(this);
  // Run the circuit search from every node; reset() clears the per-start
  // search state between runs.
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Cir.reset();
    Cir.circuit(i, i, NodeSets);
  }

  // Change the dependences back so that we've created a DAG again.
}
1295
1296// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1297// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1298// additional copies that are needed across iterations. An artificial dependence
1299// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1300
1301// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1302// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1303// PHI-------True-Dep------> USEOfPhi
1304
1305// The mutation creates
1306// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1307
1308// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1309// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1310// late to avoid additional copies across iterations. The possible scheduling
1311// order would be
1312// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1313
// Entry point of the mutation: scan every COPY/REG_SEQUENCE in the DAG and
// add an artificial USE->SOURCE edge as described in the comment above.
// (The declarations of the PHISUs/SrcSUs/UseSUs containers are elided in
// this excerpt.)
void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no pre-decessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    // Nothing to do unless both a loop-carried PHI and a copy source exist.
    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          // Chase through chained PHIs/REG_SEQUENCEs to reach the real uses.
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        // Reachability in the topological order means the edge would close
        // a cycle; skip those pairs.
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
1375
1376/// Return true for DAG nodes that we ignore when computing the cost functions.
1377/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1378/// in the calculation of the ASAP, ALAP, etc functions.
1379static bool ignoreDependence(const SDep &D, bool isPred) {
1380 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1381 return true;
1382 return D.getKind() == SDep::Anti && isPred;
1383}
1384
1385/// Compute several functions need to order the nodes for scheduling.
1386/// ASAP - Earliest time to schedule a node.
1387/// ALAP - Latest time to schedule a node.
1388/// MOV - Mobility function, difference between ALAP and ASAP.
1389/// D - Depth of each node.
1390/// H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth: a forward pass in topological order,
  // so every predecessor's values are final when a node is visited.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &P : SU->Preds) {
      SUnit *pred = P.getSUnit();
      if (P.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
      if (ignoreDependence(P, true))
        continue;
      // Loop-carried dependences (distance > 0) are discounted by one MII
      // per iteration of distance.
      asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
                                  getDistance(pred, SU, P) * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV: a backward pass in reverse
  // topological order, seeded with the maximum ASAP found above.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &S : SU->Succs) {
      SUnit *succ = S.getSUnit();
      if (succ->isBoundaryNode())
        continue;
      if (S.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
      if (ignoreDependence(S, true))
        continue;
      alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
                                  getDistance(SU, succ, S) * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t   ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t   ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t   MOV  = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t   D    = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t   H    = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t   ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t   ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
1461
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. When S is non-null, only nodes belonging to S are considered.
/// Returns true if any such predecessor was found; the result is stored in
/// Preds. (The leading lines of the signature are elided in this excerpt.)
                   const NodeSet *S = nullptr) {
  Preds.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Pred : SU->Preds) {
      // Restrict the search to S when one is provided.
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Pred, true))
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Preds.insert(Pred.getSUnit());
    }
    // Back-edges are predecessors with an anti-dependence.
    for (const SDep &Succ : SU->Succs) {
      if (Succ.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Preds.insert(Succ.getSUnit());
    }
  }
  return !Preds.empty();
}
1490
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. When S is non-null, only nodes belonging to S are considered.
/// Returns true if any such successor was found; the result is stored in
/// Succs. (The leading lines of the signature are elided in this excerpt.)
                   const NodeSet *S = nullptr) {
  Succs.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Succ : SU->Succs) {
      // Restrict the search to S when one is provided.
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Succ, false))
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Succs.insert(Succ.getSUnit());
    }
    // Back-edges are treated as successors via the anti-dependence preds.
    for (const SDep &Pred : SU->Preds) {
      if (Pred.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Succs.insert(Pred.getSUnit());
    }
  }
  return !Succs.empty();
}
1518
1519/// Return true if there is a path from the specified node to any of the nodes
1520/// in DestNodes. Keep track and return the nodes in any path.
1521static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1522 SetVector<SUnit *> &DestNodes,
1523 SetVector<SUnit *> &Exclude,
1524 SmallPtrSet<SUnit *, 8> &Visited) {
1525 if (Cur->isBoundaryNode())
1526 return false;
1527 if (Exclude.contains(Cur))
1528 return false;
1529 if (DestNodes.contains(Cur))
1530 return true;
1531 if (!Visited.insert(Cur).second)
1532 return Path.contains(Cur);
1533 bool FoundPath = false;
1534 for (auto &SI : Cur->Succs)
1535 if (!ignoreDependence(SI, false))
1536 FoundPath |=
1537 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1538 for (auto &PI : Cur->Preds)
1539 if (PI.getKind() == SDep::Anti)
1540 FoundPath |=
1541 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1542 if (FoundPath)
1543 Path.insert(Cur);
1544 return FoundPath;
1545}
1546
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis. The result is added to
/// RPTracker as live registers. (Several lines are elided in this excerpt:
/// the first signature line, the TRI/MRI/Uses/LiveOutRegs declarations, and
/// the LaneBitmask argument lines of the two push_back calls.)
                            NodeSet &NS) {
  // First gather every register (virtual reg, or reg unit of an allocatable
  // physical reg) used by a non-PHI instruction in the node-set.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
          Uses.insert(Unit);
    }
  }
  // Any non-dead def that is never used inside the node-set is live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.push_back(
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
1586
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high. The first node found to exceed the limit
/// is recorded on the node-set via setExceedPressure.
/// (The declaration of CurInstI and one operand of the debug stream are
/// elided in this excerpt.)
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Visit the nodes bottom-up (highest NodeNum first) so the upward
    // pressure tracker can recede through them in order.
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        // Remember the offending node; the scheduler switches ordering
        // direction when it reaches it.
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
1630
/// A heuristic to colocate node sets that have the same set of
/// successors. Matching sets are tagged with the same colocation id so
/// they can be scheduled near each other.
/// (The declarations of the successor sets S1 and S2 are elided in this
/// excerpt.)
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
  unsigned Colocate = 0;
  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    NodeSet &N1 = NodeSets[i];
    if (N1.empty() || !succ_L(N1, S1))
      continue;
    for (int j = i + 1; j < e; ++j) {
      NodeSet &N2 = NodeSets[j];
      // Only node-sets with the same RecMII are colocation candidates.
      if (N1.compareRecMII(N2) != 0)
        continue;
      if (N2.empty() || !succ_L(N2, S2))
        continue;
      // Equal-size subset means the two successor sets are identical.
      if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
        N1.setColocate(++Colocate);
        N2.setColocate(Colocate);
        break;
      }
    }
  }
}
1655
1656/// Check if the existing node-sets are profitable. If not, then ignore the
1657/// recurrent node-sets, and attempt to schedule all nodes together. This is
1658/// a heuristic. If the MII is large and all the recurrent node-sets are small,
1659/// then it's best to try to schedule all instructions together instead of
1660/// starting with the recurrent node-sets.
1661void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1662 // Look for loops with a large MII.
1663 if (MII < 17)
1664 return;
1665 // Check if the node-set contains only a simple add recurrence.
1666 for (auto &NS : NodeSets) {
1667 if (NS.getRecMII() > 2)
1668 return;
1669 if (NS.getMaxDepth() > MII)
1670 return;
1671 }
1672 NodeSets.clear();
1673 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1674}
1675
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
/// (The declarations of the locals Visited, N, and Path are elided in this
/// excerpt.)
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N)) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited);
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N)) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited);
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    NodesAdded.insert(I.begin(), I.end());
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  if (succ_L(NodesAdded, N))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a node
  // in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new nodes sets with the connected nodes any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
1739
1740/// Add the node to the set, and add all of its connected nodes to the set.
1741void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1742 SetVector<SUnit *> &NodesAdded) {
1743 NewSet.insert(SU);
1744 NodesAdded.insert(SU);
1745 for (auto &SI : SU->Succs) {
1746 SUnit *Successor = SI.getSUnit();
1747 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
1748 NodesAdded.count(Successor) == 0)
1749 addConnectedNodes(Successor, NewSet, NodesAdded);
1750 }
1751 for (auto &PI : SU->Preds) {
1752 SUnit *Predecessor = PI.getSUnit();
1753 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1754 addConnectedNodes(Predecessor, NewSet, NodesAdded);
1755 }
1756}
1757
/// Return true if Set1 contains elements in Set2. The elements in common
/// are returned in Result. (The final signature line, declaring the Result
/// parameter, is elided in this excerpt.)
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
  Result.clear();
  for (SUnit *SU : Set1) {
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
}
1769
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I) {
    NodeSet &NI = *I;
    for (NodeSetType::iterator J = I + 1; J != E;) {
      NodeSet &NJ = *J;
      // Two sets are fused when they start at the same node.
      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        // Keep the larger RecMII of the two sets on the surviving set.
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (SUnit *SU : *J)
          I->insert(SU);
        // erase() invalidates the end iterator; recompute E and re-examine
        // the element that shifted into J's position (J is not advanced).
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
  }
}
1790
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
       ++I)
    for (NodeSetType::iterator J = I + 1; J != E;) {
      // Drop from the later set J every unit already present in I.
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });

      if (J->empty()) {
        // erase() invalidates the end iterator; recompute E and re-examine
        // the element that shifted into J's position (J is not advanced).
        NodeSets.erase(J);
        E = NodeSets.end();
      } else {
        ++J;
      }
    }
}
1806
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
/// (The declarations of the work lists R and N, and several condition
/// fragments inside the tie-breaker chains, are elided in this excerpt.)
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    // Choose the starting direction and the seed set R based on how this
    // node-set relates to the nodes already ordered.
    if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
      R.insert(N.begin(), N.end());
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
      R.insert(N.begin(), N.end());
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // Single node-set: seed with the nodes that have no successors.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than
        // one, choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Seed R with the in-set successors not yet ordered.
          for (const auto &I : maxHeight->Succs) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            if (ignoreDependence(I, false))
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxHeight->Preds) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
        }
        // Flip direction and continue with the unordered predecessors.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
        if (pred_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // A node flagged as exceeding register pressure forces a restart
          // in the top-down direction from the set's first node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          // Seed R with the in-set predecessors not yet ordered.
          for (const auto &I : maxDepth->Preds) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxDepth->Succs) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
        }
        // Flip direction and continue with the unordered successors.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
        if (succ_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
1959
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
/// (The construction of the optimization remark inside the ORE->emit lambda
/// is partially elided in this excerpt.)
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

  if (NodeOrder.empty()){
    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
    return false;
  }

  bool scheduleFound = false;
  // Keep increasing II until a valid schedule is found.
  for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
    Schedule.reset();
    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    SetVector<SUnit *>::iterator NI = NodeOrder.begin();
    SetVector<SUnit *>::iterator NE = NodeOrder.end();
    do {
      SUnit *SU = *NI;

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      // These values are set when the size of the schedule window is limited
      // due to chain dependences.
      int SchedEnd = INT_MAX;
      int SchedStart = INT_MIN;
      Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
                            II, this);
      LLVM_DEBUG({
        dbgs() << "\n";
        dbgs() << "Inst (" << SU->NodeNum << ") ";
        SU->getInstr()->dump();
        dbgs() << "\n";
      });
      LLVM_DEBUG({
        dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
                         LateStart, SchedEnd, SchedStart);
      });

      // The window is infeasible when the bounds cross; otherwise insert the
      // node using whichever bound(s) were actually constrained.
      if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
          SchedStart > LateStart)
        scheduleFound = false;
      else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
        // Only the early bound is known: scan forward from it.
        SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
        scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
      } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
        // Only the late bound is known: scan backward from it.
        SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
        scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
      } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
        SchedEnd =
            std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
        // When scheduling a Phi it is better to start at the late cycle and go
        // backwards. The default order may insert the Phi too far away from
        // its first dependence.
        if (SU->getInstr()->isPHI())
          scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
        else
          scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
      } else {
        // No bounds at all: try a window based on the node's ASAP value.
        int FirstCycle = Schedule.getFirstCycle();
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
                                        FirstCycle + getASAP(SU) + II - 1, II);
      }
      // Even if we find a schedule, make sure the schedule doesn't exceed the
      // allowable number of stages. We keep trying if this happens.
      if (scheduleFound)
        if (SwpMaxStages > -1 &&
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
          scheduleFound = false;

      LLVM_DEBUG({
        if (!scheduleFound)
          dbgs() << "\tCan't schedule\n";
      });
    } while (++NI != NE && scheduleFound);

    // If a schedule is found, ensure non-pipelined instructions are in stage 0
    if (scheduleFound)
      scheduleFound =
          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);

    // If a schedule is found, check if it is a valid schedule too.
    if (scheduleFound)
      scheduleFound = Schedule.isValidSchedule(this);
  }

  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
                    << " (II=" << Schedule.getInitiationInterval()
                    << ")\n");

  if (scheduleFound) {
    // Give the target a final veto over the schedule.
    scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
    if (!scheduleFound)
      LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
  }

  if (scheduleFound) {
    Schedule.finalizeSchedule(this);
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Schedule found with Initiation Interval: "
             << ore::NV("II", Schedule.getInitiationInterval())
             << ", MaxStageCount: "
             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
    });
  } else
    Schedule.reset();

  // A schedule with no instruction in a later stage would not pipeline
  // anything; treat it as a failure.
  return scheduleFound && Schedule.getMaxStageCount() > 0;
}
2074
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  // Ask the target to decompose the memory access into a base operand and a
  // byte offset; bail out if it cannot.
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  // The base must be a register so we can follow its def chain below.
  if (!BaseOp->isReg())
    return false;

  Register BaseReg = BaseOp->getReg();

  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  // NOTE(review): this rejects only when no increment was found AND D is
  // still non-negative. If the target wrote a negative D before returning
  // false, that negative value would be stored into the unsigned Delta
  // below — confirm whether '||' was intended here.
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}
2111
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
/// v1 = Phi(v0, v3)
/// v2 = load v1, 0
/// v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              unsigned &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  Register BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction.
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  // NewMI (a temporary built from MI above — presumably a clone; confirm)
  // carries the combined offset so the target can test for disjointness; it
  // is deleted immediately after the query.
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.deleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return value once we determine that we return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}
2172
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
                                         SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // The recorded change is a pair (replacement base register, per-stage
    // offset adjustment).
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    // Compare where the base register's loop definition was scheduled with
    // where this instruction was scheduled.
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      // The offset is scaled by the number of stages separating the use
      // from the definition.
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        // The def precedes the use within the cycle order: switch to the
        // replacement base register and drop one stage of adjustment.
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Install the rewritten instruction in place of the original.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
2208
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Guard against Phi cycles: stop if we reach the same Phi twice.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs starting at index 1; pick
    // the incoming value that flows from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
2226
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a
/// value that may be used or defined by the source in a subsequent iteration.
                                        bool isSucc) {
  // Only order and output dependences can be loop carried; ignore artificial
  // edges and edges to the exit-boundary node.
  if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
    return false;

    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Dep.getKind() == SDep::Output)
    return true;

  MachineInstr *SI = Source->getInstr();
  MachineInstr *DI = Dep.getSUnit()->getInstr();
  // Normalize so SI is the earlier-iteration instruction of the pair.
  if (!isSucc)
    std::swap(SI, DI);
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only chain dependences between a load and store can be loop carried.
  if (!DI->mayStore() || !SI->mayLoad())
    return false;

  // If we cannot determine how far each access advances per iteration,
  // conservatively report a loop-carried dependence.
  unsigned DeltaS, DeltaD;
  if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
    return true;

  const MachineOperand *BaseOpS, *BaseOpD;
  int64_t OffsetS, OffsetD;
  bool OffsetSIsScalable, OffsetDIsScalable;
  if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
                                    TRI) ||
      !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
                                    TRI))
    return true;

  assert(!OffsetSIsScalable && !OffsetDIsScalable &&
         "Expected offsets to be byte offsets");

  // Both base registers must be defined by Phis in the loop header, or we
  // cannot reason about their per-iteration evolution.
  MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
  MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
  if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
    return true;

  unsigned InitValS = 0;
  unsigned LoopValS = 0;
  unsigned InitValD = 0;
  unsigned LoopValD = 0;
  getPhiRegs(*DefS, BB, InitValS, LoopValS);
  getPhiRegs(*DefD, BB, InitValD, LoopValD);
  MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
  MachineInstr *InitDefD = MRI.getVRegDef(InitValD);

  // The two bases must start from the same initial value.
  if (!InitDefS->isIdenticalTo(*InitDefD))
    return true;

  // Check that the base register is incremented by a constant value for each
  // iteration.
  MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
  int D = 0;
  if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
    return true;

  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (AccessSizeS == MemoryLocation::UnknownSize ||
      AccessSizeD == MemoryLocation::UnknownSize)
    return true;

  if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
    return true;

  return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
}
2313
2314void SwingSchedulerDAG::postProcessDAG() {
2315 for (auto &M : Mutations)
2316 M->apply(this);
2317}
2318
2319/// Try to schedule the node at the specified StartCycle and continue
2320/// until the node is schedule or the EndCycle is reached. This function
2321/// returns true if the node is scheduled. This routine may search either
2322/// forward or backward for a place to insert the instruction based upon
2323/// the relative values of StartCycle and EndCycle.
2324bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2325 bool forward = true;
2326 LLVM_DEBUG({
2327 dbgs() << "Trying to insert node between " << StartCycle << " and "
2328 << EndCycle << " II: " << II << "\n";
2329 });
2330 if (StartCycle > EndCycle)
2331 forward = false;
2332
2333 // The terminating condition depends on the direction.
2334 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2335 for (int curCycle = StartCycle; curCycle != termCycle;
2336 forward ? ++curCycle : --curCycle) {
2337
2338 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2339 ProcItinResources.canReserveResources(*SU, curCycle)) {
2340 LLVM_DEBUG({
2341 dbgs() << "\tinsert at cycle " << curCycle << " ";
2342 SU->getInstr()->dump();
2343 });
2344
2345 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2346 ProcItinResources.reserveResources(*SU, curCycle);
2347 ScheduledInstrs[curCycle].push_back(SU);
2348 InstrToCycle.insert(std::make_pair(SU, curCycle));
2349 if (curCycle > LastCycle)
2350 LastCycle = curCycle;
2351 if (curCycle < FirstCycle)
2352 FirstCycle = curCycle;
2353 return true;
2354 }
2355 LLVM_DEBUG({
2356 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2357 SU->getInstr()->dump();
2358 });
2359 }
2360 return false;
2361}
2362
// Return the cycle of the earliest scheduled instruction in the chain.
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  // Walk the transitive chain of order/output predecessors, keeping the
  // minimum scheduled cycle seen so far.
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSUnit();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    // Nodes not yet scheduled do not constrain the result.
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &PI : PrevSU->Preds)
      if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
        Worklist.push_back(PI);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
2385
// Return the cycle of the latest scheduled instruction in the chain.
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  // Walk the transitive chain of order/output successors, keeping the
  // maximum scheduled cycle seen so far.
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getSUnit();
    // Skip already-visited nodes and the exit-boundary node.
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    // Nodes not yet scheduled do not constrain the result.
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &SI : SuccSU->Succs)
      if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
        Worklist.push_back(SI);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
2408
/// If an instruction has a use that spans multiple iterations, then
/// return the Phi on the back-edge (nullptr otherwise). These instructions
/// are characterized by having a back-edge to a Phi, which contains a
/// reference to another Phi.
  for (auto &P : SU->Preds)
    if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
2420
2421/// Compute the scheduling start slot for the instruction. The start slot
2422/// depends on any predecessor or successor nodes scheduled already.
2423void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2424 int *MinEnd, int *MaxStart, int II,
2425 SwingSchedulerDAG *DAG) {
2426 // Iterate over each instruction that has been scheduled already. The start
2427 // slot computation depends on whether the previously scheduled instruction
2428 // is a predecessor or successor of the specified instruction.
2429 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2430
2431 // Iterate over each instruction in the current cycle.
2432 for (SUnit *I : getInstructions(cycle)) {
2433 // Because we're processing a DAG for the dependences, we recognize
2434 // the back-edge in recurrences by anti dependences.
2435 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2436 const SDep &Dep = SU->Preds[i];
2437 if (Dep.getSUnit() == I) {
2438 if (!DAG->isBackedge(SU, Dep)) {
2439 int EarlyStart = cycle + Dep.getLatency() -
2440 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2441 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2442 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2443 int End = earliestCycleInChain(Dep) + (II - 1);
2444 *MinEnd = std::min(*MinEnd, End);
2445 }
2446 } else {
2447 int LateStart = cycle - Dep.getLatency() +
2448 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2449 *MinLateStart = std::min(*MinLateStart, LateStart);
2450 }
2451 }
2452 // For instruction that requires multiple iterations, make sure that
2453 // the dependent instruction is not scheduled past the definition.
2454 SUnit *BE = multipleIterations(I, DAG);
2455 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2456 !SU->isPred(I))
2457 *MinLateStart = std::min(*MinLateStart, cycle);
2458 }
2459 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2460 if (SU->Succs[i].getSUnit() == I) {
2461 const SDep &Dep = SU->Succs[i];
2462 if (!DAG->isBackedge(SU, Dep)) {
2463 int LateStart = cycle - Dep.getLatency() +
2464 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2465 *MinLateStart = std::min(*MinLateStart, LateStart);
2466 if (DAG->isLoopCarriedDep(SU, Dep)) {
2467 int Start = latestCycleInChain(Dep) + 1 - II;
2468 *MaxStart = std::max(*MaxStart, Start);
2469 }
2470 } else {
2471 int EarlyStart = cycle + Dep.getLatency() -
2472 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2473 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2474 }
2475 }
2476 }
2477 }
2478 }
2479}
2480
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
                                 std::deque<SUnit *> &Insts) {
  MachineInstr *MI = SU->getInstr();
  // Flags describing where SU must be placed relative to the instructions
  // already in Insts.
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  // Positions (indices into Insts) of the def/use that force a move.
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If SU's base register was rewritten earlier, compare against the
      // replacement register instead of the original one.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (unsigned NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        // SU defines a value read by *I in the same or an earlier stage:
        // the definition must be placed before the use.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &S : SU->Succs) {
      if (S.getSUnit() != *I)
        continue;
      if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in previous for loop,
      // and we normally set Latency = 0 for Anti deps,
      // so may have nodes in same cycle with Anti dependent on HW regs.
      else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &P : SU->Preds) {
      if (P.getSUnit() != *I)
        continue;
      if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      // Erase the higher index first so the lower index stays valid.
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    // Recursively re-insert the removed instructions and SU so that each
    // lands in a position consistent with its defs and uses.
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
2610
/// Return true if the scheduled Phi has a loop carried operand.
  if (!Phi.isPHI())
    return false;
  assert(Phi.isPHI() && "Expecting a Phi.");
  // Cycle and stage at which the Phi itself was scheduled.
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  unsigned InitVal = 0;
  unsigned LoopVal = 0;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  // The instruction that produces the Phi's loop-carried input value.
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  // Loop carried if the producer is scheduled after the Phi in cycle order,
  // or in the same or an earlier stage.
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
2632
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
/// v1 = phi(v2, v3)
/// (Def) v3 = op v1
/// (MO) = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
                                       MachineInstr *Def, MachineOperand &MO) {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  // MO must be fed by a loop-carried Phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Check whether Def produces the Phi's loop-carried input value.
  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
2658
/// Determine transitive dependences of unpipelineable instructions
  SmallSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with the instructions the target refuses to pipeline.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    // Everything an unpipelineable instruction depends on must also stay
    // out of later stages.
    for (auto &Dep : SU->Preds)
      Worklist.push_back(Dep.getSUnit());
    // For Phis, also follow anti-dependence successors (the back-edges).
    if (SU->getInstr()->isPHI())
      for (auto &Dep : SU->Succs)
        if (Dep.getKind() == SDep::Anti)
          Worklist.push_back(Dep.getSUnit());
  }
  return DoNotPipeline;
}
2684
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Pipelineable (or already stage-0) instructions keep their slot; they
    // only contribute to the recomputed last cycle.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    for (auto &Dep : SU.Preds)
      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);

    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      // Move SU from its old cycle's instruction list to the new cycle's.
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase_value(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }
    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
2720
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &SI : SU.Succs)
      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
        if (Register::isPhysicalRegister(SI.getReg())) {
          // The physical-register use must be in the same stage as the def...
          if (stageScheduled(SI.getSUnit()) != StageDef)
            return false;
          // ...and strictly after the def in cycle order.
          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
            return false;
        }
  }
  return true;
}
2747
2748/// A property of the node order in swing-modulo-scheduling is
2749/// that for nodes outside circuits the following holds:
2750/// none of them is scheduled after both a successor and a
2751/// predecessor.
2752/// The method below checks whether the property is met.
2753/// If not, debug information is printed and statistics information updated.
2754/// Note that we do not use an assert statement.
2755/// The reason is that although an invalid node oder may prevent
2756/// the pipeliner from finding a pipelined schedule for arbitrary II,
2757/// it does not lead to the generation of incorrect code.
2758void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
2759
2760 // a sorted vector that maps each SUnit to its index in the NodeOrder
2761 typedef std::pair<SUnit *, unsigned> UnitIndex;
2762 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
2763
2764 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
2765 Indices.push_back(std::make_pair(NodeOrder[i], i));
2766
2767 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
2768 return std::get<0>(i1) < std::get<0>(i2);
2769 };
2770
2771 // sort, so that we can perform a binary search
2772 llvm::sort(Indices, CompareKey);
2773
2774 bool Valid = true;
2775 (void)Valid;
2776 // for each SUnit in the NodeOrder, check whether
2777 // it appears after both a successor and a predecessor
2778 // of the SUnit. If this is the case, and the SUnit
2779 // is not part of circuit, then the NodeOrder is not
2780 // valid.
2781 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
2782 SUnit *SU = NodeOrder[i];
2783 unsigned Index = i;
2784
2785 bool PredBefore = false;
2786 bool SuccBefore = false;
2787
2788 SUnit *Succ;
2789 SUnit *Pred;
2790 (void)Succ;
2791 (void)Pred;
2792
2793 for (SDep &PredEdge : SU->Preds) {
2794 SUnit *PredSU = PredEdge.getSUnit();
2795 unsigned PredIndex = std::get<1>(
2796 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
2797 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
2798 PredBefore = true;
2799 Pred = PredSU;
2800 break;
2801 }
2802 }
2803
2804 for (SDep &SuccEdge : SU->Succs) {
2805 SUnit *SuccSU = SuccEdge.getSUnit();
2806 // Do not process a boundary node, it was not included in NodeOrder,
2807 // hence not in Indices either, call to std::lower_bound() below will
2808 // return Indices.end().
2809 if (SuccSU->isBoundaryNode())
2810 continue;
2811 unsigned SuccIndex = std::get<1>(
2812 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
2813 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
2814 SuccBefore = true;
2815 Succ = SuccSU;
2816 break;
2817 }
2818 }
2819
2820 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
2821 // instructions in circuits are allowed to be scheduled
2822 // after both a successor and predecessor.
2823 bool InCircuit = llvm::any_of(
2824 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
2825 if (InCircuit)
2826 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
2827 else {
2828 Valid = false;
2829 NumNodeOrderIssues++;
2830 LLVM_DEBUG(dbgs() << "Predecessor ";);
2831 }
2832 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
2833 << " are scheduled before node " << SU->NodeNum
2834 << "\n";);
2835 }
2836 }
2837
2838 LLVM_DEBUG({
2839 if (!Valid)
2840 dbgs() << "Invalid node order found!\n";
2841 });
2842}
2843
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
/// p' = store_pi(p, b)
/// = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // OverlapReg/NewBaseReg track the (p, p') pair of the most recently seen
  // tied-operand instruction; zero means "no pending candidate".
  unsigned OverlapReg = 0;
  unsigned NewBaseReg = 0;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            // Rewrite the instruction to use p' and compensate the offset.
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        OverlapReg = 0;
        NewBaseReg = 0;
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
2895
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      // Instructions scheduled stage*II cycles later belong to the same
      // position of a later stage; fold them into this cycle.
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    std::deque<SUnit *> newOrderPhi;
    // Phis always come first within a cycle.
    for (SUnit *SU : cycleInstrs) {
      if (SU->getInstr()->isPHI())
        newOrderPhi.push_back(SU);
    }
    std::deque<SUnit *> newOrderI;
    // Non-Phis are inserted so that definitions precede uses in the cycle.
    for (SUnit *SU : cycleInstrs) {
      if (!SU->getInstr()->isPHI())
        orderDependence(SSD, SU, newOrderI);
    }
    // Replace the old order with the new order.
    cycleInstrs.swap(newOrderPhi);
    llvm::append_range(cycleInstrs, newOrderI);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}
2943
2945 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
2946 << " depth " << MaxDepth << " col " << Colocate << "\n";
2947 for (const auto &I : Nodes)
2948 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
2949 os << "\n";
2950}
2951
2952#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2953/// Print the schedule information to the given output.
2955 // Iterate over each cycle.
2956 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2957 // Iterate over each instruction in the cycle.
2958 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
2959 for (SUnit *CI : cycleInstrs->second) {
2960 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
2961 os << "(" << CI->NodeNum << ") ";
2962 CI->getInstr()->print(os);
2963 os << "\n";
2964 }
2965 }
2966}
2967
2968/// Utility function used for debugging to print the schedule.
2971
2972void ResourceManager::dumpMRT() const {
2973 LLVM_DEBUG({
2974 if (UseDFA)
2975 return;
2976 std::stringstream SS;
2977 SS << "MRT:\n";
2978 SS << std::setw(4) << "Slot";
2979 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
2980 SS << std::setw(3) << I;
2981 SS << std::setw(7) << "#Mops"
2982 << "\n";
2983 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
2984 SS << std::setw(4) << Slot;
2985 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
2986 SS << std::setw(3) << MRT[Slot][I];
2987 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
2988 }
2989 dbgs() << SS.str();
2990 });
2991}
2992#endif
2993
2995 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
2996 unsigned ProcResourceID = 0;
2997
2998 // We currently limit the resource kinds to 64 and below so that we can use
2999 // uint64_t for Masks
3000 assert(SM.getNumProcResourceKinds() < 64 &&
3001 "Too many kinds of resources, unsupported");
3002 // Create a unique bitmask for every processor resource unit.
3003 // Skip resource at index 0, since it always references 'InvalidUnit'.
3004 Masks.resize(SM.getNumProcResourceKinds());
3005 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3007 if (Desc.SubUnitsIdxBegin)
3008 continue;
3009 Masks[I] = 1ULL << ProcResourceID;
3010 ProcResourceID++;
3011 }
3012 // Create a unique bitmask for every processor resource group.
3013 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3015 if (!Desc.SubUnitsIdxBegin)
3016 continue;
3017 Masks[I] = 1ULL << ProcResourceID;
3018 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3019 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3020 ProcResourceID++;
3021 }
3022 LLVM_DEBUG({
3023 if (SwpShowResMask) {
3024 dbgs() << "ProcResourceDesc:\n";
3025 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3026 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3027 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3028 ProcResource->Name, I, Masks[I],
3029 ProcResource->NumUnits);
3030 }
3031 dbgs() << " -----------------\n";
3032 }
3033 });
3034}
3035
3037 LLVM_DEBUG({
3038 if (SwpDebugResource)
3039 dbgs() << "canReserveResources:\n";
3040 });
3041 if (UseDFA)
3042 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3043 ->canReserveResources(&SU.getInstr()->getDesc());
3044
3045 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3046 if (!SCDesc->isValid()) {
3047 LLVM_DEBUG({
3048 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3049 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3050 });
3051 return true;
3052 }
3053
3054 reserveResources(SCDesc, Cycle);
3055 bool Result = !isOverbooked();
3056 unreserveResources(SCDesc, Cycle);
3057
3058 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3059 return Result;
3060}
3061
3062void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3063 LLVM_DEBUG({
3064 if (SwpDebugResource)
3065 dbgs() << "reserveResources:\n";
3066 });
3067 if (UseDFA)
3068 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3069 ->reserveResources(&SU.getInstr()->getDesc());
3070
3071 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3072 if (!SCDesc->isValid()) {
3073 LLVM_DEBUG({
3074 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3075 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3076 });
3077 return;
3078 }
3079
3080 reserveResources(SCDesc, Cycle);
3081
3082 LLVM_DEBUG({
3083 if (SwpDebugResource) {
3084 dumpMRT();
3085 dbgs() << "reserveResources: done!\n\n";
3086 }
3087 });
3088}
3089
3090void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3091 int Cycle) {
3092 assert(!UseDFA);
3093 for (const MCWriteProcResEntry &PRE : make_range(
3094 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3095 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3096 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3097
3098 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3099 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3100}
3101
3102void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3103 int Cycle) {
3104 assert(!UseDFA);
3105 for (const MCWriteProcResEntry &PRE : make_range(
3106 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3107 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3108 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3109
3110 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3111 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3112}
3113
3114bool ResourceManager::isOverbooked() const {
3115 assert(!UseDFA);
3116 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3117 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3119 if (MRT[Slot][I] > Desc->NumUnits)
3120 return true;
3121 }
3122 if (NumScheduledMops[Slot] > IssueWidth)
3123 return true;
3124 }
3125 return false;
3126}
3127
3128int ResourceManager::calculateResMIIDFA() const {
3129 assert(UseDFA);
3130
3131 // Sort the instructions by the number of available choices for scheduling,
3132 // least to most. Use the number of critical resources as the tie breaker.
3133 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3134 for (SUnit &SU : DAG->SUnits)
3135 FUS.calcCriticalResources(*SU.getInstr());
3137 FuncUnitOrder(FUS);
3138
3139 for (SUnit &SU : DAG->SUnits)
3140 FuncUnitOrder.push(SU.getInstr());
3141
3143 Resources.push_back(
3144 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3145
3146 while (!FuncUnitOrder.empty()) {
3147 MachineInstr *MI = FuncUnitOrder.top();
3148 FuncUnitOrder.pop();
3149 if (TII->isZeroCost(MI->getOpcode()))
3150 continue;
3151
3152 // Attempt to reserve the instruction in an existing DFA. At least one
3153 // DFA is needed for each cycle.
3154 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3155 unsigned ReservedCycles = 0;
3156 auto *RI = Resources.begin();
3157 auto *RE = Resources.end();
3158 LLVM_DEBUG({
3159 dbgs() << "Trying to reserve resource for " << NumCycles
3160 << " cycles for \n";
3161 MI->dump();
3162 });
3163 for (unsigned C = 0; C < NumCycles; ++C)
3164 while (RI != RE) {
3165 if ((*RI)->canReserveResources(*MI)) {
3166 (*RI)->reserveResources(*MI);
3167 ++ReservedCycles;
3168 break;
3169 }
3170 RI++;
3171 }
3172 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3173 << ", NumCycles:" << NumCycles << "\n");
3174 // Add new DFAs, if needed, to reserve resources.
3175 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3177 << "NewResource created to reserve resources"
3178 << "\n");
3179 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3180 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3181 NewResource->reserveResources(*MI);
3182 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3183 }
3184 }
3185
3186 int Resmii = Resources.size();
3187 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3188 return Resmii;
3189}
3190
3192 if (UseDFA)
3193 return calculateResMIIDFA();
3194
3195 // Count each resource consumption and divide it by the number of units.
3196 // ResMII is the max value among them.
3197
3198 int NumMops = 0;
3200 for (SUnit &SU : DAG->SUnits) {
3201 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3202 continue;
3203
3204 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3205 if (!SCDesc->isValid())
3206 continue;
3207
3208 LLVM_DEBUG({
3209 if (SwpDebugResource) {
3210 DAG->dumpNode(SU);
3211 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3212 << " WriteProcRes: ";
3213 }
3214 });
3215 NumMops += SCDesc->NumMicroOps;
3216 for (const MCWriteProcResEntry &PRE :
3217 make_range(STI->getWriteProcResBegin(SCDesc),
3218 STI->getWriteProcResEnd(SCDesc))) {
3219 LLVM_DEBUG({
3220 if (SwpDebugResource) {
3221 const MCProcResourceDesc *Desc =
3222 SM.getProcResource(PRE.ProcResourceIdx);
3223 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3224 }
3225 });
3226 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3227 }
3228 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3229 }
3230
3231 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3232 LLVM_DEBUG({
3233 if (SwpDebugResource)
3234 dbgs() << "#Mops: " << NumMops << ", "
3235 << "IssueWidth: " << IssueWidth << ", "
3236 << "Cycles: " << Result << "\n";
3237 });
3238
3239 LLVM_DEBUG({
3240 if (SwpDebugResource) {
3241 std::stringstream SS;
3242 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3243 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3244 << "\n";
3245 dbgs() << SS.str();
3246 }
3247 });
3248 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3250 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3251 LLVM_DEBUG({
3252 if (SwpDebugResource) {
3253 std::stringstream SS;
3254 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3255 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3256 << std::setw(10) << Cycles << "\n";
3257 dbgs() << SS.str();
3258 }
3259 });
3260 if (Cycles > Result)
3261 Result = Cycles;
3262 }
3263 return Result;
3264}
3265
3267 InitiationInterval = II;
3268 DFAResources.clear();
3269 DFAResources.resize(II);
3270 for (auto &I : DFAResources)
3271 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3272 MRT.clear();
3274 NumScheduledMops.clear();
3275 NumScheduledMops.resize(II);
3276}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:510
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:469
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:486
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:320
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:302
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:950
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1303
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1309
A single uniqued string.
Definition: Metadata.h:611
StringRef getString() const
Definition: Metadata.cpp:509
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:543
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:326
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:540
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:774
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:895
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:553
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:730
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:146
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO)
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi)
Return true if the scheduled Phi has a loop carried operand.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts)
Order the instructions within a cycle so that the definitions occur before the uses.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:266
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:366
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:431
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:273
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:344
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:280
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:560
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:301
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:522
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:384
iterator end() const
Definition: SmallPtrSet.h:409
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
iterator begin() const
Definition: SmallPtrSet.h:404
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
unsigned getInstrBaseReg(SUnit *SU)
Return the new base register that was stored away for the changed instruction.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1724
LLVM Value Representation.
Definition: Value.h:74
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:440
void stable_sort(R &&Range)
Definition: STLExtras.h:1971
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:349
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1685
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:2037
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1734
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:429
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1652
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1946
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2029
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
bool isValid() const
Definition: MCSchedule.h:136
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:360
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:349
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:334
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:353
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.