LLVM 18.0.0git
MachineCombiner.cpp
Go to the documentation of this file.
1//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The machine combiner pass uses machine trace metrics to ensure the combined
10// instructions do not lengthen the critical path or the resource depth.
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/DenseMap.h"
14#include "llvm/ADT/Statistic.h"
32#include "llvm/Support/Debug.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "machine-combiner"
38
39STATISTIC(NumInstCombined, "Number of machineinst combined");
40
// Command-line knobs for the pass.
// NOTE(review): the "static cl::opt<unsigned>" declaration line (41) was
// dropped by the doc scrape; inc_threshold is the basic-block instruction
// count above which incremental depth updates are used (see
// combineInstructions), default 500.
42inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(500));
45
// When set, every substituted instruction sequence is printed (used from the
// LLVM_DEBUG block in combineInstructions).
46static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
47 cl::desc("Dump all substituted intrs"),
48 cl::init(false));
49
// VerifyPatternOrder defaults to true under EXPENSIVE_CHECKS and false
// otherwise. NOTE(review): the "static cl::opt<bool> VerifyPatternOrder("
// lines (51, 53, 57, 59) are missing from this scrape.
50#ifdef EXPENSIVE_CHECKS
52 "machine-combiner-verify-pattern-order", cl::Hidden,
54 "Verify that the generated patterns are ordered by increasing latency",
55 cl::init(true));
56#else
58 "machine-combiner-verify-pattern-order", cl::Hidden,
60 "Verify that the generated patterns are ordered by increasing latency"),
61 cl::init(false));
62#endif
63
64namespace {
// Pass that substitutes slower instruction sequences with faster ones using
// machine trace metrics (see file header). Several declaration lines in this
// class were dropped by the doc scrape (e.g. parts of the private method
// signatures); code below is kept byte-identical to what was captured.
65class MachineCombiner : public MachineFunctionPass {
66 const TargetSubtargetInfo *STI = nullptr;
67 const TargetInstrInfo *TII = nullptr;
68 const TargetRegisterInfo *TRI = nullptr;
69 MCSchedModel SchedModel;
70 MachineRegisterInfo *MRI = nullptr;
71 MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
72 MachineTraceMetrics *Traces = nullptr;
73 MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
74 MachineBlockFrequencyInfo *MBFI = nullptr;
75 ProfileSummaryInfo *PSI = nullptr;
76 RegisterClassInfo RegClassInfo;
77
78 TargetSchedModel TSchedModel;
79
80 /// True if optimizing for code size.
81 bool OptSize = false;
82
83public:
84 static char ID;
85 MachineCombiner() : MachineFunctionPass(ID) {
// NOTE(review): constructor body line 86 missing from scrape — presumably
// the initializeMachineCombinerPass(...) registry call; confirm upstream.
87 }
88 void getAnalysisUsage(AnalysisUsage &AU) const override;
89 bool runOnMachineFunction(MachineFunction &MF) override;
90 StringRef getPassName() const override { return "Machine InstCombiner"; }
91
92private:
// Worker invoked once per basic block; returns true if it changed the block.
93 bool combineInstructions(MachineBasicBlock *);
94 MachineInstr *getOperandDef(const MachineOperand &MO);
95 bool isTransientMI(const MachineInstr *MI);
96 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
97 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
99 const MachineBasicBlock &MBB);
100 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
101 MachineTraceMetrics::Trace BlockTrace);
102 bool
103 improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
107 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
108 MachineCombinerPattern Pattern, bool SlackIsAccurate);
109 bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
113 bool preservesResourceLen(MachineBasicBlock *MBB,
117 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
119 std::pair<unsigned, unsigned>
120 getLatenciesForInstrSequences(MachineInstr &MI,
123 MachineTraceMetrics::Trace BlockTrace);
124
125 void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
127};
128}
129
// Pass registration boilerplate. NOTE(review): the INITIALIZE_PASS_BEGIN
// line (133) and the INITIALIZE_PASS_DEPENDENCY lines (135-136) are missing
// from this scrape; only fragments of the macros were captured.
130char MachineCombiner::ID = 0;
131char &llvm::MachineCombinerID = MachineCombiner::ID;
132
134 "Machine InstCombiner", false, false)
137INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
139
// Declare the analyses this pass requires and preserves: the CFG is kept
// intact, and loop info / trace metrics are both required and preserved.
140void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
141 AU.setPreservesCFG();
142 AU.addPreserved<MachineDominatorTree>();
143 AU.addRequired<MachineLoopInfo>();
144 AU.addPreserved<MachineLoopInfo>();
145 AU.addRequired<MachineTraceMetrics>();
146 AU.addPreserved<MachineTraceMetrics>();
147 AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
148 AU.addRequired<ProfileSummaryInfoWrapperPass>();
// NOTE(review): line 149 missing from scrape — presumably the call to
// MachineFunctionPass::getAnalysisUsage(AU); confirm against upstream.
150}
151
// Return the unique defining instruction of the virtual register in MO, or
// nullptr when MO is not a virtual register, has no unique def, or is
// defined by a PHI (PHIs carry no useful depth information here).
// NOTE(review): the return-type line (152, "MachineInstr *") is missing from
// this scrape.
153MachineCombiner::getOperandDef(const MachineOperand &MO) {
154 MachineInstr *DefInstr = nullptr;
155 // We need a virtual register definition.
156 if (MO.isReg() && MO.getReg().isVirtual())
157 DefInstr = MRI->getUniqueVRegDef(MO.getReg());
158 // PHI's have no depth etc.
159 if (DefInstr && DefInstr->isPHI())
160 DefInstr = nullptr;
161 return DefInstr;
162}
163
164/// Return true if MI is unlikely to generate an actual target instruction.
165bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
166 if (!MI->isCopy())
167 return MI->isTransient();
168
169 // If MI is a COPY, check if its src and dst registers can be coalesced.
170 Register Dst = MI->getOperand(0).getReg();
171 Register Src = MI->getOperand(1).getReg();
172
173 if (!MI->isFullCopy()) {
174 // If src RC contains super registers of dst RC, it can also be coalesced.
175 if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
176 return false;
177
178 auto SrcSub = MI->getOperand(1).getSubReg();
179 auto SrcRC = MRI->getRegClass(Src);
180 auto DstRC = MRI->getRegClass(Dst);
181 return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
182 }
183
184 if (Src.isPhysical() && Dst.isPhysical())
185 return Src == Dst;
186
187 if (Src.isVirtual() && Dst.isVirtual()) {
188 auto SrcRC = MRI->getRegClass(Src);
189 auto DstRC = MRI->getRegClass(Dst);
190 return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
191 }
192
193 if (Src.isVirtual())
194 std::swap(Src, Dst);
195
196 // Now Src is physical register, Dst is virtual register.
197 auto DstRC = MRI->getRegClass(Dst);
198 return DstRC->contains(Src);
199}
200
201/// Computes depth of instructions in vector \InsInstr.
202///
203/// \param InsInstrs is a vector of machine instructions
204/// \param InstrIdxForVirtReg is a dense map of virtual register to index
205/// of defining machine instruction in \p InsInstrs
206/// \param BlockTrace is a trace of machine instructions
207///
208/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
209unsigned
210MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
211 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
// NOTE(review): parameter line 212 (the MachineTraceMetrics::Trace
// BlockTrace argument) is missing from this scrape.
213 const MachineBasicBlock &MBB) {
214 SmallVector<unsigned, 16> InstrDepth;
215 // For each instruction in the new sequence compute the depth based on the
216 // operands. Use the trace information when possible. For new operands which
217 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
218 for (auto *InstrPtr : InsInstrs) { // for each Use
219 unsigned IDepth = 0;
220 for (const MachineOperand &MO : InstrPtr->all_uses()) {
221 // Check for virtual register operand.
222 if (!MO.getReg().isVirtual())
223 continue;
224 unsigned DepthOp = 0;
225 unsigned LatencyOp = 0;
// NOTE(review): line 226 (the declaration of iterator II, presumably
// "DenseMap<unsigned, unsigned>::iterator II =") is missing from this scrape.
227 InstrIdxForVirtReg.find(MO.getReg());
228 if (II != InstrIdxForVirtReg.end()) {
229 // Operand is new virtual register not in trace
230 assert(II->second < InstrDepth.size() && "Bad Index");
231 MachineInstr *DefInstr = InsInstrs[II->second];
232 assert(DefInstr &&
233 "There must be a definition for a new virtual register");
234 DepthOp = InstrDepth[II->second];
235 int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
236 int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
237 LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
238 InstrPtr, UseIdx);
239 } else {
240 MachineInstr *DefInstr = getOperandDef(MO);
241 if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
242 MachineTraceStrategy::TS_Local ||
243 DefInstr->getParent() == &MBB)) {
244 DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
// Transient instructions (e.g. coalescable COPYs) contribute no latency.
245 if (!isTransientMI(DefInstr))
246 LatencyOp = TSchedModel.computeOperandLatency(
247 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
248 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
249 }
250 }
251 IDepth = std::max(IDepth, DepthOp + LatencyOp);
252 }
253 InstrDepth.push_back(IDepth);
254 }
255 unsigned NewRootIdx = InsInstrs.size() - 1;
256 return InstrDepth[NewRootIdx];
257}
258
259/// Computes instruction latency as max of latency of defined operands.
260///
261/// \param Root is a machine instruction that could be replaced by NewRoot.
262/// It is used to compute a more accurate latency information for NewRoot in
263/// case there is a dependent instruction in the same trace (\p BlockTrace)
264/// \param NewRoot is the instruction for which the latency is computed
265/// \param BlockTrace is a trace of machine instructions
266///
267/// \returns Latency of \p NewRoot
268unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
269 MachineTraceMetrics::Trace BlockTrace) {
270 // Check each definition in NewRoot and compute the latency
271 unsigned NewRootLatency = 0;
272
273 for (const MachineOperand &MO : NewRoot->all_defs()) {
274 // Check for virtual register operand.
275 if (!MO.getReg().isVirtual())
276 continue;
277 // Get the first instruction that uses MO
278 MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
279 RI++;
280 if (RI == MRI->reg_end())
281 continue;
282 MachineInstr *UseMO = RI->getParent();
283 unsigned LatencyOp = 0;
284 if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
285 LatencyOp = TSchedModel.computeOperandLatency(
286 NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
287 UseMO->findRegisterUseOperandIdx(MO.getReg()));
288 } else {
289 LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
290 }
291 NewRootLatency = std::max(NewRootLatency, LatencyOp);
292 }
293 return NewRootLatency;
294}
295
296/// The combiner's goal may differ based on which pattern it is attempting
297/// to optimize.
// NOTE(review): the "enum class CombinerObjective {" header line (298) is
// missing from this scrape.
299 MustReduceDepth, // The data dependency chain must be improved.
300 MustReduceRegisterPressure, // The register pressure must be reduced.
301 Default // The critical path must not be lengthened.
302};
303
// Map a MachineCombinerPattern to the objective used when evaluating it.
// NOTE(review): the signature line (304, "static CombinerObjective
// getCombinerObjective(MachineCombinerPattern P) {") and the return-statement
// lines (320, 323, 325 — presumably returning MustReduceDepth,
// MustReduceRegisterPressure, and Default respectively) are missing from
// this scrape; confirm against upstream.
305 // TODO: If C++ ever gets a real enum class, make this part of the
306 // MachineCombinerPattern class.
307 switch (P) {
308 case MachineCombinerPattern::REASSOC_AX_BY:
309 case MachineCombinerPattern::REASSOC_AX_YB:
310 case MachineCombinerPattern::REASSOC_XA_BY:
311 case MachineCombinerPattern::REASSOC_XA_YB:
312 case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
313 case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
314 case MachineCombinerPattern::SUBADD_OP1:
315 case MachineCombinerPattern::SUBADD_OP2:
316 case MachineCombinerPattern::FMADD_AX:
317 case MachineCombinerPattern::FMADD_XA:
318 case MachineCombinerPattern::FMSUB:
319 case MachineCombinerPattern::FNMSUB:
321 case MachineCombinerPattern::REASSOC_XY_BCA:
322 case MachineCombinerPattern::REASSOC_XY_BAC:
324 default:
326 }
327}
328
329/// Estimate the latency of the new and original instruction sequence by summing
330/// up the latencies of the inserted and deleted instructions. This assumes
331/// that the inserted and deleted instructions are dependent instruction chains,
332/// which might not hold in all cases.
///
/// \returns a {NewRootLatency, RootLatency} pair.
/// NOTE(review): parameter lines 334-335 (the InsInstrs/DelInstrs
/// SmallVectorImpl arguments) are missing from this scrape.
333std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
336 MachineTraceMetrics::Trace BlockTrace) {
337 assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
338 unsigned NewRootLatency = 0;
339 // NewRoot is the last instruction in the \p InsInstrs vector.
340 MachineInstr *NewRoot = InsInstrs.back();
// Sum the latencies of all inserted instructions except NewRoot, then add
// NewRoot's own (trace-aware) latency.
341 for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
342 NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
343 NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
344
345 unsigned RootLatency = 0;
346 for (auto *I : DelInstrs)
347 RootLatency += TSchedModel.computeInstrLatency(I);
348
349 return {NewRootLatency, RootLatency};
350}
351
// Decide whether a register-pressure-reduction pattern should fire.
// Currently unconditionally true (see FIXME below). NOTE(review): parameter
// lines 353-356 are missing from this scrape.
352bool MachineCombiner::reduceRegisterPressure(
357 // FIXME: for now, we don't do any check for the register pressure patterns.
358 // We treat them as always profitable. But we can do better if we make
359 // RegPressureTracker class be aware of TIE attribute. Then we can get an
360 // accurate compare of register pressure with DelInstrs or InsInstrs.
361 return true;
362}
363
364/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
365/// The new code sequence ends in MI NewRoot. A necessary condition for the new
366/// sequence to replace the old sequence is that it cannot lengthen the critical
367/// path. The definition of "improve" may be restricted by specifying that the
368/// new path improves the data dependency chain (MustReduceDepth).
///
/// NOTE(review): parameter lines 370-373 and 375 (BlockTrace, InsInstrs,
/// DelInstrs, Pattern) are missing from this scrape.
369bool MachineCombiner::improvesCriticalPathLen(
374 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
376 bool SlackIsAccurate) {
377 // Get depth and latency of NewRoot and Root.
378 unsigned NewRootDepth =
379 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
380 unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
381
382 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
383 << NewRootDepth << "\tRootDepth: " << RootDepth);
384
385 // For a transform such as reassociation, the cost equation is
386 // conservatively calculated so that we must improve the depth (data
387 // dependency cycles) in the critical path to proceed with the transform.
388 // Being conservative also protects against inaccuracies in the underlying
389 // machine trace metrics and CPU models.
// NOTE(review): the guarding "if" line (390, presumably testing
// getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) is
// missing from this scrape.
391 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
392 LLVM_DEBUG(NewRootDepth < RootDepth
393 ? dbgs() << "\t and it does it\n"
394 : dbgs() << "\t but it does NOT do it\n");
395 return NewRootDepth < RootDepth;
396 }
397
398 // A more flexible cost calculation for the critical path includes the slack
399 // of the original code sequence. This may allow the transform to proceed
400 // even if the instruction depths (data dependency cycles) become worse.
401
402 // Account for the latency of the inserted and deleted instructions by
403 unsigned NewRootLatency, RootLatency;
404 if (TII->accumulateInstrSeqToRootLatency(*Root)) {
405 std::tie(NewRootLatency, RootLatency) =
406 getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
407 } else {
408 NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
409 RootLatency = TSchedModel.computeInstrLatency(Root);
410 }
411
// Accept the transform when the new cycle count does not exceed the old one;
// the old count may include Root's slack when that slack is trustworthy.
412 unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
413 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
414 unsigned OldCycleCount =
415 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
416 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
417 << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
418 << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
419 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
420 << "\n\tRootDepth + RootLatency + RootSlack = "
421 << OldCycleCount;);
422 LLVM_DEBUG(NewCycleCount <= OldCycleCount
423 ? dbgs() << "\n\t It IMPROVES PathLen because"
424 : dbgs() << "\n\t It DOES NOT improve PathLen because");
425 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
426 << ", OldCycleCount = " << OldCycleCount << "\n");
427
428 return NewCycleCount <= OldCycleCount;
429}
430
431/// helper routine to convert instructions into SC
///
/// Appends the MCSchedClassDesc of each instruction in \p Instrs to the
/// output vector. NOTE(review): parameter lines 433-434 (Instrs is captured
/// above; the InstrsSC output SmallVectorImpl argument) are missing from
/// this scrape.
432void MachineCombiner::instr2instrSC(
435 for (auto *InstrPtr : Instrs) {
436 unsigned Opc = InstrPtr->getOpcode();
437 unsigned Idx = TII->get(Opc).getSchedClass();
438 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
439 InstrsSC.push_back(SC);
440 }
441}
442
443/// True when the new instructions do not increase resource length
///
/// Compares the trace's resource length before and after the substitution,
/// allowing growth up to TII->getExtendResourceLenLimit(). NOTE(review):
/// parameter lines 445-447 (BlockTrace, InsInstrs, DelInstrs) are missing
/// from this scrape.
444bool MachineCombiner::preservesResourceLen(
// Without an instruction scheduling model there is nothing to compare.
448 if (!TSchedModel.hasInstrSchedModel())
449 return true;
450
451 // Compute current resource length
452
453 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
454 SmallVector <const MachineBasicBlock *, 1> MBBarr;
455 MBBarr.push_back(MBB);
456 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
457
458 // Deal with SC rather than Instructions.
// NOTE(review): lines 459-460 (the declarations of InsInstrsSC and
// DelInstrsSC SmallVectors) are missing from this scrape.
461
462 instr2instrSC(InsInstrs, InsInstrsSC);
463 instr2instrSC(DelInstrs, DelInstrsSC);
464
465 ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
466 ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
467
468 // Compute new resource length.
469 unsigned ResLenAfterCombine =
470 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
471
472 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
473 << ResLenBeforeCombine
474 << " and after: " << ResLenAfterCombine << "\n";);
// NOTE(review): the opening "LLVM_DEBUG(" line (475) is missing from this
// scrape.
476 ResLenAfterCombine <=
477 ResLenBeforeCombine + TII->getExtendResourceLenLimit()
478 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
479 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
480 "Length\n");
481
482 return ResLenAfterCombine <=
483 ResLenBeforeCombine + TII->getExtendResourceLenLimit();
484}
485
486/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
487/// depths if requested.
488///
489/// \param MBB basic block to insert instructions in
490/// \param MI current machine instruction
491/// \param InsInstrs new instructions to insert in \p MBB
492/// \param DelInstrs instruction to delete from \p MBB
493/// \param TraceEnsemble is a pointer to the machine trace information
494/// \param RegUnits set of live registers, needed to compute instruction depths
495/// \param TII is target instruction info, used to call target hook
496/// \param Pattern is used to call target hook finalizeInsInstrs
497/// \param IncrementalUpdate if true, compute instruction depths incrementally,
498/// otherwise invalidate the trace
///
/// NOTE(review): the "static void insertDeleteInstructions(" signature lines
/// (499-502, 504) are only partially present in this scrape.
503 MachineTraceMetrics::Ensemble *TraceEnsemble,
505 MachineCombinerPattern Pattern, bool IncrementalUpdate) {
506 // If we want to fix up some placeholder for some target, do it now.
507 // We need this because in genAlternativeCodeSequence, we have not decided the
508 // better pattern InsInstrs or DelInstrs, so we don't want generate some
509 // sideeffect to the function. For example we need to delay the constant pool
510 // entry creation here after InsInstrs is selected as better pattern.
511 // Otherwise the constant pool entry created for InsInstrs will not be deleted
512 // even if InsInstrs is not the better pattern.
513 TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
514
// NOTE(review): loop body line 516 (presumably the MBB->insert(...) call
// placing each new instruction before MI) is missing from this scrape.
515 for (auto *InstrPtr : InsInstrs)
517
518 for (auto *InstrPtr : DelInstrs) {
519 InstrPtr->eraseFromParent();
520 // Erase all LiveRegs defined by the removed instruction
521 for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
522 if (I->MI == InstrPtr)
523 I = RegUnits.erase(I);
524 else
525 I++;
526 }
527 }
528
// Either refresh depths for just the inserted instructions or drop the whole
// trace for this block.
529 if (IncrementalUpdate)
530 for (auto *InstrPtr : InsInstrs)
531 TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
532 else
533 TraceEnsemble->invalidate(MBB);
534
535 NumInstCombined++;
536}
537
538// Check that the difference between original and new latency is decreasing for
539// later patterns. This helps to discover sub-optimal pattern orderings.
// NOTE(review): the parameter lines (541-542, the Patterns SmallVector
// argument) are missing from this scrape.
540void MachineCombiner::verifyPatternOrder(
543 long PrevLatencyDiff = std::numeric_limits<long>::max();
544 (void)PrevLatencyDiff; // Variable is used in assert only.
545 for (auto P : Patterns) {
// NOTE(review): lines 546-547 (the InsInstrs/DelInstrs SmallVector
// declarations) are missing from this scrape.
548 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
549 TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
550 InstrIdxForVirtReg);
551 // Found pattern, but did not generate alternative sequence.
552 // This can happen e.g. when an immediate could not be materialized
553 // in a single instruction.
554 if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
555 continue;
556
557 unsigned NewRootLatency, RootLatency;
558 std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
559 Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
// A later pattern must not save more latency than an earlier one.
560 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
561 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
562 "Current pattern is better than previous pattern.");
563 PrevLatencyDiff = CurrentLatencyDiff;
564 }
565}
566
567/// Substitute a slow code sequence with a faster one by
568/// evaluating instruction combining pattern.
569/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
570/// combining based on machine trace metrics. Only combine a sequence of
571/// instructions when this neither lengthens the critical path nor increases
572/// resource pressure. When optimizing for codesize always combine when the new
573/// sequence is shorter.
///
/// \returns true when the basic block was modified.
574bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
575 bool Changed = false;
576 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
577
578 bool IncrementalUpdate = false;
579 auto BlockIter = MBB->begin();
580 decltype(BlockIter) LastUpdate;
581 // Check if the block is in a loop.
582 const MachineLoop *ML = MLI->getLoopFor(MBB);
583 if (!TraceEnsemble)
584 TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
585
586 SparseSet<LiveRegUnit> RegUnits;
587 RegUnits.setUniverse(TRI->getNumRegUnits());
588
589 bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
590
591 bool DoRegPressureReduce =
592 TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
593
594 while (BlockIter != MBB->end()) {
595 auto &MI = *BlockIter++;
// NOTE(review): line 596 (presumably the declaration of the Patterns
// SmallVector filled by getMachineCombinerPatterns below) is missing from
// this scrape.
597 // The motivating example is:
598 //
599 // MUL Other MUL_op1 MUL_op2 Other
600 // \ / \ | /
601 // ADD/SUB => MADD/MSUB
602 // (=Root) (=NewRoot)
603
604 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
605 // usually beneficial for code size it unfortunately can hurt performance
606 // when the ADD is on the critical path, but the MUL is not. With the
607 // substitution the MUL becomes part of the critical path (in form of the
608 // MADD) and can lengthen it on architectures where the MADD latency is
609 // longer than the ADD latency.
610 //
611 // For each instruction we check if it can be the root of a combiner
612 // pattern. Then for each pattern the new code sequence in form of MI is
613 // generated and evaluated. When the efficiency criteria (don't lengthen
614 // critical path, don't use more resources) is met the new sequence gets
615 // hooked up into the basic block before the old sequence is removed.
616 //
617 // The algorithm does not try to evaluate all patterns and pick the best.
618 // This is only an artificial restriction though. In practice there is
619 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
620 // based on an internal cost heuristic. If
621 // machine-combiner-verify-pattern-order is enabled, all patterns are
622 // checked to ensure later patterns do not provide better latency savings.
623
624 if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
625 continue;
626
// NOTE(review): the guarding "if (VerifyPatternOrder)" line (627) is missing
// from this scrape.
628 verifyPatternOrder(MBB, MI, Patterns);
629
630 for (const auto P : Patterns) {
// NOTE(review): lines 631-632 (the per-pattern InsInstrs/DelInstrs
// SmallVector declarations) are missing from this scrape.
633 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
634 TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
635 InstrIdxForVirtReg);
636 // Found pattern, but did not generate alternative sequence.
637 // This can happen e.g. when an immediate could not be materialized
638 // in a single instruction.
639 if (InsInstrs.empty())
640 continue;
641
// NOTE(review): line 642 (the opening of the LLVM_DEBUG block guarded by
// dump_intrs) is missing from this scrape.
643 dbgs() << "\tFor the Pattern (" << (int)P
644 << ") these instructions could be removed\n";
645 for (auto const *InstrPtr : DelInstrs)
646 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
647 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
648 dbgs() << "\tThese instructions could replace the removed ones\n";
649 for (auto const *InstrPtr : InsInstrs)
650 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
651 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
652 });
653
654 if (IncrementalUpdate && LastUpdate != BlockIter) {
655 // Update depths since the last incremental update.
656 TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
657 LastUpdate = BlockIter;
658 }
659
// NOTE(review): lines 661-662 (the remainder of this condition, presumably
// checking getCombinerObjective(P) ==
// CombinerObjective::MustReduceRegisterPressure) are missing from this
// scrape.
660 if (DoRegPressureReduce &&
663 if (MBB->size() > inc_threshold) {
664 // Use incremental depth updates for basic blocks above threshold
665 IncrementalUpdate = true;
666 LastUpdate = BlockIter;
667 }
668 if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
669 // Replace DelInstrs with InsInstrs.
670 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
671 RegUnits, TII, P, IncrementalUpdate);
672 Changed |= true;
673
674 // Go back to previous instruction as it may have ILP reassociation
675 // opportunity.
676 BlockIter--;
677 break;
678 }
679 }
680
681 if (ML && TII->isThroughputPattern(P)) {
682 LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
683 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
684 RegUnits, TII, P, IncrementalUpdate);
685 // Eagerly stop after the first pattern fires.
686 Changed = true;
687 break;
688 } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
689 LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
690 << InsInstrs.size() << " < "
691 << DelInstrs.size() << ")\n");
692 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
693 RegUnits, TII, P, IncrementalUpdate);
694 // Eagerly stop after the first pattern fires.
695 Changed = true;
696 break;
697 } else {
698 // For big basic blocks, we only compute the full trace the first time
699 // we hit this. We do not invalidate the trace, but instead update the
700 // instruction depths incrementally.
701 // NOTE: Only the instruction depths up to MI are accurate. All other
702 // trace information is not updated.
703 MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
704 Traces->verifyAnalysis();
705 if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
706 InstrIdxForVirtReg, P,
707 !IncrementalUpdate) &&
708 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
709 if (MBB->size() > inc_threshold) {
710 // Use incremental depth updates for basic blocks above treshold
711 IncrementalUpdate = true;
712 LastUpdate = BlockIter;
713 }
714
715 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
716 RegUnits, TII, P, IncrementalUpdate);
717
718 // Eagerly stop after the first pattern fires.
719 Changed = true;
720 break;
721 }
722 // Cleanup instructions of the alternative code sequence. There is no
723 // use for them.
// NOTE(review): line 724 (presumably "MachineFunction *MF =
// MBB->getParent();") is missing from this scrape.
725 for (auto *InstrPtr : InsInstrs)
726 MF->deleteMachineInstr(InstrPtr);
727 }
728 InstrIdxForVirtReg.clear();
729 }
730 }
731
732 if (Changed && IncrementalUpdate)
733 Traces->invalidate(MBB);
734 return Changed;
735}
736
// Pass entry point: cache subtarget/analysis handles, bail out when the
// target opts out of machine combining, then run combineInstructions on
// every basic block. \returns true if any block was changed.
737bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
738 STI = &MF.getSubtarget();
739 TII = STI->getInstrInfo();
740 TRI = STI->getRegisterInfo();
741 SchedModel = STI->getSchedModel();
742 TSchedModel.init(STI);
743 MRI = &MF.getRegInfo();
744 MLI = &getAnalysis<MachineLoopInfo>();
745 Traces = &getAnalysis<MachineTraceMetrics>();
746 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
// Block frequency info is only materialized when profile data is available.
747 MBFI = (PSI && PSI->hasProfileSummary()) ?
748 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
749 nullptr;
750 TraceEnsemble = nullptr;
751 OptSize = MF.getFunction().hasOptSize();
752 RegClassInfo.runOnMachineFunction(MF);
753
754 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
755 if (!TII->useMachineCombiner()) {
// NOTE(review): line 756 (the opening "LLVM_DEBUG(") is missing from this
// scrape.
757 dbgs()
758 << " Skipping pass: Target does not support machine combiner\n");
759 return false;
760 }
761
762 bool Changed = false;
763
764 // Try to combine instructions.
765 for (auto &MBB : MF)
766 Changed |= combineInstructions(&MBB);
767
768 return Changed;
769}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst)
Gets latency information for Inst, based on DC information.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*-===//
#define I(x, y, z)
Definition: MD5.cpp:58
static CombinerObjective getCombinerObjective(MachineCombinerPattern P)
static cl::opt< bool > VerifyPatternOrder("machine-combiner-verify-pattern-order", cl::Hidden, cl::desc("Verify that the generated patterns are ordered by increasing latency"), cl::init(false))
static cl::opt< unsigned > inc_threshold("machine-combiner-inc-threshold", cl::Hidden, cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500))
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, SparseSet< LiveRegUnit > &RegUnits, const TargetInstrInfo *TII, MachineCombinerPattern Pattern, bool IncrementalUpdate)
Inserts InsInstrs and deletes DelInstrs.
static cl::opt< bool > dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden, cl::desc("Dump all substituted intrs"), cl::init(false))
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
#define DEBUG_TYPE
unsigned const TargetRegisterInfo * TRI
#define P(N)
if(VerifyEach)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Represent the analysis usage information of a pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:647
The core instruction combiner logic.
Definition: InstCombiner.h:46
This is an alternative analysis pass to MachineBlockFrequencyInfo.
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Representation of each machine instruction.
Definition: MachineInstr.h:68
int findRegisterUseOperandIdx(Register Reg, bool isKill=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:326
int findRegisterDefOperandIdx(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool isPHI() const
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:730
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A trace ensemble is a collection of traces selected using the same strategy, for example 'minimum res...
void invalidate(const MachineBasicBlock *MBB)
Invalidate traces through BadMBB.
void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, SparseSet< LiveRegUnit > &RegUnits)
Updates the depth of an machine instruction, given RegUnits.
A trace represents a plausible sequence of executed basic blocks that passes through the current basi...
InstrCycles getInstrCycles(const MachineInstr &MI) const
Return the depth and height of MI.
unsigned getInstrSlack(const MachineInstr &MI) const
Return the slack of MI.
bool isDepInTrace(const MachineInstr &DefMI, const MachineInstr &UseMI) const
A dependence is useful if the basic block of the defining instruction is part of the trace of the use...
unsigned getResourceLength(ArrayRef< const MachineBasicBlock * > Extrablocks=std::nullopt, ArrayRef< const MCSchedClassDesc * > ExtraInstrs=std::nullopt, ArrayRef< const MCSchedClassDesc * > RemoveInstrs=std::nullopt) const
Return the resource length of the trace.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
SparseSet - Fast set implementation for objects that can be identified by small unsigned keys.
Definition: SparseSet.h:124
iterator erase(iterator I)
erase - Erases an existing element identified by a valid iterator.
Definition: SparseSet.h:289
const_iterator begin() const
Definition: SparseSet.h:174
const_iterator end() const
Definition: SparseSet.h:175
void setUniverse(unsigned U)
setUniverse - Set the universe size which determines the largest key the set can hold.
Definition: SparseSet.h:155
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
char & MachineCombinerID
This pass performs instruction combining using trace metrics to estimate critical-path and resource d...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineCombinerPattern
These are instruction patterns matched by the machine combiner pass.
void initializeMachineCombinerPass(PassRegistry &)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
unsigned Depth
Earliest issue cycle as determined by data dependencies and instruction latencies from the beginning ...