LLVM 23.0.0git
MachineCombiner.cpp
Go to the documentation of this file.
1//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The machine combiner pass uses machine trace metrics to ensure the combined
10// instructions do not lengthen the critical path or the resource depth.
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/DenseMap.h"
14#include "llvm/ADT/Statistic.h"
32#include "llvm/Support/Debug.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "machine-combiner"
38
39STATISTIC(NumInstCombined, "Number of machineinst combined");
40
42inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(500));
45
46static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
47 cl::desc("Dump all substituted intrs"),
48 cl::init(false));
49
50#ifdef EXPENSIVE_CHECKS
52 "machine-combiner-verify-pattern-order", cl::Hidden,
54 "Verify that the generated patterns are ordered by increasing latency"),
55 cl::init(true));
56#else
58 "machine-combiner-verify-pattern-order", cl::Hidden,
60 "Verify that the generated patterns are ordered by increasing latency"),
61 cl::init(false));
62#endif
63
64namespace {
65class MachineCombiner : public MachineFunctionPass {
66 const TargetSubtargetInfo *STI = nullptr;
67 const TargetInstrInfo *TII = nullptr;
68 const TargetRegisterInfo *TRI = nullptr;
69 MCSchedModel SchedModel;
70 MachineRegisterInfo *MRI = nullptr;
71 MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
72 MachineTraceMetrics *Traces = nullptr;
73 MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
74 MachineBlockFrequencyInfo *MBFI = nullptr;
75 ProfileSummaryInfo *PSI = nullptr;
76 RegisterClassInfo RegClassInfo;
77
78 TargetSchedModel TSchedModel;
79
80public:
81 static char ID;
82 MachineCombiner() : MachineFunctionPass(ID) {}
83 void getAnalysisUsage(AnalysisUsage &AU) const override;
84 bool runOnMachineFunction(MachineFunction &MF) override;
85 StringRef getPassName() const override { return "Machine InstCombiner"; }
86
87private:
88 bool combineInstructions(MachineBasicBlock *);
89 MachineInstr *getOperandDef(const MachineOperand &MO);
90 bool isTransientMI(const MachineInstr *MI);
91 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
92 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
94 const MachineBasicBlock &MBB);
95 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
97 bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
99 SmallVectorImpl<MachineInstr *> &InsInstrs,
100 SmallVectorImpl<MachineInstr *> &DelInstrs,
101 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
102 unsigned Pattern, bool SlackIsAccurate);
103 bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
104 SmallVectorImpl<MachineInstr *> &InsInstrs,
105 SmallVectorImpl<MachineInstr *> &DelInstrs,
106 unsigned Pattern);
107 bool preservesResourceLen(MachineBasicBlock *MBB,
109 SmallVectorImpl<MachineInstr *> &InsInstrs,
110 SmallVectorImpl<MachineInstr *> &DelInstrs);
111 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
112 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
113 std::pair<unsigned, unsigned>
114 getLatenciesForInstrSequences(MachineInstr &MI,
115 SmallVectorImpl<MachineInstr *> &InsInstrs,
116 SmallVectorImpl<MachineInstr *> &DelInstrs,
117 MachineTraceMetrics::Trace BlockTrace);
118
119 CombinerObjective getCombinerObjective(unsigned Pattern);
120};
121}
122
123char MachineCombiner::ID = 0;
124char &llvm::MachineCombinerID = MachineCombiner::ID;
125
127 "Machine InstCombiner", false, false)
130INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
132
133void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
134 AU.setPreservesCFG();
135 AU.addPreserved<MachineDominatorTreeWrapperPass>();
136 AU.addRequired<MachineLoopInfoWrapperPass>();
137 AU.addPreserved<MachineLoopInfoWrapperPass>();
138 AU.addRequired<MachineTraceMetricsWrapperPass>();
139 AU.addPreserved<MachineTraceMetricsWrapperPass>();
140 AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
141 AU.addRequired<ProfileSummaryInfoWrapperPass>();
143}
144
146MachineCombiner::getOperandDef(const MachineOperand &MO) {
147 MachineInstr *DefInstr = nullptr;
148 // We need a virtual register definition.
149 if (MO.isReg() && MO.getReg().isVirtual())
150 DefInstr = MRI->getUniqueVRegDef(MO.getReg());
151 return DefInstr;
152}
153
154/// Return true if MI is unlikely to generate an actual target instruction.
155bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
156 if (!MI->isCopy())
157 return MI->isTransient();
158
159 // If MI is a COPY, check if its src and dst registers can be coalesced.
160 Register Dst = MI->getOperand(0).getReg();
161 Register Src = MI->getOperand(1).getReg();
162
163 if (!MI->isFullCopy()) {
164 // If src RC contains super registers of dst RC, it can also be coalesced.
165 if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
166 return false;
167
168 auto SrcSub = MI->getOperand(1).getSubReg();
169 auto SrcRC = MRI->getRegClass(Src);
170 auto DstRC = MRI->getRegClass(Dst);
171 return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
172 }
173
174 if (Src.isPhysical() && Dst.isPhysical())
175 return Src == Dst;
176
177 if (Src.isVirtual() && Dst.isVirtual()) {
178 auto SrcRC = MRI->getRegClass(Src);
179 auto DstRC = MRI->getRegClass(Dst);
180 return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
181 }
182
183 if (Src.isVirtual())
184 std::swap(Src, Dst);
185
186 // Now Src is physical register, Dst is virtual register.
187 auto DstRC = MRI->getRegClass(Dst);
188 return DstRC->contains(Src);
189}
190
191/// Computes depth of instructions in vector \InsInstr.
192///
193/// \param InsInstrs is a vector of machine instructions
194/// \param InstrIdxForVirtReg is a dense map of virtual register to index
195/// of defining machine instruction in \p InsInstrs
196/// \param BlockTrace is a trace of machine instructions
197///
198/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
199unsigned
200MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
201 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
203 const MachineBasicBlock &MBB) {
204 SmallVector<unsigned, 16> InstrDepth;
205 // For each instruction in the new sequence compute the depth based on the
206 // operands. Use the trace information when possible. For new operands which
207 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
208 for (auto *InstrPtr : InsInstrs) { // for each Use
209 unsigned IDepth = 0;
210 for (const MachineOperand &MO : InstrPtr->all_uses()) {
211 // Check for virtual register operand.
212 if (!MO.getReg().isVirtual())
213 continue;
214 unsigned DepthOp = 0;
215 unsigned LatencyOp = 0;
216 DenseMap<Register, unsigned>::iterator II =
217 InstrIdxForVirtReg.find(MO.getReg());
218 if (II != InstrIdxForVirtReg.end()) {
219 // Operand is new virtual register not in trace
220 assert(II->second < InstrDepth.size() && "Bad Index");
221 MachineInstr *DefInstr = InsInstrs[II->second];
222 assert(DefInstr &&
223 "There must be a definition for a new virtual register");
224 DepthOp = InstrDepth[II->second];
225 int DefIdx =
226 DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
227 int UseIdx =
228 InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
229 LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
230 InstrPtr, UseIdx);
231 } else {
232 MachineInstr *DefInstr = getOperandDef(MO);
233 if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
234 MachineTraceStrategy::TS_Local ||
235 DefInstr->getParent() == &MBB)) {
236 DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
237 if (!isTransientMI(DefInstr))
238 LatencyOp = TSchedModel.computeOperandLatency(
239 DefInstr,
240 DefInstr->findRegisterDefOperandIdx(MO.getReg(),
241 /*TRI=*/nullptr),
242 InstrPtr,
243 InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
244 /*TRI=*/nullptr));
245 }
246 }
247 IDepth = std::max(IDepth, DepthOp + LatencyOp);
248 }
249 InstrDepth.push_back(IDepth);
250 }
251 unsigned NewRootIdx = InsInstrs.size() - 1;
252 return InstrDepth[NewRootIdx];
253}
254
/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO
    // NOTE(review): reg_begin() appears to point at NewRoot's own def of MO,
    // and the increment skips it to reach the first user. This assumes the
    // def entry is first in the use-def chain — confirm against
    // MachineRegisterInfo ordering.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      // The user is on Root's trace: use the precise def->use operand latency.
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      // Otherwise fall back to NewRoot's whole-instruction latency.
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    // Overall latency is the worst case over all defined operands.
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
293
294CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
295 // TODO: If C++ ever gets a real enum class, make this part of the
296 // MachineCombinerPattern class.
297 switch (Pattern) {
298 case MachineCombinerPattern::REASSOC_AX_BY:
299 case MachineCombinerPattern::REASSOC_AX_YB:
300 case MachineCombinerPattern::REASSOC_XA_BY:
301 case MachineCombinerPattern::REASSOC_XA_YB:
302 return CombinerObjective::MustReduceDepth;
303 default:
304 return TII->getCombinerObjective(Pattern);
305 }
306}
307
308/// Estimate the latency of the new and original instruction sequence by summing
309/// up the latencies of the inserted and deleted instructions. This assumes
310/// that the inserted and deleted instructions are dependent instruction chains,
311/// which might not hold in all cases.
312std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
313 MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
314 SmallVectorImpl<MachineInstr *> &DelInstrs,
315 MachineTraceMetrics::Trace BlockTrace) {
316 assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
317 unsigned NewRootLatency = 0;
318 // NewRoot is the last instruction in the \p InsInstrs vector.
319 MachineInstr *NewRoot = InsInstrs.back();
320 for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
321 NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
322 NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
323
324 unsigned RootLatency = 0;
325 for (auto *I : DelInstrs)
326 RootLatency += TSchedModel.computeInstrLatency(I);
327
328 return {NewRootLatency, RootLatency};
329}
330
331bool MachineCombiner::reduceRegisterPressure(
332 MachineInstr &Root, MachineBasicBlock *MBB,
333 SmallVectorImpl<MachineInstr *> &InsInstrs,
334 SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
335 // FIXME: for now, we don't do any check for the register pressure patterns.
336 // We treat them as always profitable. But we can do better if we make
337 // RegPressureTracker class be aware of TIE attribute. Then we can get an
338 // accurate compare of register pressure with DelInstrs or InsInstrs.
339 return true;
340}
341
342/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
343/// The new code sequence ends in MI NewRoot. A necessary condition for the new
344/// sequence to replace the old sequence is that it cannot lengthen the critical
345/// path. The definition of "improve" may be restricted by specifying that the
346/// new path improves the data dependency chain (MustReduceDepth).
347bool MachineCombiner::improvesCriticalPathLen(
348 MachineBasicBlock *MBB, MachineInstr *Root,
350 SmallVectorImpl<MachineInstr *> &InsInstrs,
351 SmallVectorImpl<MachineInstr *> &DelInstrs,
352 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
353 bool SlackIsAccurate) {
354 // Get depth and latency of NewRoot and Root.
355 unsigned NewRootDepth =
356 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
357 unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
358
359 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
360 << NewRootDepth << "\tRootDepth: " << RootDepth);
361
362 // For a transform such as reassociation, the cost equation is
363 // conservatively calculated so that we must improve the depth (data
364 // dependency cycles) in the critical path to proceed with the transform.
365 // Being conservative also protects against inaccuracies in the underlying
366 // machine trace metrics and CPU models.
367 if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
368 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
369 LLVM_DEBUG(NewRootDepth < RootDepth
370 ? dbgs() << "\t and it does it\n"
371 : dbgs() << "\t but it does NOT do it\n");
372 return NewRootDepth < RootDepth;
373 }
374
375 // A more flexible cost calculation for the critical path includes the slack
376 // of the original code sequence. This may allow the transform to proceed
377 // even if the instruction depths (data dependency cycles) become worse.
378
379 // Account for the latency of the inserted and deleted instructions by
380 unsigned NewRootLatency, RootLatency;
381 if (TII->accumulateInstrSeqToRootLatency(*Root)) {
382 std::tie(NewRootLatency, RootLatency) =
383 getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
384 } else {
385 NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
386 RootLatency = TSchedModel.computeInstrLatency(Root);
387 }
388
389 unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
390 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
391 unsigned OldCycleCount =
392 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
393 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
394 << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
395 << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
396 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
397 << "\n\tRootDepth + RootLatency + RootSlack = "
398 << OldCycleCount);
399 LLVM_DEBUG(NewCycleCount <= OldCycleCount
400 ? dbgs() << "\n\t It IMPROVES PathLen because"
401 : dbgs() << "\n\t It DOES NOT improve PathLen because");
402 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
403 << ", OldCycleCount = " << OldCycleCount << "\n");
404
405 return NewCycleCount <= OldCycleCount;
406}
407
408/// helper routine to convert instructions into SC
409void MachineCombiner::instr2instrSC(
410 SmallVectorImpl<MachineInstr *> &Instrs,
411 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
412 for (auto *InstrPtr : Instrs) {
413 unsigned Opc = InstrPtr->getOpcode();
414 unsigned Idx = TII->get(Opc).getSchedClass();
415 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
416 InstrsSC.push_back(SC);
417 }
418}
419
420/// True when the new instructions do not increase resource length
421bool MachineCombiner::preservesResourceLen(
422 MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
423 SmallVectorImpl<MachineInstr *> &InsInstrs,
424 SmallVectorImpl<MachineInstr *> &DelInstrs) {
425 if (!TSchedModel.hasInstrSchedModel())
426 return true;
427
428 // Compute current resource length
429
430 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
432 MBBarr.push_back(MBB);
433 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
434
435 // Deal with SC rather than Instructions.
438
439 instr2instrSC(InsInstrs, InsInstrsSC);
440 instr2instrSC(DelInstrs, DelInstrsSC);
441
442 ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
443 ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
444
445 // Compute new resource length.
446 unsigned ResLenAfterCombine =
447 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
448
449 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
450 << ResLenBeforeCombine
451 << " and after: " << ResLenAfterCombine << "\n");
453 ResLenAfterCombine <=
454 ResLenBeforeCombine + TII->getExtendResourceLenLimit()
455 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
456 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
457 "Length\n");
458
459 return ResLenAfterCombine <=
460 ResLenBeforeCombine + TII->getExtendResourceLenLimit();
461}
462
463/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
464/// depths if requested.
465///
466/// \param MBB basic block to insert instructions in
467/// \param MI current machine instruction
468/// \param InsInstrs new instructions to insert in \p MBB
469/// \param DelInstrs instruction to delete from \p MBB
470/// \param TraceEnsemble is a pointer to the machine trace information
471/// \param RegUnits set of live registers, needed to compute instruction depths
472/// \param TII is target instruction info, used to call target hook
473/// \param Pattern is used to call target hook finalizeInsInstrs
474/// \param IncrementalUpdate if true, compute instruction depths incrementally,
475/// otherwise invalidate the trace
476static void
480 MachineTraceMetrics::Ensemble *TraceEnsemble,
481 LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII,
482 unsigned Pattern, bool IncrementalUpdate) {
483 // If we want to fix up some placeholder for some target, do it now.
484 // We need this because in genAlternativeCodeSequence, we have not decided the
485 // better pattern InsInstrs or DelInstrs, so we don't want generate some
486 // sideeffect to the function. For example we need to delay the constant pool
487 // entry creation here after InsInstrs is selected as better pattern.
488 // Otherwise the constant pool entry created for InsInstrs will not be deleted
489 // even if InsInstrs is not the better pattern.
490 TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
491
492 for (auto *InstrPtr : InsInstrs)
493 MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
494
495 for (auto *InstrPtr : DelInstrs) {
496 InstrPtr->eraseFromParent();
497 // Erase all LiveRegs defined by the removed instruction
498 for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
499 if (I->MI == InstrPtr)
500 I = RegUnits.erase(I);
501 else
502 I++;
503 }
504 }
505
506 if (IncrementalUpdate)
507 for (auto *InstrPtr : InsInstrs)
508 TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
509 else
510 TraceEnsemble->invalidate(MBB);
511
512 NumInstCombined++;
513}
514
515/// Substitute a slow code sequence with a faster one by
516/// evaluating instruction combining pattern.
517/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
518/// combining based on machine trace metrics. Only combine a sequence of
519/// instructions when this neither lengthens the critical path nor increases
520/// resource pressure. When optimizing for codesize always combine when the new
521/// sequence is shorter.
522bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
523 bool Changed = false;
524 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
525
526 bool IncrementalUpdate = false;
527 auto BlockIter = MBB->begin();
528 decltype(BlockIter) LastUpdate;
529 // Check if the block is in a loop.
530 const MachineLoop *ML = MLI->getLoopFor(MBB);
531 if (!TraceEnsemble)
532 TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
533
534 LiveRegUnitSet RegUnits;
535 RegUnits.setUniverse(TRI->getNumRegUnits());
536
537 bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
538
539 bool DoRegPressureReduce =
540 TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
541
542 while (BlockIter != MBB->end()) {
543 auto &MI = *BlockIter++;
544 SmallVector<unsigned, 16> Patterns;
545 // The motivating example is:
546 //
547 // MUL Other MUL_op1 MUL_op2 Other
548 // \ / \ | /
549 // ADD/SUB => MADD/MSUB
550 // (=Root) (=NewRoot)
551
552 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
553 // usually beneficial for code size it unfortunately can hurt performance
554 // when the ADD is on the critical path, but the MUL is not. With the
555 // substitution the MUL becomes part of the critical path (in form of the
556 // MADD) and can lengthen it on architectures where the MADD latency is
557 // longer than the ADD latency.
558 //
559 // For each instruction we check if it can be the root of a combiner
560 // pattern. Then for each pattern the new code sequence in form of MI is
561 // generated and evaluated. When the efficiency criteria (don't lengthen
562 // critical path, don't use more resources) is met the new sequence gets
563 // hooked up into the basic block before the old sequence is removed.
564 //
565 // The algorithm does not try to evaluate all patterns and pick the best.
566 // This is only an artificial restriction though. In practice there is
567 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
568 // based on an internal cost heuristic. If
569 // machine-combiner-verify-pattern-order is enabled, all patterns are
570 // checked to ensure later patterns do not provide better latency savings.
571
572 if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
573 continue;
574
575 // Only used when VerifyPatternOrder is enabled.
576 [[maybe_unused]] long PrevLatencyDiff = std::numeric_limits<long>::max();
577
578 for (const auto P : Patterns) {
581 DenseMap<Register, unsigned> InstrIdxForVirtReg;
582 TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
583 InstrIdxForVirtReg);
584 // Found pattern, but did not generate alternative sequence.
585 // This can happen e.g. when an immediate could not be materialized
586 // in a single instruction.
587 if (InsInstrs.empty())
588 continue;
589
591 dbgs() << "\tFor the Pattern (" << (int)P
592 << ") these instructions could be removed\n";
593 for (auto const *InstrPtr : DelInstrs)
594 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
595 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
596 dbgs() << "\tThese instructions could replace the removed ones\n";
597 for (auto const *InstrPtr : InsInstrs)
598 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
599 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
600 });
601
602 // Check that the difference between original and new latency is
603 // decreasing for later patterns. This helps to discover sub-optimal
604 // pattern orderings.
606 auto [NewRootLatency, RootLatency] = getLatenciesForInstrSequences(
607 MI, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
608 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
609 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
610 "Current pattern is expected to be better than the previous "
611 "pattern.");
612 PrevLatencyDiff = CurrentLatencyDiff;
613 }
614
615 if (IncrementalUpdate && LastUpdate != BlockIter) {
616 // Update depths since the last incremental update.
617 TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
618 LastUpdate = BlockIter;
619 }
620
621 if (DoRegPressureReduce &&
622 getCombinerObjective(P) ==
623 CombinerObjective::MustReduceRegisterPressure) {
624 if (MBB->size() > inc_threshold) {
625 // Use incremental depth updates for basic blocks above threshold
626 IncrementalUpdate = true;
627 LastUpdate = BlockIter;
628 }
629 if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
630 // Replace DelInstrs with InsInstrs.
631 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
632 RegUnits, TII, P, IncrementalUpdate);
633 Changed |= true;
634
635 // Go back to previous instruction as it may have ILP reassociation
636 // opportunity.
637 BlockIter--;
638 break;
639 }
640 }
641
642 if (ML && TII->isThroughputPattern(P)) {
643 LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
644 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
645 RegUnits, TII, P, IncrementalUpdate);
646 // Eagerly stop after the first pattern fires.
647 Changed = true;
648 break;
649 } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
650 LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
651 << InsInstrs.size() << " < "
652 << DelInstrs.size() << ")\n");
653 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
654 RegUnits, TII, P, IncrementalUpdate);
655 // Eagerly stop after the first pattern fires.
656 Changed = true;
657 break;
658 } else {
659 // For big basic blocks, we only compute the full trace the first time
660 // we hit this. We do not invalidate the trace, but instead update the
661 // instruction depths incrementally.
662 // NOTE: Only the instruction depths up to MI are accurate. All other
663 // trace information is not updated.
664 MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
665 Traces->verifyAnalysis();
666 if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
667 InstrIdxForVirtReg, P,
668 !IncrementalUpdate) &&
669 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
670 if (MBB->size() > inc_threshold) {
671 // Use incremental depth updates for basic blocks above treshold
672 IncrementalUpdate = true;
673 LastUpdate = BlockIter;
674 }
675
676 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
677 RegUnits, TII, P, IncrementalUpdate);
678
679 // Eagerly stop after the first pattern fires.
680 Changed = true;
681 break;
682 }
683 // Cleanup instructions of the alternative code sequence. There is no
684 // use for them.
685 MachineFunction *MF = MBB->getParent();
686 for (auto *InstrPtr : InsInstrs)
687 MF->deleteMachineInstr(InstrPtr);
688 }
689 InstrIdxForVirtReg.clear();
690 }
691 }
692
693 if (Changed && IncrementalUpdate)
694 Traces->invalidate(MBB);
695 return Changed;
696}
697
698bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
699 STI = &MF.getSubtarget();
700 TII = STI->getInstrInfo();
701 TRI = STI->getRegisterInfo();
702 SchedModel = STI->getSchedModel();
703 TSchedModel.init(STI);
704 MRI = &MF.getRegInfo();
705 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
706 Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
707 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
708 MBFI = (PSI && PSI->hasProfileSummary()) ?
709 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
710 nullptr;
711 TraceEnsemble = nullptr;
712 RegClassInfo.runOnMachineFunction(MF);
713
714 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
715 if (!TII->useMachineCombiner()) {
717 dbgs()
718 << " Skipping pass: Target does not support machine combiner\n");
719 return false;
720 }
721
722 bool Changed = false;
723
724 // Try to combine instructions.
725 for (auto &MBB : MF)
726 Changed |= combineInstructions(&MBB);
727
728 return Changed;
729}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*–===//
#define I(x, y, z)
Definition MD5.cpp:57
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII, unsigned Pattern, bool IncrementalUpdate)
Inserts InsInstrs and deletes DelInstrs.
static cl::opt< bool > VerifyPatternOrder("machine-combiner-verify-pattern-order", cl::Hidden, cl::desc("Verify that the generated patterns are ordered by increasing latency"), cl::init(false))
static cl::opt< unsigned > inc_threshold("machine-combiner-inc-threshold", cl::Hidden, cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500))
static cl::opt< bool > dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden, cl::desc("Dump all substituted intrs"), cl::init(false))
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Represent the analysis usage information of a pass.
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
bool useMachineCombiner() const override
This is an alternative analysis pass to MachineBlockFrequencyInfo.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
Analysis pass which computes a MachineDominatorTree.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
LLVM_ABI int findRegisterUseOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isKill=false) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
defusechain_iterator< true, true, false, true, false > reg_iterator
reg_iterator/reg_begin/reg_end - Walk all defs and uses of the specified register.
A trace ensemble is a collection of traces selected using the same strategy, for example 'minimum res...
void invalidate(const MachineBasicBlock *MBB)
Invalidate traces through BadMBB.
void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, LiveRegUnitSet &RegUnits)
Updates the depth of a machine instruction, given RegUnits.
void updateDepths(MachineBasicBlock::iterator Start, MachineBasicBlock::iterator End, LiveRegUnitSet &RegUnits)
Updates the depth of the instructions from Start to End.
Trace getTrace(const MachineBasicBlock *MBB)
Get the trace that passes through MBB.
unsigned getResourceLength(ArrayRef< const MachineBasicBlock * > Extrablocks={}, ArrayRef< const MCSchedClassDesc * > ExtraInstrs={}, ArrayRef< const MCSchedClassDesc * > RemoveInstrs={}) const
Return the resource length of the trace.
InstrCycles getInstrCycles(const MachineInstr &MI) const
Return the depth and height of MI.
unsigned getInstrSlack(const MachineInstr &MI) const
Return the slack of MI.
bool isDepInTrace(const MachineInstr &DefMI, const MachineInstr &UseMI) const
A dependence is useful if the basic block of the defining instruction is part of the trace of the use...
Ensemble * getEnsemble(MachineTraceStrategy)
Get the trace ensemble representing the given trace selection strategy.
void invalidate(const MachineBasicBlock *MBB)
Invalidate cached information about MBB.
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF, bool Rev=false)
runOnFunction - Prepare to answer questions about MF.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
iterator erase(iterator I)
erase - Erases an existing element identified by a valid iterator.
Definition SparseSet.h:287
const_iterator begin() const
Definition SparseSet.h:170
const_iterator end() const
Definition SparseSet.h:171
void setUniverse(unsigned U)
setUniverse - Set the universe size which determines the largest key the set can hold.
Definition SparseSet.h:152
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
bool hasInstrSchedModelOrItineraries() const
Return true if this machine model includes an instruction-level scheduling model or cycle-to-cycle it...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Changed
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
LLVM_ABI char & MachineCombinerID
This pass performs instruction combining using trace metrics to estimate critical-path and resource d...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
SparseSet< LiveRegUnit, MCRegUnit, MCRegUnitToIndex > LiveRegUnitSet
ArrayRef(const T &OneElt) -> ArrayRef< T >
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
unsigned Depth
Earliest issue cycle as determined by data dependencies and instruction latencies from the beginning ...