LLVM 23.0.0git
MachineCombiner.cpp
Go to the documentation of this file.
//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"
39STATISTIC(NumInstCombined, "Number of machineinst combined");
40
42inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(500));
45
46static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
47 cl::desc("Dump all substituted intrs"),
48 cl::init(false));
49
50#ifdef EXPENSIVE_CHECKS
52 "machine-combiner-verify-pattern-order", cl::Hidden,
54 "Verify that the generated patterns are ordered by increasing latency"),
55 cl::init(true));
56#else
58 "machine-combiner-verify-pattern-order", cl::Hidden,
60 "Verify that the generated patterns are ordered by increasing latency"),
61 cl::init(false));
62#endif
63
64namespace {
65class MachineCombiner : public MachineFunctionPass {
66 const TargetSubtargetInfo *STI = nullptr;
67 const TargetInstrInfo *TII = nullptr;
68 const TargetRegisterInfo *TRI = nullptr;
69 MCSchedModel SchedModel;
70 MachineRegisterInfo *MRI = nullptr;
71 MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
72 MachineTraceMetrics *Traces = nullptr;
73 MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
74 MachineBlockFrequencyInfo *MBFI = nullptr;
75 ProfileSummaryInfo *PSI = nullptr;
76 RegisterClassInfo RegClassInfo;
77
78 TargetSchedModel TSchedModel;
79
80public:
81 static char ID;
82 MachineCombiner() : MachineFunctionPass(ID) {}
83 void getAnalysisUsage(AnalysisUsage &AU) const override;
84 bool runOnMachineFunction(MachineFunction &MF) override;
85 StringRef getPassName() const override { return "Machine InstCombiner"; }
86
87private:
88 bool combineInstructions(MachineBasicBlock *);
89 MachineInstr *getOperandDef(const MachineOperand &MO);
90 bool isTransientMI(const MachineInstr *MI);
91 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
92 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
94 const MachineBasicBlock &MBB);
95 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
97 bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
99 SmallVectorImpl<MachineInstr *> &InsInstrs,
100 SmallVectorImpl<MachineInstr *> &DelInstrs,
101 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
102 unsigned Pattern, bool SlackIsAccurate);
103 bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
104 SmallVectorImpl<MachineInstr *> &InsInstrs,
105 SmallVectorImpl<MachineInstr *> &DelInstrs,
106 unsigned Pattern);
107 bool preservesResourceLen(MachineBasicBlock *MBB,
109 SmallVectorImpl<MachineInstr *> &InsInstrs,
110 SmallVectorImpl<MachineInstr *> &DelInstrs);
111 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
112 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
113 std::pair<unsigned, unsigned>
114 getLatenciesForInstrSequences(MachineInstr &MI,
115 SmallVectorImpl<MachineInstr *> &InsInstrs,
116 SmallVectorImpl<MachineInstr *> &DelInstrs,
117 MachineTraceMetrics::Trace BlockTrace);
118
119 CombinerObjective getCombinerObjective(unsigned Pattern);
120};
121}
122
123char MachineCombiner::ID = 0;
124char &llvm::MachineCombinerID = MachineCombiner::ID;
125
127 "Machine InstCombiner", false, false)
130INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
132
133void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
134 AU.setPreservesCFG();
135 AU.addPreserved<MachineDominatorTreeWrapperPass>();
136 AU.addRequired<MachineLoopInfoWrapperPass>();
137 AU.addPreserved<MachineLoopInfoWrapperPass>();
138 AU.addRequired<MachineTraceMetricsWrapperPass>();
139 AU.addPreserved<MachineTraceMetricsWrapperPass>();
140 AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
141 AU.addRequired<ProfileSummaryInfoWrapperPass>();
143}
144
146MachineCombiner::getOperandDef(const MachineOperand &MO) {
147 MachineInstr *DefInstr = nullptr;
148 // We need a virtual register definition.
149 if (MO.isReg() && MO.getReg().isVirtual())
150 DefInstr = MRI->getUniqueVRegDef(MO.getReg());
151 return DefInstr;
152}
153
154/// Return true if MI is unlikely to generate an actual target instruction.
155bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
156 if (!MI->isCopy())
157 return MI->isTransient();
158
159 // If MI is a COPY, check if its src and dst registers can be coalesced.
160 Register Dst = MI->getOperand(0).getReg();
161 Register Src = MI->getOperand(1).getReg();
162
163 if (!MI->isFullCopy()) {
164 // If src RC contains super registers of dst RC, it can also be coalesced.
165 if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
166 return false;
167
168 auto SrcSub = MI->getOperand(1).getSubReg();
169 auto SrcRC = MRI->getRegClass(Src);
170 auto DstRC = MRI->getRegClass(Dst);
171 return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
172 }
173
174 if (Src.isPhysical() && Dst.isPhysical())
175 return Src == Dst;
176
177 if (Src.isVirtual() && Dst.isVirtual()) {
178 auto SrcRC = MRI->getRegClass(Src);
179 auto DstRC = MRI->getRegClass(Dst);
180 return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
181 }
182
183 if (Src.isVirtual())
184 std::swap(Src, Dst);
185
186 // Now Src is physical register, Dst is virtual register.
187 auto DstRC = MRI->getRegClass(Dst);
188 return DstRC->contains(Src);
189}
190
191/// Computes depth of instructions in vector \InsInstr.
192///
193/// \param InsInstrs is a vector of machine instructions
194/// \param InstrIdxForVirtReg is a dense map of virtual register to index
195/// of defining machine instruction in \p InsInstrs
196/// \param BlockTrace is a trace of machine instructions
197///
198/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
199unsigned
200MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
201 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
203 const MachineBasicBlock &MBB) {
204 SmallVector<unsigned, 16> InstrDepth;
205 // For each instruction in the new sequence compute the depth based on the
206 // operands. Use the trace information when possible. For new operands which
207 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
208 for (auto *InstrPtr : InsInstrs) { // for each Use
209 unsigned IDepth = 0;
210 for (const MachineOperand &MO : InstrPtr->all_uses()) {
211 // Check for virtual register operand.
212 if (!MO.getReg().isVirtual())
213 continue;
214 unsigned DepthOp = 0;
215 unsigned LatencyOp = 0;
216 auto II = InstrIdxForVirtReg.find(MO.getReg());
217 if (II != InstrIdxForVirtReg.end()) {
218 // Operand is new virtual register not in trace
219 assert(II->second < InstrDepth.size() && "Bad Index");
220 MachineInstr *DefInstr = InsInstrs[II->second];
221 assert(DefInstr &&
222 "There must be a definition for a new virtual register");
223 DepthOp = InstrDepth[II->second];
224 int DefIdx =
225 DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
226 int UseIdx =
227 InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
228 LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
229 InstrPtr, UseIdx);
230 } else {
231 MachineInstr *DefInstr = getOperandDef(MO);
232 if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
233 MachineTraceStrategy::TS_Local ||
234 DefInstr->getParent() == &MBB)) {
235 DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
236 if (!isTransientMI(DefInstr))
237 LatencyOp = TSchedModel.computeOperandLatency(
238 DefInstr,
239 DefInstr->findRegisterDefOperandIdx(MO.getReg(),
240 /*TRI=*/nullptr),
241 InstrPtr,
242 InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
243 /*TRI=*/nullptr));
244 }
245 }
246 IDepth = std::max(IDepth, DepthOp + LatencyOp);
247 }
248 InstrDepth.push_back(IDepth);
249 }
250 unsigned NewRootIdx = InsInstrs.size() - 1;
251 return InstrDepth[NewRootIdx];
252}
253
254/// Computes instruction latency as max of latency of defined operands.
255///
256/// \param Root is a machine instruction that could be replaced by NewRoot.
257/// It is used to compute a more accurate latency information for NewRoot in
258/// case there is a dependent instruction in the same trace (\p BlockTrace)
259/// \param NewRoot is the instruction for which the latency is computed
260/// \param BlockTrace is a trace of machine instructions
261///
262/// \returns Latency of \p NewRoot
263unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
264 MachineTraceMetrics::Trace BlockTrace) {
265 // Check each definition in NewRoot and compute the latency
266 unsigned NewRootLatency = 0;
267
268 for (const MachineOperand &MO : NewRoot->all_defs()) {
269 // Check for virtual register operand.
270 if (!MO.getReg().isVirtual())
271 continue;
272 // Get the first instruction that uses MO
274 RI++;
275 if (RI == MRI->reg_end())
276 continue;
277 MachineInstr *UseMO = RI->getParent();
278 unsigned LatencyOp = 0;
279 if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
280 LatencyOp = TSchedModel.computeOperandLatency(
281 NewRoot,
282 NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
283 UseMO,
284 UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
285 } else {
286 LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
287 }
288 NewRootLatency = std::max(NewRootLatency, LatencyOp);
289 }
290 return NewRootLatency;
291}
292
293CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
294 // TODO: If C++ ever gets a real enum class, make this part of the
295 // MachineCombinerPattern class.
296 switch (Pattern) {
297 case MachineCombinerPattern::REASSOC_AX_BY:
298 case MachineCombinerPattern::REASSOC_AX_YB:
299 case MachineCombinerPattern::REASSOC_XA_BY:
300 case MachineCombinerPattern::REASSOC_XA_YB:
301 return CombinerObjective::MustReduceDepth;
302 default:
303 return TII->getCombinerObjective(Pattern);
304 }
305}
306
307/// Estimate the latency of the new and original instruction sequence by summing
308/// up the latencies of the inserted and deleted instructions. This assumes
309/// that the inserted and deleted instructions are dependent instruction chains,
310/// which might not hold in all cases.
311std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
312 MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
313 SmallVectorImpl<MachineInstr *> &DelInstrs,
314 MachineTraceMetrics::Trace BlockTrace) {
315 assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
316 unsigned NewRootLatency = 0;
317 // NewRoot is the last instruction in the \p InsInstrs vector.
318 MachineInstr *NewRoot = InsInstrs.back();
319 for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
320 NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
321 NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
322
323 unsigned RootLatency = 0;
324 for (auto *I : DelInstrs)
325 RootLatency += TSchedModel.computeInstrLatency(I);
326
327 return {NewRootLatency, RootLatency};
328}
329
330bool MachineCombiner::reduceRegisterPressure(
331 MachineInstr &Root, MachineBasicBlock *MBB,
332 SmallVectorImpl<MachineInstr *> &InsInstrs,
333 SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
334 // FIXME: for now, we don't do any check for the register pressure patterns.
335 // We treat them as always profitable. But we can do better if we make
336 // RegPressureTracker class be aware of TIE attribute. Then we can get an
337 // accurate compare of register pressure with DelInstrs or InsInstrs.
338 return true;
339}
340
341/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
342/// The new code sequence ends in MI NewRoot. A necessary condition for the new
343/// sequence to replace the old sequence is that it cannot lengthen the critical
344/// path. The definition of "improve" may be restricted by specifying that the
345/// new path improves the data dependency chain (MustReduceDepth).
346bool MachineCombiner::improvesCriticalPathLen(
347 MachineBasicBlock *MBB, MachineInstr *Root,
349 SmallVectorImpl<MachineInstr *> &InsInstrs,
350 SmallVectorImpl<MachineInstr *> &DelInstrs,
351 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
352 bool SlackIsAccurate) {
353 // Get depth and latency of NewRoot and Root.
354 unsigned NewRootDepth =
355 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
356 unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
357
358 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
359 << NewRootDepth << "\tRootDepth: " << RootDepth);
360
361 // For a transform such as reassociation, the cost equation is
362 // conservatively calculated so that we must improve the depth (data
363 // dependency cycles) in the critical path to proceed with the transform.
364 // Being conservative also protects against inaccuracies in the underlying
365 // machine trace metrics and CPU models.
366 if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
367 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
368 LLVM_DEBUG(NewRootDepth < RootDepth
369 ? dbgs() << "\t and it does it\n"
370 : dbgs() << "\t but it does NOT do it\n");
371 return NewRootDepth < RootDepth;
372 }
373
374 // A more flexible cost calculation for the critical path includes the slack
375 // of the original code sequence. This may allow the transform to proceed
376 // even if the instruction depths (data dependency cycles) become worse.
377
378 // Account for the latency of the inserted and deleted instructions by
379 unsigned NewRootLatency, RootLatency;
380 if (TII->accumulateInstrSeqToRootLatency(*Root)) {
381 std::tie(NewRootLatency, RootLatency) =
382 getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
383 } else {
384 NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
385 RootLatency = TSchedModel.computeInstrLatency(Root);
386 }
387
388 unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
389 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
390 unsigned OldCycleCount =
391 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
392 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
393 << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
394 << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
395 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
396 << "\n\tRootDepth + RootLatency + RootSlack = "
397 << OldCycleCount);
398 LLVM_DEBUG(NewCycleCount <= OldCycleCount
399 ? dbgs() << "\n\t It IMPROVES PathLen because"
400 : dbgs() << "\n\t It DOES NOT improve PathLen because");
401 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
402 << ", OldCycleCount = " << OldCycleCount << "\n");
403
404 return NewCycleCount <= OldCycleCount;
405}
406
407/// helper routine to convert instructions into SC
408void MachineCombiner::instr2instrSC(
409 SmallVectorImpl<MachineInstr *> &Instrs,
410 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
411 for (auto *InstrPtr : Instrs) {
412 unsigned Opc = InstrPtr->getOpcode();
413 unsigned Idx = TII->get(Opc).getSchedClass();
414 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
415 InstrsSC.push_back(SC);
416 }
417}
418
419/// True when the new instructions do not increase resource length
420bool MachineCombiner::preservesResourceLen(
421 MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
422 SmallVectorImpl<MachineInstr *> &InsInstrs,
423 SmallVectorImpl<MachineInstr *> &DelInstrs) {
424 if (!TSchedModel.hasInstrSchedModel())
425 return true;
426
427 // Compute current resource length
428
429 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
431 MBBarr.push_back(MBB);
432 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
433
434 // Deal with SC rather than Instructions.
437
438 instr2instrSC(InsInstrs, InsInstrsSC);
439 instr2instrSC(DelInstrs, DelInstrsSC);
440
441 ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
442 ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
443
444 // Compute new resource length.
445 unsigned ResLenAfterCombine =
446 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
447
448 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
449 << ResLenBeforeCombine
450 << " and after: " << ResLenAfterCombine << "\n");
452 ResLenAfterCombine <=
453 ResLenBeforeCombine + TII->getExtendResourceLenLimit()
454 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
455 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
456 "Length\n");
457
458 return ResLenAfterCombine <=
459 ResLenBeforeCombine + TII->getExtendResourceLenLimit();
460}
461
462/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
463/// depths if requested.
464///
465/// \param MBB basic block to insert instructions in
466/// \param MI current machine instruction
467/// \param InsInstrs new instructions to insert in \p MBB
468/// \param DelInstrs instruction to delete from \p MBB
469/// \param TraceEnsemble is a pointer to the machine trace information
470/// \param RegUnits set of live registers, needed to compute instruction depths
471/// \param TII is target instruction info, used to call target hook
472/// \param Pattern is used to call target hook finalizeInsInstrs
473/// \param IncrementalUpdate if true, compute instruction depths incrementally,
474/// otherwise invalidate the trace
475static void
479 MachineTraceMetrics::Ensemble *TraceEnsemble,
480 LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII,
481 unsigned Pattern, bool IncrementalUpdate) {
482 // If we want to fix up some placeholder for some target, do it now.
483 // We need this because in genAlternativeCodeSequence, we have not decided the
484 // better pattern InsInstrs or DelInstrs, so we don't want generate some
485 // sideeffect to the function. For example we need to delay the constant pool
486 // entry creation here after InsInstrs is selected as better pattern.
487 // Otherwise the constant pool entry created for InsInstrs will not be deleted
488 // even if InsInstrs is not the better pattern.
489 TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
490
491 for (auto *InstrPtr : InsInstrs)
492 MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
493
494 for (auto *InstrPtr : DelInstrs) {
495 InstrPtr->eraseFromParent();
496 // Erase all LiveRegs defined by the removed instruction
497 for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
498 if (I->MI == InstrPtr)
499 I = RegUnits.erase(I);
500 else
501 I++;
502 }
503 }
504
505 if (IncrementalUpdate)
506 for (auto *InstrPtr : InsInstrs)
507 TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
508 else
509 TraceEnsemble->invalidate(MBB);
510
511 NumInstCombined++;
512}
513
514/// Substitute a slow code sequence with a faster one by
515/// evaluating instruction combining pattern.
516/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
517/// combining based on machine trace metrics. Only combine a sequence of
518/// instructions when this neither lengthens the critical path nor increases
519/// resource pressure. When optimizing for codesize always combine when the new
520/// sequence is shorter.
521bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
522 bool Changed = false;
523 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
524
525 bool IncrementalUpdate = false;
526 auto BlockIter = MBB->begin();
527 decltype(BlockIter) LastUpdate;
528 // Check if the block is in a loop.
529 const MachineLoop *ML = MLI->getLoopFor(MBB);
530 if (!TraceEnsemble)
531 TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
532
533 LiveRegUnitSet RegUnits;
534 RegUnits.setUniverse(TRI->getNumRegUnits());
535
536 bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
537
538 bool DoRegPressureReduce =
539 TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
540
541 while (BlockIter != MBB->end()) {
542 auto &MI = *BlockIter++;
543 SmallVector<unsigned, 16> Patterns;
544 // The motivating example is:
545 //
546 // MUL Other MUL_op1 MUL_op2 Other
547 // \ / \ | /
548 // ADD/SUB => MADD/MSUB
549 // (=Root) (=NewRoot)
550
551 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
552 // usually beneficial for code size it unfortunately can hurt performance
553 // when the ADD is on the critical path, but the MUL is not. With the
554 // substitution the MUL becomes part of the critical path (in form of the
555 // MADD) and can lengthen it on architectures where the MADD latency is
556 // longer than the ADD latency.
557 //
558 // For each instruction we check if it can be the root of a combiner
559 // pattern. Then for each pattern the new code sequence in form of MI is
560 // generated and evaluated. When the efficiency criteria (don't lengthen
561 // critical path, don't use more resources) is met the new sequence gets
562 // hooked up into the basic block before the old sequence is removed.
563 //
564 // The algorithm does not try to evaluate all patterns and pick the best.
565 // This is only an artificial restriction though. In practice there is
566 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
567 // based on an internal cost heuristic. If
568 // machine-combiner-verify-pattern-order is enabled, all patterns are
569 // checked to ensure later patterns do not provide better latency savings.
570
571 if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
572 continue;
573
574 // Only used when VerifyPatternOrder is enabled.
575 [[maybe_unused]] long PrevLatencyDiff = std::numeric_limits<long>::max();
576
577 for (const auto P : Patterns) {
580 DenseMap<Register, unsigned> InstrIdxForVirtReg;
581 TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
582 InstrIdxForVirtReg);
583 // Found pattern, but did not generate alternative sequence.
584 // This can happen e.g. when an immediate could not be materialized
585 // in a single instruction.
586 if (InsInstrs.empty())
587 continue;
588
590 dbgs() << "\tFor the Pattern (" << (int)P
591 << ") these instructions could be removed\n";
592 for (auto const *InstrPtr : DelInstrs)
593 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
594 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
595 dbgs() << "\tThese instructions could replace the removed ones\n";
596 for (auto const *InstrPtr : InsInstrs)
597 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
598 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
599 });
600
601 // Check that the difference between original and new latency is
602 // decreasing for later patterns. This helps to discover sub-optimal
603 // pattern orderings.
605 auto [NewRootLatency, RootLatency] = getLatenciesForInstrSequences(
606 MI, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
607 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
608 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
609 "Current pattern is expected to be better than the previous "
610 "pattern.");
611 PrevLatencyDiff = CurrentLatencyDiff;
612 }
613
614 if (IncrementalUpdate && LastUpdate != BlockIter) {
615 // Update depths since the last incremental update.
616 TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
617 LastUpdate = BlockIter;
618 }
619
620 if (DoRegPressureReduce &&
621 getCombinerObjective(P) ==
622 CombinerObjective::MustReduceRegisterPressure) {
623 if (MBB->size() > inc_threshold) {
624 // Use incremental depth updates for basic blocks above threshold
625 IncrementalUpdate = true;
626 LastUpdate = BlockIter;
627 }
628 if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
629 // Replace DelInstrs with InsInstrs.
630 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
631 RegUnits, TII, P, IncrementalUpdate);
632 Changed |= true;
633
634 // Go back to previous instruction as it may have ILP reassociation
635 // opportunity.
636 BlockIter--;
637 break;
638 }
639 }
640
641 if (ML && TII->isThroughputPattern(P)) {
642 LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
643 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
644 RegUnits, TII, P, IncrementalUpdate);
645 // Eagerly stop after the first pattern fires.
646 Changed = true;
647 break;
648 } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
649 LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
650 << InsInstrs.size() << " < "
651 << DelInstrs.size() << ")\n");
652 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
653 RegUnits, TII, P, IncrementalUpdate);
654 // Eagerly stop after the first pattern fires.
655 Changed = true;
656 break;
657 } else {
658 // For big basic blocks, we only compute the full trace the first time
659 // we hit this. We do not invalidate the trace, but instead update the
660 // instruction depths incrementally.
661 // NOTE: Only the instruction depths up to MI are accurate. All other
662 // trace information is not updated.
663 MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
664 Traces->verifyAnalysis();
665 if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
666 InstrIdxForVirtReg, P,
667 !IncrementalUpdate) &&
668 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
669 if (MBB->size() > inc_threshold) {
670 // Use incremental depth updates for basic blocks above treshold
671 IncrementalUpdate = true;
672 LastUpdate = BlockIter;
673 }
674
675 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
676 RegUnits, TII, P, IncrementalUpdate);
677
678 // Eagerly stop after the first pattern fires.
679 Changed = true;
680 break;
681 }
682 // Cleanup instructions of the alternative code sequence. There is no
683 // use for them.
684 MachineFunction *MF = MBB->getParent();
685 for (auto *InstrPtr : InsInstrs)
686 MF->deleteMachineInstr(InstrPtr);
687 }
688 InstrIdxForVirtReg.clear();
689 }
690 }
691
692 if (Changed && IncrementalUpdate)
693 Traces->invalidate(MBB);
694 return Changed;
695}
696
697bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
698 STI = &MF.getSubtarget();
699 TII = STI->getInstrInfo();
700 TRI = STI->getRegisterInfo();
701 SchedModel = STI->getSchedModel();
702 TSchedModel.init(STI);
703 MRI = &MF.getRegInfo();
704 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
705 Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
706 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
707 MBFI = (PSI && PSI->hasProfileSummary()) ?
708 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
709 nullptr;
710 TraceEnsemble = nullptr;
711 RegClassInfo.runOnMachineFunction(MF);
712
713 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
714 if (!TII->useMachineCombiner()) {
716 dbgs()
717 << " Skipping pass: Target does not support machine combiner\n");
718 return false;
719 }
720
721 bool Changed = false;
722
723 // Try to combine instructions.
724 for (auto &MBB : MF)
725 Changed |= combineInstructions(&MBB);
726
727 return Changed;
728}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*–===//
#define I(x, y, z)
Definition MD5.cpp:57
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII, unsigned Pattern, bool IncrementalUpdate)
Inserts InsInstrs and deletes DelInstrs.
static cl::opt< bool > VerifyPatternOrder("machine-combiner-verify-pattern-order", cl::Hidden, cl::desc("Verify that the generated patterns are ordered by increasing latency"), cl::init(false))
static cl::opt< unsigned > inc_threshold("machine-combiner-inc-threshold", cl::Hidden, cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500))
static cl::opt< bool > dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden, cl::desc("Dump all substituted intrs"), cl::init(false))
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Represent the analysis usage information of a pass.
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
bool useMachineCombiner() const override
This is an alternative analysis pass to MachineBlockFrequencyInfo.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
Analysis pass which computes a MachineDominatorTree.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
LLVM_ABI int findRegisterUseOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isKill=false) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static reg_iterator reg_end()
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
reg_iterator reg_begin(Register RegNo) const
defusechain_iterator< true, true, false, true, false > reg_iterator
reg_iterator/reg_begin/reg_end - Walk all defs and uses of the specified register.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A trace ensemble is a collection of traces selected using the same strategy, for example 'minimum res...
void invalidate(const MachineBasicBlock *MBB)
Invalidate traces through BadMBB.
void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, LiveRegUnitSet &RegUnits)
Updates the depth of an machine instruction, given RegUnits.
void updateDepths(MachineBasicBlock::iterator Start, MachineBasicBlock::iterator End, LiveRegUnitSet &RegUnits)
Updates the depth of the instructions from Start to End.
Trace getTrace(const MachineBasicBlock *MBB)
Get the trace that passes through MBB.
unsigned getResourceLength(ArrayRef< const MachineBasicBlock * > Extrablocks={}, ArrayRef< const MCSchedClassDesc * > ExtraInstrs={}, ArrayRef< const MCSchedClassDesc * > RemoveInstrs={}) const
Return the resource length of the trace.
InstrCycles getInstrCycles(const MachineInstr &MI) const
Return the depth and height of MI.
unsigned getInstrSlack(const MachineInstr &MI) const
Return the slack of MI.
bool isDepInTrace(const MachineInstr &DefMI, const MachineInstr &UseMI) const
A dependence is useful if the basic block of the defining instruction is part of the trace of the use...
Ensemble * getEnsemble(MachineTraceStrategy)
Get the trace ensemble representing the given trace selection strategy.
void invalidate(const MachineBasicBlock *MBB)
Invalidate cached information about MBB.
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF, bool Rev=false)
runOnFunction - Prepare to answer questions about MF.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
iterator erase(iterator I)
erase - Erases an existing element identified by a valid iterator.
Definition SparseSet.h:287
const_iterator begin() const
Definition SparseSet.h:170
const_iterator end() const
Definition SparseSet.h:171
void setUniverse(unsigned U)
setUniverse - Set the universe size which determines the largest key the set can hold.
Definition SparseSet.h:152
TargetInstrInfo - Interface to description of machine instruction set.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
bool hasInstrSchedModelOrItineraries() const
Return true if this machine model includes an instruction-level scheduling model or cycle-to-cycle it...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Changed
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
LLVM_ABI char & MachineCombinerID
This pass performs instruction combining using trace metrics to estimate critical-path and resource d...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
SparseSet< LiveRegUnit, MCRegUnit, MCRegUnitToIndex > LiveRegUnitSet
ArrayRef(const T &OneElt) -> ArrayRef< T >
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
unsigned Depth
Earliest issue cycle as determined by data dependencies and instruction latencies from the beginning ...