37#define DEBUG_TYPE "machine-combiner"
39STATISTIC(NumInstCombined,
"Number of machineinst combined");
43 cl::desc(
"Incremental depth computation will be used for basic "
44 "blocks with more instructions."),
cl::init(500));
47 cl::desc(
"Dump all substituted intrs"),
50#ifdef EXPENSIVE_CHECKS
52 "machine-combiner-verify-pattern-order",
cl::Hidden,
54 "Verify that the generated patterns are ordered by increasing latency"),
58 "machine-combiner-verify-pattern-order",
cl::Hidden,
60 "Verify that the generated patterns are ordered by increasing latency"),
119 std::pair<unsigned, unsigned>
130char MachineCombiner::ID = 0;
134 "Machine InstCombiner",
false,
false)
140void MachineCombiner::getAnalysisUsage(
AnalysisUsage &AU)
const {
141 AU.setPreservesCFG();
157 DefInstr =
MRI->getUniqueVRegDef(MO.
getReg());
159 if (DefInstr && DefInstr->
isPHI())
167 return MI->isTransient();
173 if (!
MI->isFullCopy()) {
175 if (
MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
178 auto SrcSub =
MI->getOperand(1).getSubReg();
179 auto SrcRC =
MRI->getRegClass(Src);
180 auto DstRC =
MRI->getRegClass(Dst);
181 return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) !=
nullptr;
184 if (Src.isPhysical() && Dst.isPhysical())
187 if (Src.isVirtual() && Dst.isVirtual()) {
188 auto SrcRC =
MRI->getRegClass(Src);
189 auto DstRC =
MRI->getRegClass(Dst);
190 return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
197 auto DstRC =
MRI->getRegClass(Dst);
198 return DstRC->contains(Src);
218 for (
auto *InstrPtr : InsInstrs) {
224 unsigned DepthOp = 0;
225 unsigned LatencyOp = 0;
228 if (II != InstrIdxForVirtReg.
end()) {
230 assert(II->second < InstrDepth.
size() &&
"Bad Index");
233 "There must be a definition for a new virtual register");
234 DepthOp = InstrDepth[II->second];
236 int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.
getReg());
237 LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
241 if (DefInstr && (
TII->getMachineCombinerTraceStrategy() !=
242 MachineTraceStrategy::TS_Local ||
245 if (!isTransientMI(DefInstr))
246 LatencyOp = TSchedModel.computeOperandLatency(
248 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.
getReg()));
251 IDepth = std::max(IDepth, DepthOp + LatencyOp);
255 unsigned NewRootIdx = InsInstrs.size() - 1;
256 return InstrDepth[NewRootIdx];
271 unsigned NewRootLatency = 0;
280 if (RI ==
MRI->reg_end())
283 unsigned LatencyOp = 0;
285 LatencyOp = TSchedModel.computeOperandLatency(
289 LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
291 NewRootLatency = std::max(NewRootLatency, LatencyOp);
293 return NewRootLatency;
308 case MachineCombinerPattern::REASSOC_AX_BY:
309 case MachineCombinerPattern::REASSOC_AX_YB:
310 case MachineCombinerPattern::REASSOC_XA_BY:
311 case MachineCombinerPattern::REASSOC_XA_YB:
312 case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
313 case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
314 case MachineCombinerPattern::SUBADD_OP1:
315 case MachineCombinerPattern::SUBADD_OP2:
316 case MachineCombinerPattern::FMADD_AX:
317 case MachineCombinerPattern::FMADD_XA:
318 case MachineCombinerPattern::FMSUB:
319 case MachineCombinerPattern::FNMSUB:
321 case MachineCombinerPattern::REASSOC_XY_BCA:
322 case MachineCombinerPattern::REASSOC_XY_BAC:
333std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
337 assert(!InsInstrs.
empty() &&
"Only support sequences that insert instrs.");
338 unsigned NewRootLatency = 0;
341 for (
unsigned i = 0; i < InsInstrs.
size() - 1; i++)
342 NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
345 unsigned RootLatency = 0;
346 for (
auto *
I : DelInstrs)
347 RootLatency += TSchedModel.computeInstrLatency(
I);
349 return {NewRootLatency, RootLatency};
352bool MachineCombiner::reduceRegisterPressure(
369bool MachineCombiner::improvesCriticalPathLen(
376 bool SlackIsAccurate) {
378 unsigned NewRootDepth =
379 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *
MBB);
382 LLVM_DEBUG(
dbgs() <<
" Dependence data for " << *Root <<
"\tNewRootDepth: "
383 << NewRootDepth <<
"\tRootDepth: " << RootDepth);
393 ?
dbgs() <<
"\t and it does it\n"
394 :
dbgs() <<
"\t but it does NOT do it\n");
395 return NewRootDepth < RootDepth;
403 unsigned NewRootLatency, RootLatency;
404 if (
TII->accumulateInstrSeqToRootLatency(*Root)) {
405 std::tie(NewRootLatency, RootLatency) =
406 getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
408 NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.
back());
409 RootLatency = TSchedModel.computeInstrLatency(Root);
413 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
414 unsigned OldCycleCount =
415 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
417 <<
"\tRootLatency: " << RootLatency <<
"\n\tRootSlack: "
418 << RootSlack <<
" SlackIsAccurate=" << SlackIsAccurate
419 <<
"\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
420 <<
"\n\tRootDepth + RootLatency + RootSlack = "
423 ?
dbgs() <<
"\n\t It IMPROVES PathLen because"
424 :
dbgs() <<
"\n\t It DOES NOT improve PathLen because");
426 <<
", OldCycleCount = " << OldCycleCount <<
"\n");
428 return NewCycleCount <= OldCycleCount;
432void MachineCombiner::instr2instrSC(
435 for (
auto *InstrPtr : Instrs) {
436 unsigned Opc = InstrPtr->getOpcode();
437 unsigned Idx =
TII->get(Opc).getSchedClass();
444bool MachineCombiner::preservesResourceLen(
448 if (!TSchedModel.hasInstrSchedModel())
454 SmallVector <const MachineBasicBlock *, 1> MBBarr;
462 instr2instrSC(InsInstrs, InsInstrsSC);
463 instr2instrSC(DelInstrs, DelInstrsSC);
469 unsigned ResLenAfterCombine =
473 << ResLenBeforeCombine
474 <<
" and after: " << ResLenAfterCombine <<
"\n";);
476 ResLenAfterCombine <=
477 ResLenBeforeCombine +
TII->getExtendResourceLenLimit()
478 ?
dbgs() <<
"\t\t As result it IMPROVES/PRESERVES Resource Length\n"
479 :
dbgs() <<
"\t\t As result it DOES NOT improve/preserve Resource "
482 return ResLenAfterCombine <=
483 ResLenBeforeCombine +
TII->getExtendResourceLenLimit();
515 for (
auto *InstrPtr : InsInstrs)
518 for (
auto *InstrPtr : DelInstrs) {
519 InstrPtr->eraseFromParent();
521 for (
auto *
I = RegUnits.
begin();
I != RegUnits.
end();) {
522 if (
I->MI == InstrPtr)
529 if (IncrementalUpdate)
530 for (
auto *InstrPtr : InsInstrs)
540void MachineCombiner::verifyPatternOrder(
543 long PrevLatencyDiff = std::numeric_limits<long>::max();
544 (void)PrevLatencyDiff;
545 for (
auto P : Patterns) {
549 TII->genAlternativeCodeSequence(Root,
P, InsInstrs, DelInstrs,
554 if (InsInstrs.
empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
557 unsigned NewRootLatency, RootLatency;
558 std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
559 Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(
MBB));
560 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
561 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
562 "Current pattern is better than previous pattern.");
563 PrevLatencyDiff = CurrentLatencyDiff;
575 bool Changed =
false;
578 bool IncrementalUpdate =
false;
580 decltype(BlockIter) LastUpdate;
584 TraceEnsemble = Traces->getEnsemble(
TII->getMachineCombinerTraceStrategy());
591 bool DoRegPressureReduce =
592 TII->shouldReduceRegisterPressure(
MBB, &RegClassInfo);
594 while (BlockIter !=
MBB->
end()) {
595 auto &
MI = *BlockIter++;
624 if (!
TII->getMachineCombinerPatterns(
MI, Patterns, DoRegPressureReduce))
628 verifyPatternOrder(
MBB,
MI, Patterns);
630 for (
const auto P : Patterns) {
634 TII->genAlternativeCodeSequence(
MI,
P, InsInstrs, DelInstrs,
639 if (InsInstrs.
empty())
643 dbgs() <<
"\tFor the Pattern (" << (int)
P
644 <<
") these instructions could be removed\n";
645 for (
auto const *InstrPtr : DelInstrs)
646 InstrPtr->print(
dbgs(),
false,
false,
648 dbgs() <<
"\tThese instructions could replace the removed ones\n";
649 for (
auto const *InstrPtr : InsInstrs)
650 InstrPtr->print(
dbgs(),
false,
false,
654 if (IncrementalUpdate && LastUpdate != BlockIter) {
656 TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
657 LastUpdate = BlockIter;
660 if (DoRegPressureReduce &&
665 IncrementalUpdate =
true;
666 LastUpdate = BlockIter;
668 if (reduceRegisterPressure(
MI,
MBB, InsInstrs, DelInstrs,
P)) {
671 RegUnits,
TII,
P, IncrementalUpdate);
681 if (
ML &&
TII->isThroughputPattern(
P)) {
682 LLVM_DEBUG(
dbgs() <<
"\t Replacing due to throughput pattern in loop\n");
684 RegUnits,
TII,
P, IncrementalUpdate);
688 }
else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
690 << InsInstrs.size() <<
" < "
691 << DelInstrs.size() <<
")\n");
693 RegUnits,
TII,
P, IncrementalUpdate);
704 Traces->verifyAnalysis();
705 if (improvesCriticalPathLen(
MBB, &
MI, BlockTrace, InsInstrs, DelInstrs,
706 InstrIdxForVirtReg,
P,
707 !IncrementalUpdate) &&
708 preservesResourceLen(
MBB, BlockTrace, InsInstrs, DelInstrs)) {
711 IncrementalUpdate =
true;
712 LastUpdate = BlockIter;
716 RegUnits,
TII,
P, IncrementalUpdate);
725 for (
auto *InstrPtr : InsInstrs)
728 InstrIdxForVirtReg.
clear();
732 if (Changed && IncrementalUpdate)
733 Traces->invalidate(
MBB);
739 TII = STI->getInstrInfo();
740 TRI = STI->getRegisterInfo();
741 SchedModel = STI->getSchedModel();
742 TSchedModel.init(STI);
744 MLI = &getAnalysis<MachineLoopInfo>();
745 Traces = &getAnalysis<MachineTraceMetrics>();
746 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
747 MBFI = (PSI && PSI->hasProfileSummary()) ?
748 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
750 TraceEnsemble =
nullptr;
752 RegClassInfo.runOnMachineFunction(MF);
755 if (!
TII->useMachineCombiner()) {
758 <<
" Skipping pass: Target does not support machine combiner\n");
762 bool Changed =
false;
766 Changed |= combineInstructions(&
MBB);
unsigned const MachineRegisterInfo * MRI
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst)
Gets latency information for Inst, based on DC information.
const HexagonInstrInfo * TII
===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*–===//
static CombinerObjective getCombinerObjective(MachineCombinerPattern P)
static cl::opt< bool > VerifyPatternOrder("machine-combiner-verify-pattern-order", cl::Hidden, cl::desc("Verify that the generated patterns are ordered by increasing latency"), cl::init(false))
static cl::opt< unsigned > inc_threshold("machine-combiner-inc-threshold", cl::Hidden, cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500))
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, SparseSet< LiveRegUnit > &RegUnits, const TargetInstrInfo *TII, MachineCombinerPattern Pattern, bool IncrementalUpdate)
Inserts InsInstrs and deletes DelInstrs.
static cl::opt< bool > dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden, cl::desc("Dump all substituted intrs"), cl::init(false))
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
@ MustReduceRegisterPressure
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Represent the analysis usage information of a pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
iterator find(const_arg_type_t< KeyT > Val)
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
The core instruction combiner logic.
This is an alternative analysis pass to MachineBlockFrequencyInfo.
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Representation of each machine instruction.
int findRegisterUseOperandIdx(Register Reg, bool isKill=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
const MachineBasicBlock * getParent() const
int findRegisterDefOperandIdx(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A trace ensemble is a collection of traces selected using the same strategy, for example 'minimum res...
void invalidate(const MachineBasicBlock *MBB)
Invalidate traces through BadMBB.
void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, SparseSet< LiveRegUnit > &RegUnits)
Updates the depth of an machine instruction, given RegUnits.
A trace represents a plausible sequence of executed basic blocks that passes through the current basi...
InstrCycles getInstrCycles(const MachineInstr &MI) const
Return the depth and height of MI.
unsigned getInstrSlack(const MachineInstr &MI) const
Return the slack of MI.
bool isDepInTrace(const MachineInstr &DefMI, const MachineInstr &UseMI) const
A dependence is useful if the basic block of the defining instruction is part of the trace of the use...
unsigned getResourceLength(ArrayRef< const MachineBasicBlock * > Extrablocks=std::nullopt, ArrayRef< const MCSchedClassDesc * > ExtraInstrs=std::nullopt, ArrayRef< const MCSchedClassDesc * > RemoveInstrs=std::nullopt) const
Return the resource length of the trace.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
SparseSet - Fast set implementation for objects that can be identified by small unsigned keys.
iterator erase(iterator I)
erase - Erases an existing element identified by a valid iterator.
const_iterator begin() const
const_iterator end() const
void setUniverse(unsigned U)
setUniverse - Set the universe size which determines the largest key the set can hold.
StringRef - Represent a constant reference to a string, i.e.
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
char & MachineCombinerID
This pass performs instruction combining using trace metrics to estimate critical-path and resource d...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
MachineCombinerPattern
These are instruction patterns matched by the machine combiner pass.
void initializeMachineCombinerPass(PassRegistry &)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Machine model for scheduling, bundling, and heuristics.
unsigned Depth
Earliest issue cycle as determined by data dependencies and instruction latencies from the beginning ...