Go to the documentation of this file.
17 #include "llvm/Config/llvm-config.h"
40 #define DEBUG_TYPE "block-freq"
44 "check-bfi-unknown-block-queries",
46 cl::desc(
"Check if block frequency is queried for an unknown block "
47 "for debugging missed BFI updates"));
51 cl::desc(
"Apply an iterative post-processing to infer correct BFI counts"));
55 cl::desc(
"Iterative inference: maximum number of update iterations "
60 cl::desc(
"Iterative inference: delta convergence precision; smaller values "
61 "typically lead to better results at the cost of worsen runtime"));
70 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
82 for (
int Digits = 0; Digits < 16; ++Digits)
115 struct DitheringDistributer {
119 DitheringDistributer(Distribution &Dist,
const BlockMass &Mass);
126 DitheringDistributer::DitheringDistributer(Distribution &Dist,
129 RemWeight = Dist.Total;
134 assert(Weight &&
"invalid weight");
135 assert(Weight <= RemWeight);
145 Weight::DistType
Type) {
146 assert(Amount &&
"invalid weight of 0");
150 bool IsOverflow = NewTotal <
Total;
151 assert(!(DidOverflow && IsOverflow) &&
"unexpected repeated overflow");
152 DidOverflow |= IsOverflow;
158 Weights.push_back(Weight(
Type, Node, Amount));
162 assert(OtherW.TargetNode.isValid());
168 assert(
W.TargetNode == OtherW.TargetNode);
169 assert(OtherW.Amount &&
"Expected non-zero weight");
170 if (
W.Amount >
W.Amount + OtherW.Amount)
174 W.Amount += OtherW.Amount;
179 llvm::sort(Weights, [](
const Weight &L,
const Weight &R) {
180 return L.TargetNode < R.TargetNode;
184 WeightList::iterator
O = Weights.begin();
185 for (WeightList::const_iterator
I =
O, L =
O,
E = Weights.end();
I !=
E;
190 for (++L; L !=
E &&
I->TargetNode == L->TargetNode; ++L)
195 Weights.erase(
O, Weights.end());
203 for (
const Weight &
W : Weights)
207 if (Weights.size() == Combined.size())
212 Weights.reserve(Combined.size());
213 for (
const auto &
I : Combined)
214 Weights.push_back(
I.second);
219 if (Weights.size() > 128) {
235 void Distribution::normalize() {
241 if (Weights.size() > 1)
245 if (Weights.size() == 1) {
247 Weights.front().Amount = 1;
258 else if (
Total > UINT32_MAX)
265 assert(
Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0),
267 return Sum +
W.Amount;
269 "Expected total to be correct");
281 assert(
W.TargetNode.isValid());
283 assert(
W.Amount <= UINT32_MAX);
294 std::vector<FrequencyData>().swap(Freqs);
295 IsIrrLoopHeader.clear();
296 std::vector<WorkingData>().swap(Working);
304 std::vector<FrequencyData> SavedFreqs(
std::move(
BFI.Freqs));
319 auto isLoopHeader = [&OuterLoop](
const BlockNode &Node) {
320 return OuterLoop && OuterLoop->
isHeader(Node);
326 auto debugSuccessor = [&](
const char *
Type) {
329 if (!isLoopHeader(Resolved))
331 if (Resolved != Succ)
335 (void)debugSuccessor;
338 if (isLoopHeader(Resolved)) {
344 if (Working[Resolved.Index].getContainingLoop() != OuterLoop) {
350 if (Resolved < Pred) {
351 if (!isLoopHeader(Pred)) {
354 "unhandled irreducible control flow");
365 "unhandled irreducible control flow");
376 for (
const auto &
I :
Loop.Exits)
399 const Scaled64 InfiniteLoopScale(1, 12);
404 for (
auto &Mass :
Loop.BackedgeMass)
405 TotalBackedgeMass += Mass;
417 <<
" - scale = " <<
Loop.Scale <<
"\n");
426 if (
auto *
Loop = Working[
M.Index].getPackagedLoop())
430 Loop.IsPackaged =
true;
435 const DitheringDistributer &
D,
const BlockNode &
T,
437 dbgs() <<
" => assign " <<
M <<
" (" <<
D.RemMass <<
")";
439 dbgs() <<
" [" << Desc <<
"]";
441 dbgs() <<
" to " <<
BFI.getBlockName(
T);
453 DitheringDistributer
D(Dist, Mass);
458 if (
W.Type == Weight::Local) {
459 Working[
W.TargetNode.Index].getMass() += Taken;
465 assert(OuterLoop &&
"backedge or exit outside of loop");
468 if (
W.Type == Weight::Backedge) {
475 assert(
W.Type == Weight::Exit);
476 OuterLoop->
Exits.push_back(std::make_pair(
W.TargetNode, Taken));
489 const unsigned MaxBits = 64;
490 const unsigned SpreadBits = (Max / Min).lg();
492 if (SpreadBits <= MaxBits - 3) {
501 ScalingFactor =
Scaled64(1, MaxBits) / Max;
505 LLVM_DEBUG(
dbgs() <<
"float-to-int: min = " << Min <<
", max = " << Max
506 <<
", factor = " << ScalingFactor <<
"\n");
512 <<
", int = " <<
BFI.Freqs[
Index].Integer <<
"\n");
522 <<
": mass = " <<
Loop.Mass <<
", scale = " <<
Loop.Scale
525 Loop.IsPackaged =
false;
531 for (
const BlockNode &
N :
Loop.Nodes) {
532 const auto &Working =
BFI.Working[
N.Index];
533 Scaled64 &
F = Working.isAPackage() ? Working.getPackagedLoop()->Scale
534 :
BFI.Freqs[
N.Index].Scaled;
554 auto Min = Scaled64::getLargest();
555 auto Max = Scaled64::getZero();
574 if (!Node.isValid()) {
579 OS <<
"*** Detected BFI query for unknown block " <<
getBlockName(Node);
585 return Freqs[Node.Index].Integer;
588 std::optional<uint64_t>
591 bool AllowSynthetic)
const {
592 return getProfileCountFromFreq(
F, getBlockFreq(Node).getFrequency(),
596 std::optional<uint64_t>
599 bool AllowSynthetic)
const {
600 auto EntryCount =
F.getEntryCount(AllowSynthetic);
604 APInt BlockCount(128, EntryCount->getCount());
605 APInt BlockFreq(128, Freq);
606 APInt EntryFreq(128, getEntryFreq());
607 BlockCount *= BlockFreq;
610 BlockCount = (BlockCount + EntryFreq.
lshr(1)).udiv(EntryFreq);
618 return IsIrrLoopHeader.test(Node.Index);
624 return Scaled64::getZero();
625 return Freqs[Node.Index].Scaled;
630 assert(Node.isValid() &&
"Expected valid node");
631 assert(Node.Index < Freqs.size() &&
"Expected legal index");
632 Freqs[Node.Index].Integer = Freq;
648 return OS << getFloatingBlockFreq(Node);
657 return OS << Block / Entry;
663 for (
auto N : OuterLoop.
Nodes)
683 if (OuterLoop && OuterLoop->
isHeader(Succ))
689 Irr.
Edges.push_back(&SuccIrr);
690 SuccIrr.
Edges.push_front(&Irr);
715 const std::vector<const IrreducibleGraph::IrrNode *> &
SCC,
721 for (
const auto *
I :
SCC)
725 auto &Irr = *
I->first;
726 for (
const auto *
P :
make_range(Irr.pred_begin(), Irr.pred_end())) {
732 Headers.push_back(Irr.Node);
738 assert(Headers.size() >= 2 &&
739 "Expected irreducible CFG; -loop-info is likely invalid");
740 if (Headers.size() == InSCC.
size()) {
747 for (
const auto &
I : InSCC) {
752 auto &Irr = *
I.first;
753 for (
const auto *
P :
make_range(Irr.pred_begin(), Irr.pred_end())) {
755 if (
P->Node < Irr.Node)
764 Headers.push_back(Irr.Node);
769 if (Headers.back() == Irr.Node)
774 Others.push_back(Irr.Node);
783 LoopData *OuterLoop, std::list<LoopData>::iterator Insert,
784 const std::vector<const IrreducibleGraph::IrrNode *> &
SCC) {
792 auto Loop =
BFI.Loops.emplace(
Insert, OuterLoop, Headers.begin(),
793 Headers.end(), Others.begin(), Others.end());
796 for (
const auto &
N :
Loop->Nodes)
797 if (
BFI.Working[
N.Index].isLoopHeader())
798 BFI.Working[
N.Index].Loop->Parent = &*
Loop;
806 std::list<LoopData>::iterator Insert) {
808 auto Prev = OuterLoop ? std::prev(
Insert) :
Loops.end();
828 auto O = OuterLoop.
Nodes.begin() + 1;
829 for (
auto I =
O,
E = OuterLoop.
Nodes.end();
I !=
E; ++
I)
830 if (!Working[
I->Index].isPackaged())
836 assert(
Loop.isIrreducible() &&
"this only makes sense on irreducible loops");
849 auto &HeaderNode =
Loop.Nodes[
H];
850 auto &BackedgeMass =
Loop.BackedgeMass[
Loop.getHeaderIndex(HeaderNode)];
854 if (BackedgeMass.getMass() > 0)
855 Dist.
addLocal(HeaderNode, BackedgeMass.getMass());
860 DitheringDistributer
D(Dist, LoopMass);
863 <<
" to headers using above weights\n");
866 assert(
W.Type == Weight::Local &&
"all weights should be local");
867 Working[
W.TargetNode.Index].getMass() = Taken;
874 DitheringDistributer
D(Dist, LoopMass);
877 assert(
W.Type == Weight::Local &&
"all weights should be local");
878 Working[
W.TargetNode.Index].getMass() = Taken;
std::string getLoopName(const LoopData &Loop) const
static void combineWeightsByHashing(WeightList &Weights)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This is an optimization pass for GlobalISel generic memory operations.
iterator erase(const_iterator CI)
void distributeIrrLoopHeaderMass(Distribution &Dist)
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
void computeLoopScale(LoopData &Loop)
Compute the loop scale for a loop.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Vector Rotate Left Mask Mask Insert
static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop)
Unwrap a loop package.
void addNodesInLoop(const BFIBase::LoopData &OuterLoop)
Graph of irreducible control flow.
Represents a single loop in the control flow graph.
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
static void combineWeights(WeightList &Weights)
static NodeRef getEntryNode(const GraphT &G)
Stats about a block itself.
static ChildIteratorType child_end(NodeRef N)
ScaledNumber inverse() const
static void cleanup(BlockFrequencyInfoImplBase &BFI)
Clear all memory not needed downstream.
llvm::DenseMapBase< SmallDenseMap< KeyT, ValueT, 4, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::begin iterator begin()
The instances of the Type class are immutable: once they are created, they are never changed.
void addNode(const BlockNode &Node)
SmallVector< NodeAddr< NodeBase * >, 4 > NodeList
llvm::DenseMapBase< SmallDenseMap< KeyT, ValueT, 4, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
void addLocal(const BlockNode &Node, uint64_t Amount)
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
void clear()
Clear all memory.
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
BlockNode getHeader() const
std::optional< uint64_t > getProfileCountFromFreq(const Function &F, uint64_t Freq, bool AllowSynthetic=false) const
void packageLoop(LoopData &Loop)
Package up a loop.
GraphT::IrrNode::iterator ChildIteratorType
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void updateLoopWithIrreducible(LoopData &OuterLoop)
Update a loop after packaging irreducible SCCs inside of it.
void finalizeMetrics()
Finalize frequency metrics.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
void addEdge(IrrNode &Irr, const BlockNode &Succ, const BFIBase::LoopData *OuterLoop)
ScaledNumber< uint64_t > toScaled() const
Convert to scaled number.
BlockFrequency getBlockFreq(const BlockNode &Node) const
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
bool isHeader(const BlockNode &Node) const
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void distributeMass(const BlockNode &Source, LoopData *OuterLoop, Distribution &Dist)
Distribute mass according to a distribution.
static BlockMass getFull()
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
This class implements an extremely fast bulk output stream that can only output to a stream.
static void createIrreducibleLoop(BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, LoopData *OuterLoop, std::list< LoopData >::iterator Insert, const std::vector< const IrreducibleGraph::IrrNode * > &SCC)
static void combineWeightsBySorting(WeightList &Weights)
Distribution of unscaled probability weight.
scc_iterator< T > scc_begin(const T &G)
Construct the begin iterator for a deduced graph type T.
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & printBlockFreq(raw_ostream &OS, const BlockNode &Node) const
bool isIrreducible() const
SmallVector< Weight, 4 > WeightList
void addBackedge(const BlockNode &Node, uint64_t Amount)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static void combineWeight(Weight &W, const Weight &OtherW)
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional move
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
initializer< Ty > init(const Ty &Val)
llvm::cl::opt< double > IterativeBFIPrecision
std::vector< WorkingData > Working
Loop data: see initializeLoops().
static ChildIteratorType child_begin(NodeRef N)
static void debugAssign(const BlockFrequencyInfoImplBase &BFI, const DitheringDistributer &D, const BlockNode &T, const BlockMass &M, const char *Desc)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist)
Add all edges out of a packaged loop to the distribution.
void addNodesInFunction()
void unwrapLoops()
Unwrap loops.
static BlockMass getEmpty()
Class for arbitrary precision integers.
llvm::cl::opt< bool > CheckBFIUnknownBlockQueries
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
static void findIrreducibleHeaders(const BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, const std::vector< const IrreducibleGraph::IrrNode * > &SCC, LoopData::NodeList &Headers, LoopData::NodeList &Others)
Find extra irreducible headers.
StringRef str() const
Return a StringRef for the vector contents.
std::optional< uint64_t > getBlockProfileCount(const Function &F, const BlockNode &Node, bool AllowSynthetic=false) const
Base class for BlockFrequencyInfoImpl.
ExitMap Exits
Successor edges (and weights).
const CustomOperand< const MCSubtargetInfo & > Msg[]
ScaledNumber< uint64_t > Scaled64
llvm::DenseMapBase< SmallDenseMap< KeyT, ValueT, 4, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::end iterator end()
void adjustLoopHeaderMass(LoopData &Loop)
Adjust the mass of all headers in an irreducible loop.
Scaled64 getFloatingBlockFreq(const BlockNode &Node) const
WeightList Weights
Individual successor weights.
BlockT * getHeader() const
llvm::DenseMapBase< SmallDenseMap< KeyT, ValueT, 4, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::size unsigned size() const
void setBlockFreq(const BlockNode &Node, uint64_t Freq)
ScaledNumber< uint64_t > Scaled64
Unscaled probability weight.
NodeList Nodes
Header and the members of the loop.
static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, const Scaled64 &Min, const Scaled64 &Max)
raw_ostream & print(raw_ostream &OS) const
static uint64_t shiftRightAndRound(uint64_t N, int Shift)
HeaderMassList BackedgeMass
Mass returned to each loop header.
unsigned countLeadingZeros(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
llvm::cl::opt< unsigned > IterativeBFIMaxIterationsPerBlock
std::string getBlockName(const BlockT *BB)
Get the name of a MachineBasicBlock.
A range adaptor for a pair of iterators.
llvm::cl::opt< bool > UseIterativeBFIInference
A raw_ostream that writes to an SmallVector or SmallString.
virtual std::string getBlockName(const BlockNode &Node) const
HeaderMassList::difference_type getHeaderIndex(const BlockNode &B)
bool isIrrLoopHeader(const BlockNode &Node)
std::vector< IrrNode > Nodes
SmallDenseMap< uint32_t, IrrNode *, 4 > Lookup
void addExit(const BlockNode &Node, uint64_t Amount)
Representative of a block.
std::deque< const IrrNode * > Edges
bool addToDist(Distribution &Dist, const LoopData *OuterLoop, const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight)
Add an edge to the distribution.
iterator_range< std::list< LoopData >::iterator > analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop, std::list< LoopData >::iterator Insert)
Analyze irreducible SCCs.
static char getHexDigit(int N)