21#define DEBUG_TYPE "balanced-partitioning"
24 OS <<
formatv(
"{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}",
Id,
28template <
typename Func>
29void BalancedPartitioning::BPThreadPool::async(Func &&
F) {
30#if LLVM_ENABLE_THREADS
33 TheThreadPool.async([=]() {
38 if (--NumActiveThreads == 0) {
41 std::unique_lock<std::mutex> lock(mtx);
42 assert(!IsFinishedSpawning);
43 IsFinishedSpawning =
true;
53void BalancedPartitioning::BPThreadPool::wait() {
54#if LLVM_ENABLE_THREADS
58 std::unique_lock<std::mutex> lock(mtx);
59 cv.wait(lock, [&]() {
return IsFinishedSpawning; });
60 assert(IsFinishedSpawning && NumActiveThreads == 0);
74 for (
unsigned I = 1;
I < LOG_CACHE_SIZE;
I++)
75 Log2Cache[
I] = std::log2(
I);
81 "Partitioning %d nodes using depth %d and %d iterations per split\n",
83 std::optional<BPThreadPool> TP;
84#if LLVM_ENABLE_THREADS
87 TP.emplace(TheThreadPool);
91 for (
unsigned I = 0;
I < Nodes.size();
I++)
92 Nodes[
I].InputOrderIndex =
I;
95 auto BisectTask = [=, &TP]() {
96 bisect(NodesRange, 0, 1, 0, TP);
99 TP->async(std::move(BisectTask));
106 return L.Bucket < R.Bucket;
112void BalancedPartitioning::bisect(
const FunctionNodeRange Nodes,
113 unsigned RecDepth,
unsigned RootBucket,
115 std::optional<BPThreadPool> &TP)
const {
116 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
117 if (NumNodes <= 1 || RecDepth >= Config.
SplitDepth) {
121 return L.InputOrderIndex < R.InputOrderIndex;
123 for (
auto &
N : Nodes)
129 NumNodes, RootBucket));
131 std::mt19937 RNG(RootBucket);
133 unsigned LeftBucket = 2 * RootBucket;
134 unsigned RightBucket = 2 * RootBucket + 1;
137 split(Nodes, LeftBucket);
139 runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);
144 unsigned MidOffset =
Offset + std::distance(Nodes.begin(), NodesMid);
149 auto LeftRecTask = [=, &TP]() {
150 bisect(LeftNodes, RecDepth + 1, LeftBucket,
Offset, TP);
152 auto RightRecTask = [=, &TP]() {
153 bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
156 if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
157 TP->async(std::move(LeftRecTask));
158 TP->async(std::move(RightRecTask));
165void BalancedPartitioning::runIterations(
const FunctionNodeRange Nodes,
166 unsigned RecDepth,
unsigned LeftBucket,
167 unsigned RightBucket,
168 std::mt19937 &RNG)
const {
169 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
171 for (
auto &
N : Nodes)
172 for (
auto &UN :
N.UtilityNodes)
173 ++UtilityNodeDegree[UN];
176 for (
auto &
N : Nodes)
178 return UtilityNodeDegree[UN] <= 1 || UtilityNodeDegree[UN] >= NumNodes;
183 for (
auto &
N : Nodes)
184 for (
auto &UN :
N.UtilityNodes)
185 if (!UtilityNodeIndex.
count(UN))
186 UtilityNodeIndex[UN] = UtilityNodeIndex.
size();
187 for (
auto &
N : Nodes)
188 for (
auto &UN :
N.UtilityNodes)
189 UN = UtilityNodeIndex[UN];
193 for (
auto &
N : Nodes) {
194 for (
auto &UN :
N.UtilityNodes) {
196 if (
N.Bucket == LeftBucket) {
205 unsigned NumMovedNodes =
206 runIteration(Nodes, LeftBucket, RightBucket,
Signatures, RNG);
207 if (NumMovedNodes == 0)
212unsigned BalancedPartitioning::runIteration(
const FunctionNodeRange Nodes,
214 unsigned RightBucket,
216 std::mt19937 &RNG)
const {
219 if (Signature.CachedGainIsValid)
221 unsigned L = Signature.LeftCount;
222 unsigned R = Signature.RightCount;
223 assert((L > 0 || R > 0) &&
"incorrect signature");
224 float Cost = logCost(L, R);
225 Signature.CachedGainLR = 0.f;
226 Signature.CachedGainRL = 0.f;
228 Signature.CachedGainLR =
Cost - logCost(L - 1, R + 1);
230 Signature.CachedGainRL =
Cost - logCost(L + 1, R - 1);
231 Signature.CachedGainIsValid =
true;
235 typedef std::pair<float, BPFunctionNode *> GainPair;
236 std::vector<GainPair> Gains;
237 for (
auto &
N : Nodes) {
238 bool FromLeftToRight = (
N.Bucket == LeftBucket);
240 Gains.push_back(std::make_pair(Gain, &
N));
245 Gains, [&](
const auto &GP) {
return GP.second->Bucket == LeftBucket; });
250 auto LargerGain = [](
const auto &
L,
const auto &
R) {
251 return L.first >
R.first;
256 unsigned NumMovedDataVertices = 0;
257 for (
auto [LeftPair, RightPair] :
llvm::zip(LeftRange, RightRange)) {
258 auto &[LeftGain, LeftNode] = LeftPair;
259 auto &[RightGain, RightNode] = RightPair;
261 if (LeftGain + RightGain <= 0.f)
264 if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket,
Signatures, RNG))
265 ++NumMovedDataVertices;
266 if (moveFunctionNode(*RightNode, LeftBucket, RightBucket,
Signatures, RNG))
267 ++NumMovedDataVertices;
269 return NumMovedDataVertices;
274 unsigned RightBucket,
276 std::mt19937 &RNG)
const {
278 if (std::uniform_real_distribution<float>(0.f, 1.f)(RNG) <=
282 bool FromLeftToRight = (
N.Bucket == LeftBucket);
284 N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
287 if (FromLeftToRight) {
288 for (
auto &UN :
N.UtilityNodes) {
290 Signature.LeftCount--;
291 Signature.RightCount++;
292 Signature.CachedGainIsValid =
false;
295 for (
auto &UN :
N.UtilityNodes) {
297 Signature.LeftCount++;
298 Signature.RightCount--;
299 Signature.CachedGainIsValid =
false;
305void BalancedPartitioning::split(
const FunctionNodeRange Nodes,
306 unsigned StartBucket)
const {
307 unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
308 auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
310 std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](
auto &L,
auto &R) {
311 return L.InputOrderIndex < R.InputOrderIndex;
315 N.Bucket = StartBucket;
317 N.Bucket = StartBucket + 1;
321 bool FromLeftToRight,
324 for (
auto &UN :
N.UtilityNodes)
325 Gain += (FromLeftToRight ?
Signatures[UN].CachedGainLR
330float BalancedPartitioning::logCost(
unsigned X,
unsigned Y)
const {
331 return -(
X * log2Cached(
X + 1) +
Y * log2Cached(
Y + 1));
334float BalancedPartitioning::log2Cached(
unsigned i)
const {
335 return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const FuncProtoTy Signatures[]
A function with a set of utility nodes where it is beneficial to order two functions close together i...
IDT Id
The ID of this node.
SmallVector< UtilityNodeT, 4 > UtilityNodes
The list of utility nodes associated with this node.
std::optional< unsigned > Bucket
The bucket assigned by balanced partitioning.
void dump(raw_ostream &OS) const
static float moveGain(const BPFunctionNode &N, bool FromLeftToRight, const SignaturesT &Signatures)
Compute the move gain for uniform log-gap cost.
void run(std::vector< BPFunctionNode > &Nodes) const
Run recursive graph partitioning that optimizes a given objective.
BalancedPartitioning(const BalancedPartitioningConfig &Config)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
A ThreadPool for asynchronous parallel execution on a defined number of threads.
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This is an optimization pass for GlobalISel generic memory operations.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto formatv(const char *Fmt, Ts &&... Vals) -> formatv_object< decltype(std::make_tuple(detail::build_format_adapter(std::forward< Ts >(Vals))...))>
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
auto partition(R &&Range, UnaryPredicate P)
Provide wrappers to std::partition which take ranges instead of having to pass begin/end explicitly.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Algorithm parameters; default values are tuned on real-world binaries.
unsigned SplitDepth
The depth of the recursive bisection.
float SkipProbability
The probability for a vertex to skip a move from its current bucket to another bucket; it often helps...
unsigned TaskSplitDepth
Recursive subtasks up to the given depth are added to the queue and distributed among threads by Thre...
unsigned IterationsPerSplit
The maximum number of bp iterations per split.