CodeLayout.cpp
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // ExtTSP - layout of basic blocks with i-cache optimization.
10 //
11 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
12 // optimizing jump locality and thus processor I-cache utilization. This is
13 // achieved via increasing the number of fall-through jumps and co-locating
14 // frequently executed nodes together. The name follows the underlying
15 // optimization problem, Extended-TSP, which is a generalization of classical
16 // (maximum) Traveling Salesman Problem.
17 //
18 // The algorithm is a greedy heuristic that works with chains (ordered lists)
19 // of basic blocks. Initially all chains are isolated basic blocks. On every
20 // iteration, we pick a pair of chains whose merging yields the biggest increase
21 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
22 // A pair of chains giving the maximum gain is merged into a new chain. The
23 // procedure stops when there is only one chain left, or when merging does not
24 // increase ExtTSP. In the latter case, the remaining chains are sorted by
25 // density in decreasing order.
26 //
27 // An important aspect is the way two chains are merged. Unlike earlier
28 // algorithms (e.g., based on the approach of Pettis-Hansen), when merging two
29 // chains, X and Y, X is first split into X1 and X2, yielding three chains. Then we
30 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
31 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
32 // This improves the quality of the final result (the search space is larger)
33 // while keeping the implementation sufficiently fast.
34 //
35 // Reference:
36 // * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
37 // IEEE Transactions on Computers, 2020
38 //
39 //===----------------------------------------------------------------------===//
40 
41 #include "llvm/Transforms/Utils/CodeLayout.h"
42 #include "llvm/Support/CommandLine.h"
43 
44 using namespace llvm;
45 #define DEBUG_TYPE "code-layout"
46 
48  "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
49  cl::desc("Enable machine block placement based on the ext-tsp model, "
50  "optimizing I-cache utilization."));
51 
53  "ext-tsp-apply-without-profile",
54  cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
56 
57 // Algorithm-specific constants. The values are tuned for the best performance
58 // of large-scale front-end bound binaries.
59 static cl::opt<double>
60  ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
61  cl::desc("The weight of forward jumps for ExtTSP value"));
62 
63 static cl::opt<double>
64  BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
65  cl::desc("The weight of backward jumps for ExtTSP value"));
66 
68  "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
69  cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
70 
72  "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
73  cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
74 
75 // The maximum size of a chain created by the algorithm. The size is bounded
76 // so that the algorithm can efficiently process extremely large instances.
77 static cl::opt<unsigned>
78  MaxChainSize("ext-tsp-max-chain-size", cl::Hidden, cl::init(4096),
79  cl::desc("The maximum size of a chain to create."));
80 
81 // The maximum size of a chain for splitting. Larger values of the threshold
82 // may yield better quality at the cost of worse run-time.
83 static cl::opt<unsigned> ChainSplitThreshold(
84  "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
85  cl::desc("The maximum size of a chain to apply splitting"));
86 
87 // The option enables splitting (large) chains along in-coming and out-going
88 // jumps. This typically results in better quality.
89 static cl::opt<bool> EnableChainSplitAlongJumps(
90  "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
91  cl::desc("Whether to split chains along in-coming and out-going jumps"));
92 
93 namespace {
94 
95 // Epsilon for comparison of doubles.
96 constexpr double EPS = 1e-8;
97 
98 // Compute the Ext-TSP score for a jump between a given pair of blocks,
99 // using their sizes, (estimated) addresses and the jump execution count.
100 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
101  uint64_t Count) {
102  // Fallthrough
103  if (SrcAddr + SrcSize == DstAddr) {
104  // Assume that FallthroughWeight = 1.0 after normalization
105  return static_cast<double>(Count);
106  }
107  // Forward
108  if (SrcAddr + SrcSize < DstAddr) {
109  const auto Dist = DstAddr - (SrcAddr + SrcSize);
110  if (Dist <= ForwardDistance) {
111  double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
112  return ForwardWeight * Prob * Count;
113  }
114  return 0;
115  }
116  // Backward
117  const auto Dist = SrcAddr + SrcSize - DstAddr;
118  if (Dist <= BackwardDistance) {
119  double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
120  return BackwardWeight * Prob * Count;
121  }
122  return 0;
123 }
124 
125 /// A type of merging two chains, X and Y. The former chain is split into
126 /// X1 and X2 and then concatenated with Y in the order specified by the type.
127 enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
128 
129 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
130 /// together with the corresponding merge 'type' and 'offset'.
131 class MergeGainTy {
132 public:
133  explicit MergeGainTy() = default;
134  explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
135  : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
136 
137  double score() const { return Score; }
138 
139  size_t mergeOffset() const { return MergeOffset; }
140 
141  MergeTypeTy mergeType() const { return MergeType; }
142 
143  // Returns 'true' iff Other is preferred over this.
144  bool operator<(const MergeGainTy &Other) const {
145  return (Other.Score > EPS && Other.Score > Score + EPS);
146  }
147 
148  // Update the current gain if Other is preferred over this.
149  void updateIfLessThan(const MergeGainTy &Other) {
150  if (*this < Other)
151  *this = Other;
152  }
153 
154 private:
155  double Score{-1.0};
156  size_t MergeOffset{0};
157  MergeTypeTy MergeType{MergeTypeTy::X_Y};
158 };
159 
160 class Jump;
161 class Chain;
162 class ChainEdge;
163 
164 /// A node in the graph, typically corresponding to a basic block in CFG.
165 class Block {
166 public:
167  Block(const Block &) = delete;
168  Block(Block &&) = default;
169  Block &operator=(const Block &) = delete;
170  Block &operator=(Block &&) = default;
171 
172  // The original index of the block in CFG.
173  size_t Index{0};
174  // The index of the block in the current chain.
175  size_t CurIndex{0};
176  // Size of the block in the binary.
177  uint64_t Size{0};
178  // Execution count of the block in the profile data.
179  uint64_t ExecutionCount{0};
180  // Current chain of the node.
181  Chain *CurChain{nullptr};
182  // An offset of the block in the current chain.
183  mutable uint64_t EstimatedAddr{0};
184  // Forced successor of the block in CFG.
185  Block *ForcedSucc{nullptr};
186  // Forced predecessor of the block in CFG.
187  Block *ForcedPred{nullptr};
188  // Outgoing jumps from the block.
189  std::vector<Jump *> OutJumps;
190  // Incoming jumps to the block.
191  std::vector<Jump *> InJumps;
192 
193 public:
194  explicit Block(size_t Index, uint64_t Size_, uint64_t EC)
195  : Index(Index), Size(Size_), ExecutionCount(EC) {}
196  bool isEntry() const { return Index == 0; }
197 };
198 
199 /// An arc in the graph, typically corresponding to a jump between two blocks.
200 class Jump {
201 public:
202  Jump(const Jump &) = delete;
203  Jump(Jump &&) = default;
204  Jump &operator=(const Jump &) = delete;
205  Jump &operator=(Jump &&) = default;
206 
207  // Source block of the jump.
208  Block *Source;
209  // Target block of the jump.
210  Block *Target;
211  // Execution count of the arc in the profile data.
212  uint64_t ExecutionCount{0};
213 
214 public:
215  explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount)
216  : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
217 };
218 
219 /// A chain (ordered sequence) of blocks.
220 class Chain {
221 public:
222  Chain(const Chain &) = delete;
223  Chain(Chain &&) = default;
224  Chain &operator=(const Chain &) = delete;
225  Chain &operator=(Chain &&) = default;
226 
227  explicit Chain(uint64_t Id, Block *Block)
228  : Id(Id), Score(0), Blocks(1, Block) {}
229 
230  uint64_t id() const { return Id; }
231 
232  bool isEntry() const { return Blocks[0]->Index == 0; }
233 
234  double score() const { return Score; }
235 
236  void setScore(double NewScore) { Score = NewScore; }
237 
238  const std::vector<Block *> &blocks() const { return Blocks; }
239 
240  size_t numBlocks() const { return Blocks.size(); }
241 
242  const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const {
243  return Edges;
244  }
245 
246  ChainEdge *getEdge(Chain *Other) const {
247  for (auto It : Edges) {
248  if (It.first == Other)
249  return It.second;
250  }
251  return nullptr;
252  }
253 
254  void removeEdge(Chain *Other) {
255  auto It = Edges.begin();
256  while (It != Edges.end()) {
257  if (It->first == Other) {
258  Edges.erase(It);
259  return;
260  }
261  It++;
262  }
263  }
264 
265  void addEdge(Chain *Other, ChainEdge *Edge) {
266  Edges.push_back(std::make_pair(Other, Edge));
267  }
268 
269  void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) {
270  Blocks = MergedBlocks;
271  // Update the block's chains
272  for (size_t Idx = 0; Idx < Blocks.size(); Idx++) {
273  Blocks[Idx]->CurChain = this;
274  Blocks[Idx]->CurIndex = Idx;
275  }
276  }
277 
278  void mergeEdges(Chain *Other);
279 
280  void clear() {
281  Blocks.clear();
282  Blocks.shrink_to_fit();
283  Edges.clear();
284  Edges.shrink_to_fit();
285  }
286 
287 private:
288  // Unique chain identifier.
289  uint64_t Id;
290  // Cached ext-tsp score for the chain.
291  double Score;
292  // Blocks of the chain.
293  std::vector<Block *> Blocks;
294  // Adjacent chains and corresponding edges (lists of jumps).
295  std::vector<std::pair<Chain *, ChainEdge *>> Edges;
296 };
297 
298 /// An edge in CFG representing jumps between two chains.
299 /// When blocks are merged into chains, the edges are combined too so that
300 /// there is always at most one edge between a pair of chains.
301 class ChainEdge {
302 public:
303  ChainEdge(const ChainEdge &) = delete;
304  ChainEdge(ChainEdge &&) = default;
305  ChainEdge &operator=(const ChainEdge &) = delete;
306  ChainEdge &operator=(ChainEdge &&) = default;
307 
308  explicit ChainEdge(Jump *Jump)
309  : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
310  Jumps(1, Jump) {}
311 
312  const std::vector<Jump *> &jumps() const { return Jumps; }
313 
314  void changeEndpoint(Chain *From, Chain *To) {
315  if (From == SrcChain)
316  SrcChain = To;
317  if (From == DstChain)
318  DstChain = To;
319  }
320 
321  void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
322 
323  void moveJumps(ChainEdge *Other) {
324  Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
325  Other->Jumps.clear();
326  Other->Jumps.shrink_to_fit();
327  }
328 
329  bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
330  return Src == SrcChain ? CacheValidForward : CacheValidBackward;
331  }
332 
333  MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
334  return Src == SrcChain ? CachedGainForward : CachedGainBackward;
335  }
336 
337  void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
338  if (Src == SrcChain) {
339  CachedGainForward = MergeGain;
340  CacheValidForward = true;
341  } else {
342  CachedGainBackward = MergeGain;
343  CacheValidBackward = true;
344  }
345  }
346 
347  void invalidateCache() {
348  CacheValidForward = false;
349  CacheValidBackward = false;
350  }
351 
352 private:
353  // Source chain.
354  Chain *SrcChain{nullptr};
355  // Destination chain.
356  Chain *DstChain{nullptr};
357  // Original jumps in the binary with corresponding execution counts.
358  std::vector<Jump *> Jumps;
359  // Cached ext-tsp value for merging the pair of chains.
360  // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
361  // we store both values here.
362  MergeGainTy CachedGainForward;
363  MergeGainTy CachedGainBackward;
364  // Whether the cached value must be recomputed.
365  bool CacheValidForward{false};
366  bool CacheValidBackward{false};
367 };
368 
369 void Chain::mergeEdges(Chain *Other) {
370  assert(this != Other && "cannot merge a chain with itself");
371 
372  // Update edges adjacent to chain Other
373  for (auto EdgeIt : Other->Edges) {
374  const auto DstChain = EdgeIt.first;
375  const auto DstEdge = EdgeIt.second;
376  const auto TargetChain = DstChain == Other ? this : DstChain;
377  auto CurEdge = getEdge(TargetChain);
378  if (CurEdge == nullptr) {
379  DstEdge->changeEndpoint(Other, this);
380  this->addEdge(TargetChain, DstEdge);
381  if (DstChain != this && DstChain != Other) {
382  DstChain->addEdge(this, DstEdge);
383  }
384  } else {
385  CurEdge->moveJumps(DstEdge);
386  }
387  // Cleanup leftover edge
388  if (DstChain != Other) {
389  DstChain->removeEdge(Other);
390  }
391  }
392 }
393 
394 using BlockIter = std::vector<Block *>::const_iterator;
395 
396 /// A wrapper around three chains of blocks; it is used to avoid extra
397 /// instantiation of the vectors.
398 class MergedChain {
399 public:
400  MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
401  BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
402  BlockIter End3 = BlockIter())
403  : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
404  End3(End3) {}
405 
406  template <typename F> void forEach(const F &Func) const {
407  for (auto It = Begin1; It != End1; It++)
408  Func(*It);
409  for (auto It = Begin2; It != End2; It++)
410  Func(*It);
411  for (auto It = Begin3; It != End3; It++)
412  Func(*It);
413  }
414 
415  std::vector<Block *> getBlocks() const {
416  std::vector<Block *> Result;
417  Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
418  std::distance(Begin3, End3));
419  Result.insert(Result.end(), Begin1, End1);
420  Result.insert(Result.end(), Begin2, End2);
421  Result.insert(Result.end(), Begin3, End3);
422  return Result;
423  }
424 
425  const Block *getFirstBlock() const { return *Begin1; }
426 
427 private:
428  BlockIter Begin1;
429  BlockIter End1;
430  BlockIter Begin2;
431  BlockIter End2;
432  BlockIter Begin3;
433  BlockIter End3;
434 };
435 
436 /// The implementation of the ExtTSP algorithm.
437 class ExtTSPImpl {
438  using EdgeT = std::pair<uint64_t, uint64_t>;
439  using EdgeCountMap = DenseMap<EdgeT, uint64_t>;
440 
441 public:
442  ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes,
443  const std::vector<uint64_t> &NodeCounts,
444  const EdgeCountMap &EdgeCounts)
445  : NumNodes(NumNodes) {
446  initialize(NodeSizes, NodeCounts, EdgeCounts);
447  }
448 
449  /// Run the algorithm and return an optimized ordering of blocks.
450  void run(std::vector<uint64_t> &Result) {
451  // Pass 1: Merge blocks with their mutually forced successors
452  mergeForcedPairs();
453 
454  // Pass 2: Merge pairs of chains while improving the ExtTSP objective
455  mergeChainPairs();
456 
457  // Pass 3: Merge cold blocks to reduce code size
458  mergeColdChains();
459 
460  // Collect blocks from all chains
461  concatChains(Result);
462  }
463 
464 private:
465  /// Initialize the algorithm's data structures.
466  void initialize(const std::vector<uint64_t> &NodeSizes,
467  const std::vector<uint64_t> &NodeCounts,
468  const EdgeCountMap &EdgeCounts) {
469  // Initialize blocks
470  AllBlocks.reserve(NumNodes);
471  for (uint64_t Node = 0; Node < NumNodes; Node++) {
472  uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
473  uint64_t ExecutionCount = NodeCounts[Node];
474  // The execution count of the entry block is set to at least 1
475  if (Node == 0 && ExecutionCount == 0)
476  ExecutionCount = 1;
477  AllBlocks.emplace_back(Node, Size, ExecutionCount);
478  }
479 
480  // Initialize jumps between blocks
481  SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes);
482  PredNodes = std::vector<std::vector<uint64_t>>(NumNodes);
483  AllJumps.reserve(EdgeCounts.size());
484  for (auto It : EdgeCounts) {
485  auto Pred = It.first.first;
486  auto Succ = It.first.second;
487  // Ignore self-edges
488  if (Pred == Succ)
489  continue;
490 
491  SuccNodes[Pred].push_back(Succ);
492  PredNodes[Succ].push_back(Pred);
493  auto ExecutionCount = It.second;
494  if (ExecutionCount > 0) {
495  auto &Block = AllBlocks[Pred];
496  auto &SuccBlock = AllBlocks[Succ];
497  AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
498  SuccBlock.InJumps.push_back(&AllJumps.back());
499  Block.OutJumps.push_back(&AllJumps.back());
500  }
501  }
502 
503  // Initialize chains
504  AllChains.reserve(NumNodes);
505  HotChains.reserve(NumNodes);
506  for (auto &Block : AllBlocks) {
507  AllChains.emplace_back(Block.Index, &Block);
508  Block.CurChain = &AllChains.back();
509  if (Block.ExecutionCount > 0) {
510  HotChains.push_back(&AllChains.back());
511  }
512  }
513 
514  // Initialize chain edges
515  AllEdges.reserve(AllJumps.size());
516  for (auto &Block : AllBlocks) {
517  for (auto &Jump : Block.OutJumps) {
518  auto SuccBlock = Jump->Target;
519  auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain);
520  // this edge is already present in the graph
521  if (CurEdge != nullptr) {
522  assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr);
523  CurEdge->appendJump(Jump);
524  continue;
525  }
526  // this is a new edge
527  AllEdges.emplace_back(Jump);
528  Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
529  SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back());
530  }
531  }
532  }
533 
534  /// For a pair of blocks, A and B, block B is the forced successor of A,
535  /// if (i) all jumps (based on profile) from A go to B and (ii) all jumps
536  /// to B are from A. Such blocks should be adjacent in the optimal ordering;
537  /// the method finds and merges such pairs of blocks.
538  void mergeForcedPairs() {
539  // Find fallthroughs based on edge weights
540  for (auto &Block : AllBlocks) {
541  if (SuccNodes[Block.Index].size() == 1 &&
542  PredNodes[SuccNodes[Block.Index][0]].size() == 1 &&
543  SuccNodes[Block.Index][0] != 0) {
544  size_t SuccIndex = SuccNodes[Block.Index][0];
545  Block.ForcedSucc = &AllBlocks[SuccIndex];
546  AllBlocks[SuccIndex].ForcedPred = &Block;
547  }
548  }
549 
550  // There might be 'cycles' in the forced dependencies, since profile
551  // data isn't 100% accurate. Typically this is observed in loops, when the
552  // loop edges are the hottest successors for the basic blocks of the loop.
553  // Break the cycles by choosing the block with the smallest index as the
554  // head. This helps to keep the original order of the loops, which likely
555  // have already been rotated in the optimized manner.
556  for (auto &Block : AllBlocks) {
557  if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr)
558  continue;
559 
560  auto SuccBlock = Block.ForcedSucc;
561  while (SuccBlock != nullptr && SuccBlock != &Block) {
562  SuccBlock = SuccBlock->ForcedSucc;
563  }
564  if (SuccBlock == nullptr)
565  continue;
566  // Break the cycle
567  AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr;
568  Block.ForcedPred = nullptr;
569  }
570 
571  // Merge blocks with their fallthrough successors
572  for (auto &Block : AllBlocks) {
573  if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) {
574  auto CurBlock = &Block;
575  while (CurBlock->ForcedSucc != nullptr) {
576  const auto NextBlock = CurBlock->ForcedSucc;
577  mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
578  CurBlock = NextBlock;
579  }
580  }
581  }
582  }
583 
584  /// Merge pairs of chains while improving the ExtTSP objective.
585  void mergeChainPairs() {
586  /// Deterministically compare pairs of chains
587  auto compareChainPairs = [](const Chain *A1, const Chain *B1,
588  const Chain *A2, const Chain *B2) {
589  if (A1 != A2)
590  return A1->id() < A2->id();
591  return B1->id() < B2->id();
592  };
593 
594  while (HotChains.size() > 1) {
595  Chain *BestChainPred = nullptr;
596  Chain *BestChainSucc = nullptr;
597  auto BestGain = MergeGainTy();
598  // Iterate over all pairs of chains
599  for (auto ChainPred : HotChains) {
600  // Get candidates for merging with the current chain
601  for (auto EdgeIter : ChainPred->edges()) {
602  auto ChainSucc = EdgeIter.first;
603  auto ChainEdge = EdgeIter.second;
604  // Ignore loop edges
605  if (ChainPred == ChainSucc)
606  continue;
607 
608  // Stop early if the combined chain violates the maximum allowed size
609  if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
610  continue;
611 
612  // Compute the gain of merging the two chains
613  auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
614  if (CurGain.score() <= EPS)
615  continue;
616 
617  if (BestGain < CurGain ||
618  (std::abs(CurGain.score() - BestGain.score()) < EPS &&
619  compareChainPairs(ChainPred, ChainSucc, BestChainPred,
620  BestChainSucc))) {
621  BestGain = CurGain;
622  BestChainPred = ChainPred;
623  BestChainSucc = ChainSucc;
624  }
625  }
626  }
627 
628  // Stop merging when there is no improvement
629  if (BestGain.score() <= EPS)
630  break;
631 
632  // Merge the best pair of chains
633  mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
634  BestGain.mergeType());
635  }
636  }
637 
638  /// Merge cold blocks to reduce code size.
639  void mergeColdChains() {
640  for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
641  // Iterating over neighbors in the reverse order to make sure original
642  // fallthrough jumps are merged first
643  size_t NumSuccs = SuccNodes[SrcBB].size();
644  for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
645  auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
646  auto SrcChain = AllBlocks[SrcBB].CurChain;
647  auto DstChain = AllBlocks[DstBB].CurChain;
648  if (SrcChain != DstChain && !DstChain->isEntry() &&
649  SrcChain->blocks().back()->Index == SrcBB &&
650  DstChain->blocks().front()->Index == DstBB) {
651  mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
652  }
653  }
654  }
655  }
656 
657  /// Compute the Ext-TSP score for a given block order and a list of jumps.
658  double extTSPScore(const MergedChain &MergedBlocks,
659  const std::vector<Jump *> &Jumps) const {
660  if (Jumps.empty())
661  return 0.0;
662  uint64_t CurAddr = 0;
663  MergedBlocks.forEach([&](const Block *BB) {
664  BB->EstimatedAddr = CurAddr;
665  CurAddr += BB->Size;
666  });
667 
668  double Score = 0;
669  for (auto &Jump : Jumps) {
670  const auto SrcBlock = Jump->Source;
671  const auto DstBlock = Jump->Target;
672  Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
673  DstBlock->EstimatedAddr, Jump->ExecutionCount);
674  }
675  return Score;
676  }
677 
678  /// Compute the gain of merging two chains.
679  ///
680  /// The function considers all possible ways of merging two chains and
681  /// computes the one having the largest increase in ExtTSP objective. The
682  /// result is a MergeGainTy object holding the gain together with the
683  /// corresponding merge offset and type.
684  MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
685  ChainEdge *Edge) const {
686  if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
687  return Edge->getCachedMergeGain(ChainPred, ChainSucc);
688  }
689 
690  // Precompute jumps between ChainPred and ChainSucc
691  auto Jumps = Edge->jumps();
692  auto EdgePP = ChainPred->getEdge(ChainPred);
693  if (EdgePP != nullptr) {
694  Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
695  }
696  assert(!Jumps.empty() && "trying to merge chains w/o jumps");
697 
698  // The object holds the best currently chosen gain of merging the two chains
699  MergeGainTy Gain = MergeGainTy();
700 
701  /// Given a merge offset and a list of merge types, try to merge two chains
702  /// and update Gain with a better alternative
703  auto tryChainMerging = [&](size_t Offset,
704  const std::vector<MergeTypeTy> &MergeTypes) {
705  // Skip merging corresponding to concatenation w/o splitting
706  if (Offset == 0 || Offset == ChainPred->blocks().size())
707  return;
708  // Skip merging if it breaks Forced successors
709  auto BB = ChainPred->blocks()[Offset - 1];
710  if (BB->ForcedSucc != nullptr)
711  return;
712  // Apply the merge, compute the corresponding gain, and update the best
713  // value, if the merge is beneficial
714  for (auto &MergeType : MergeTypes) {
715  Gain.updateIfLessThan(
716  computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
717  }
718  };
719 
720  // Try to concatenate two chains w/o splitting
721  Gain.updateIfLessThan(
722  computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
723 
724  if (EnableChainSplitAlongJumps) {
725  // Attach (a part of) ChainPred before the first block of ChainSucc
726  for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
727  const auto SrcBlock = Jump->Source;
728  if (SrcBlock->CurChain != ChainPred)
729  continue;
730  size_t Offset = SrcBlock->CurIndex + 1;
731  tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
732  }
733 
734  // Attach (a part of) ChainPred after the last block of ChainSucc
735  for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
736  const auto DstBlock = Jump->Target;
737  if (DstBlock->CurChain != ChainPred)
738  continue;
739  size_t Offset = DstBlock->CurIndex;
740  tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
741  }
742  }
743 
744  // Try to break ChainPred in various ways and concatenate with ChainSucc
745  if (ChainPred->blocks().size() <= ChainSplitThreshold) {
746  for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
747  // Try to split the chain in different ways. In practice, applying
748  // X2_Y_X1 merging almost never provides benefits; thus, we exclude
749  // it from consideration to reduce the search space
750  tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
751  MergeTypeTy::X2_X1_Y});
752  }
753  }
754  Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
755  return Gain;
756  }
757 
758  /// Compute the score gain of merging two chains, respecting a given
759  /// merge 'type' and 'offset'.
760  ///
761  /// The two chains are not modified in the method.
762  MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
763  const std::vector<Jump *> &Jumps,
764  size_t MergeOffset,
765  MergeTypeTy MergeType) const {
766  auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
767  MergeOffset, MergeType);
768 
769  // Do not allow a merge that does not preserve the original entry block
770  if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
771  !MergedBlocks.getFirstBlock()->isEntry())
772  return MergeGainTy();
773 
774  // The gain for the new chain
775  auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
776  return MergeGainTy(NewGainScore, MergeOffset, MergeType);
777  }
778 
779  /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
780  ///
781  /// If MergeType is X_Y, then the result is a concatenation of the two chains.
782  /// Otherwise, the first chain is cut into two sub-chains at the offset,
783  /// and merged using all possible ways of concatenating three chains.
784  MergedChain mergeBlocks(const std::vector<Block *> &X,
785  const std::vector<Block *> &Y, size_t MergeOffset,
786  MergeTypeTy MergeType) const {
787  // Split the first chain, X, into X1 and X2
788  BlockIter BeginX1 = X.begin();
789  BlockIter EndX1 = X.begin() + MergeOffset;
790  BlockIter BeginX2 = X.begin() + MergeOffset;
791  BlockIter EndX2 = X.end();
792  BlockIter BeginY = Y.begin();
793  BlockIter EndY = Y.end();
794 
795  // Construct a new chain from the three existing ones
796  switch (MergeType) {
797  case MergeTypeTy::X_Y:
798  return MergedChain(BeginX1, EndX2, BeginY, EndY);
799  case MergeTypeTy::X1_Y_X2:
800  return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
801  case MergeTypeTy::Y_X2_X1:
802  return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
803  case MergeTypeTy::X2_X1_Y:
804  return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
805  }
806  llvm_unreachable("unexpected chain merge type");
807  }
808 
809  /// Merge chain From into chain Into, update the list of active chains,
810  /// adjacency information, and the corresponding cached values.
811  void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
812  MergeTypeTy MergeType) {
813  assert(Into != From && "a chain cannot be merged with itself");
814 
815  // Merge the blocks
816  auto MergedBlocks =
817  mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
818  Into->merge(From, MergedBlocks.getBlocks());
819  Into->mergeEdges(From);
820  From->clear();
821 
822  // Update cached ext-tsp score for the new chain
823  auto SelfEdge = Into->getEdge(Into);
824  if (SelfEdge != nullptr) {
825  MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
826  Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
827  }
828 
829  // Remove chain From from the list of active chains
830  auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
831  HotChains.erase(Iter, HotChains.end());
832 
833  // Invalidate caches
834  for (auto EdgeIter : Into->edges()) {
835  EdgeIter.second->invalidateCache();
836  }
837  }
838 
839  /// Concatenate all chains into a final order of blocks.
840  void concatChains(std::vector<uint64_t> &Order) {
841  // Collect chains and calculate some stats for their sorting
842  std::vector<Chain *> SortedChains;
843  DenseMap<const Chain *, double> ChainDensity;
844  for (auto &Chain : AllChains) {
845  if (!Chain.blocks().empty()) {
846  SortedChains.push_back(&Chain);
847  // Using doubles to avoid overflow of ExecutionCount
848  double Size = 0;
849  double ExecutionCount = 0;
850  for (auto Block : Chain.blocks()) {
851  Size += static_cast<double>(Block->Size);
852  ExecutionCount += static_cast<double>(Block->ExecutionCount);
853  }
854  assert(Size > 0 && "a chain of zero size");
855  ChainDensity[&Chain] = ExecutionCount / Size;
856  }
857  }
858 
859  // Sort chains by density in decreasing order
860  std::stable_sort(SortedChains.begin(), SortedChains.end(),
861  [&](const Chain *C1, const Chain *C2) {
862  // Make sure the original entry block is at the
863  // beginning of the order
864  if (C1->isEntry() != C2->isEntry()) {
865  return C1->isEntry();
866  }
867 
868  const double D1 = ChainDensity[C1];
869  const double D2 = ChainDensity[C2];
870  // Compare by density and break ties by chain identifiers
871  return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
872  });
873 
874  // Collect the blocks in the order specified by their chains
875  Order.reserve(NumNodes);
876  for (auto Chain : SortedChains) {
877  for (auto Block : Chain->blocks()) {
878  Order.push_back(Block->Index);
879  }
880  }
881  }
882 
883 private:
884  /// The number of nodes in the graph.
885  const size_t NumNodes;
886 
887  /// Successors of each node.
888  std::vector<std::vector<uint64_t>> SuccNodes;
889 
890  /// Predecessors of each node.
891  std::vector<std::vector<uint64_t>> PredNodes;
892 
893  /// All basic blocks.
894  std::vector<Block> AllBlocks;
895 
896  /// All jumps between blocks.
897  std::vector<Jump> AllJumps;
898 
899  /// All chains of basic blocks.
900  std::vector<Chain> AllChains;
901 
902  /// All edges between chains.
903  std::vector<ChainEdge> AllEdges;
904 
905  /// Active chains. The vector gets updated at runtime when chains are merged.
906  std::vector<Chain *> HotChains;
907 };
908 
909 } // end of anonymous namespace
910 
911 std::vector<uint64_t> llvm::applyExtTspLayout(
912  const std::vector<uint64_t> &NodeSizes,
913  const std::vector<uint64_t> &NodeCounts,
914  const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
915  size_t NumNodes = NodeSizes.size();
916 
917  // Verify correctness of the input data.
918  assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
919  assert(NumNodes > 2 && "Incorrect input");
920 
921  // Apply the reordering algorithm.
922  auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
923  std::vector<uint64_t> Result;
924  Alg.run(Result);
925 
926  // Verify correctness of the output.
927  assert(Result.front() == 0 && "Original entry point is not preserved");
928  assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
929  return Result;
930 }
931 
932 double llvm::calcExtTspScore(
933  const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
934  const std::vector<uint64_t> &NodeCounts,
935  const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
936  // Estimate addresses of the blocks in memory
937  auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0);
938  for (size_t Idx = 1; Idx < Order.size(); Idx++) {
939  Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
940  }
941 
942  // Increase the score for each jump
943  double Score = 0;
944  for (auto It : EdgeCounts) {
945  auto Pred = It.first.first;
946  auto Succ = It.first.second;
947  uint64_t Count = It.second;
948  Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count);
949  }
950  return Score;
951 }
952 
953 double llvm::calcExtTspScore(
954  const std::vector<uint64_t> &NodeSizes,
955  const std::vector<uint64_t> &NodeCounts,
956  const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
957  auto Order = std::vector<uint64_t>(NodeSizes.size());
958  for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
959  Order[Idx] = Idx;
960  }
961  return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
962 }