48 #define DEBUG_TYPE "code-layout"
52 cl::desc(
"Enable machine block placement based on the ext-tsp model, "
53 "optimizing I-cache utilization."));
56 "ext-tsp-apply-without-profile",
57 cl::desc(
"Whether to apply ext-tsp placement for instances w/o profile"),
64 cl::desc(
"The weight of conditional forward jumps for ExtTSP value"));
68 cl::desc(
"The weight of unconditional forward jumps for ExtTSP value"));
72 cl::desc(
"The weight of conditonal backward jumps for ExtTSP value"));
76 cl::desc(
"The weight of unconditonal backward jumps for ExtTSP value"));
80 cl::desc(
"The weight of conditional fallthrough jumps for ExtTSP value"));
84 cl::desc(
"The weight of unconditional fallthrough jumps for ExtTSP value"));
88 cl::desc(
"The maximum distance (in bytes) of a forward jump for ExtTSP"));
92 cl::desc(
"The maximum distance (in bytes) of a backward jump for ExtTSP"));
98 cl::desc(
"The maximum size of a chain to create."));
104 cl::desc(
"The maximum size of a chain to apply splitting"));
110 cl::desc(
"The maximum size of a chain to apply splitting"));
115 constexpr
double EPS = 1
e-8;
120 if (JumpDist > JumpMaxDist)
122 double Prob = 1.0 -
static_cast<double>(JumpDist) / JumpMaxDist;
123 return Weight * Prob * Count;
129 uint64_t Count,
bool IsConditional) {
131 if (SrcAddr + SrcSize == DstAddr) {
132 return jumpExtTSPScore(0, 1, Count,
137 if (SrcAddr + SrcSize < DstAddr) {
138 const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
144 const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
152 enum class MergeTypeTy :
int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
158 explicit MergeGainTy() =
default;
159 explicit MergeGainTy(
double Score,
size_t MergeOffset, MergeTypeTy MergeType)
160 : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
162 double score()
const {
return Score; }
164 size_t mergeOffset()
const {
return MergeOffset; }
166 MergeTypeTy mergeType()
const {
return MergeType; }
169 bool operator<(
const MergeGainTy &Other)
const {
170 return (
Other.Score > EPS &&
Other.Score > Score + EPS);
174 void updateIfLessThan(
const MergeGainTy &Other) {
181 size_t MergeOffset{0};
182 MergeTypeTy MergeType{MergeTypeTy::X_Y};
192 Block(
const Block &) =
delete;
193 Block(Block &&) =
default;
206 Chain *CurChain{
nullptr};
210 Block *ForcedSucc{
nullptr};
212 Block *ForcedPred{
nullptr};
214 std::vector<Jump *> OutJumps;
216 std::vector<Jump *> InJumps;
221 bool isEntry()
const {
return Index == 0; }
227 Jump(
const Jump &) =
delete;
228 Jump(Jump &&) =
default;
239 bool IsConditional{
false};
249 Chain(
const Chain &) =
delete;
250 Chain(Chain &&) =
default;
251 Chain &
operator=(
const Chain &) =
delete;
259 bool isEntry()
const {
return Blocks[0]->Index == 0; }
261 bool isCold()
const {
262 for (
auto *Block : Blocks) {
263 if (
Block->ExecutionCount > 0)
269 double score()
const {
return Score; }
271 void setScore(
double NewScore) { Score = NewScore; }
273 const std::vector<Block *> &blocks()
const {
return Blocks; }
275 size_t numBlocks()
const {
return Blocks.size(); }
277 const std::vector<std::pair<Chain *, ChainEdge *>> &edges()
const {
281 ChainEdge *getEdge(Chain *Other)
const {
282 for (
auto It : Edges) {
283 if (It.first == Other)
289 void removeEdge(Chain *Other) {
290 auto It = Edges.begin();
291 while (It != Edges.end()) {
292 if (It->first == Other) {
300 void addEdge(Chain *Other, ChainEdge *Edge) {
301 Edges.push_back(std::make_pair(Other, Edge));
304 void merge(Chain *Other,
const std::vector<Block *> &MergedBlocks) {
305 Blocks = MergedBlocks;
307 for (
size_t Idx = 0; Idx < Blocks.size(); Idx++) {
308 Blocks[Idx]->CurChain =
this;
309 Blocks[Idx]->CurIndex = Idx;
313 void mergeEdges(Chain *Other);
317 Blocks.shrink_to_fit();
319 Edges.shrink_to_fit();
328 std::vector<Block *> Blocks;
330 std::vector<std::pair<Chain *, ChainEdge *>> Edges;
338 ChainEdge(
const ChainEdge &) =
delete;
339 ChainEdge(ChainEdge &&) =
default;
340 ChainEdge &
operator=(
const ChainEdge &) =
delete;
341 ChainEdge &
operator=(ChainEdge &&) =
default;
343 explicit ChainEdge(Jump *Jump)
344 : SrcChain(Jump->
Source->CurChain), DstChain(Jump->
Target->CurChain),
347 const std::vector<Jump *> &jumps()
const {
return Jumps; }
349 void changeEndpoint(Chain *
From, Chain *To) {
350 if (
From == SrcChain)
352 if (
From == DstChain)
356 void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
358 void moveJumps(ChainEdge *Other) {
359 Jumps.insert(Jumps.end(),
Other->Jumps.begin(),
Other->Jumps.end());
360 Other->Jumps.clear();
361 Other->Jumps.shrink_to_fit();
364 bool hasCachedMergeGain(Chain *Src, Chain *Dst)
const {
365 return Src == SrcChain ? CacheValidForward : CacheValidBackward;
368 MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst)
const {
369 return Src == SrcChain ? CachedGainForward : CachedGainBackward;
372 void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
373 if (Src == SrcChain) {
374 CachedGainForward = MergeGain;
375 CacheValidForward =
true;
377 CachedGainBackward = MergeGain;
378 CacheValidBackward =
true;
382 void invalidateCache() {
383 CacheValidForward =
false;
384 CacheValidBackward =
false;
389 Chain *SrcChain{
nullptr};
391 Chain *DstChain{
nullptr};
393 std::vector<Jump *> Jumps;
397 MergeGainTy CachedGainForward;
398 MergeGainTy CachedGainBackward;
400 bool CacheValidForward{
false};
401 bool CacheValidBackward{
false};
404 void Chain::mergeEdges(Chain *Other) {
405 assert(
this != Other &&
"cannot merge a chain with itself");
408 for (
auto EdgeIt :
Other->Edges) {
409 Chain *DstChain = EdgeIt.first;
410 ChainEdge *DstEdge = EdgeIt.second;
411 Chain *TargetChain = DstChain ==
Other ?
this : DstChain;
412 ChainEdge *CurEdge = getEdge(TargetChain);
413 if (CurEdge ==
nullptr) {
414 DstEdge->changeEndpoint(Other,
this);
415 this->
addEdge(TargetChain, DstEdge);
416 if (DstChain !=
this && DstChain != Other) {
417 DstChain->addEdge(
this, DstEdge);
420 CurEdge->moveJumps(DstEdge);
423 if (DstChain != Other) {
424 DstChain->removeEdge(Other);
429 using BlockIter = std::vector<Block *>::const_iterator;
435 MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
436 BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
437 BlockIter End3 = BlockIter())
438 : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
441 template <
typename F>
void forEach(
const F &Func)
const {
442 for (
auto It = Begin1; It != End1; It++)
444 for (
auto It = Begin2; It != End2; It++)
446 for (
auto It = Begin3; It != End3; It++)
450 std::vector<Block *> getBlocks()
const {
451 std::vector<Block *>
Result;
452 Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
453 std::distance(Begin3, End3));
460 const Block *getFirstBlock()
const {
return *Begin1; }
473 using EdgeT = std::pair<uint64_t, uint64_t>;
474 using EdgeCountMap = std::vector<std::pair<EdgeT, uint64_t>>;
477 ExtTSPImpl(
size_t NumNodes,
const std::vector<uint64_t> &NodeSizes,
478 const std::vector<uint64_t> &NodeCounts,
479 const EdgeCountMap &EdgeCounts)
480 : NumNodes(NumNodes) {
481 initialize(NodeSizes, NodeCounts, EdgeCounts);
485 void run(std::vector<uint64_t> &Result) {
496 concatChains(Result);
501 void initialize(
const std::vector<uint64_t> &NodeSizes,
502 const std::vector<uint64_t> &NodeCounts,
503 const EdgeCountMap &EdgeCounts) {
505 AllBlocks.reserve(NumNodes);
506 for (
uint64_t Node = 0; Node < NumNodes; Node++) {
507 uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
508 uint64_t ExecutionCount = NodeCounts[Node];
510 if (Node == 0 && ExecutionCount == 0)
512 AllBlocks.emplace_back(Node, Size, ExecutionCount);
516 SuccNodes.resize(NumNodes);
517 PredNodes.resize(NumNodes);
518 std::vector<uint64_t> OutDegree(NumNodes, 0);
519 AllJumps.reserve(EdgeCounts.size());
520 for (
auto It : EdgeCounts) {
521 auto Pred = It.first.first;
522 auto Succ = It.first.second;
528 SuccNodes[Pred].push_back(Succ);
529 PredNodes[Succ].push_back(Pred);
530 auto ExecutionCount = It.second;
531 if (ExecutionCount > 0) {
532 auto &
Block = AllBlocks[Pred];
533 auto &SuccBlock = AllBlocks[Succ];
534 AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
535 SuccBlock.InJumps.push_back(&AllJumps.back());
536 Block.OutJumps.push_back(&AllJumps.back());
539 for (
auto &Jump : AllJumps) {
540 assert(OutDegree[Jump.Source->Index] > 0);
541 Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
545 AllChains.reserve(NumNodes);
546 HotChains.reserve(NumNodes);
547 for (Block &Block : AllBlocks) {
548 AllChains.emplace_back(
Block.Index, &Block);
549 Block.CurChain = &AllChains.back();
550 if (
Block.ExecutionCount > 0) {
551 HotChains.push_back(&AllChains.back());
556 AllEdges.reserve(AllJumps.size());
557 for (Block &Block : AllBlocks) {
558 for (
auto &Jump :
Block.OutJumps) {
559 auto SuccBlock = Jump->Target;
560 ChainEdge *CurEdge =
Block.CurChain->getEdge(SuccBlock->CurChain);
562 if (CurEdge !=
nullptr) {
563 assert(SuccBlock->CurChain->getEdge(
Block.CurChain) !=
nullptr);
564 CurEdge->appendJump(Jump);
568 AllEdges.emplace_back(Jump);
569 Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
570 SuccBlock->CurChain->addEdge(
Block.CurChain, &AllEdges.back());
579 void mergeForcedPairs() {
581 for (
auto &Block : AllBlocks) {
582 if (SuccNodes[
Block.Index].size() == 1 &&
583 PredNodes[SuccNodes[
Block.Index][0]].size() == 1 &&
584 SuccNodes[
Block.Index][0] != 0) {
585 size_t SuccIndex = SuccNodes[
Block.Index][0];
586 Block.ForcedSucc = &AllBlocks[SuccIndex];
587 AllBlocks[SuccIndex].ForcedPred = &
Block;
597 for (
auto &Block : AllBlocks) {
598 if (
Block.ForcedSucc ==
nullptr ||
Block.ForcedPred ==
nullptr)
601 auto SuccBlock =
Block.ForcedSucc;
602 while (SuccBlock !=
nullptr && SuccBlock != &Block) {
603 SuccBlock = SuccBlock->ForcedSucc;
605 if (SuccBlock ==
nullptr)
608 AllBlocks[
Block.ForcedPred->Index].ForcedSucc =
nullptr;
609 Block.ForcedPred =
nullptr;
613 for (
auto &Block : AllBlocks) {
614 if (
Block.ForcedPred ==
nullptr &&
Block.ForcedSucc !=
nullptr) {
615 auto CurBlock = &
Block;
616 while (CurBlock->ForcedSucc !=
nullptr) {
617 const auto NextBlock = CurBlock->ForcedSucc;
618 mergeChains(
Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
619 CurBlock = NextBlock;
626 void mergeChainPairs() {
628 auto compareChainPairs = [](
const Chain *A1,
const Chain *B1,
629 const Chain *A2,
const Chain *B2) {
631 return A1->id() < A2->id();
632 return B1->id() < B2->id();
635 while (HotChains.size() > 1) {
636 Chain *BestChainPred =
nullptr;
637 Chain *BestChainSucc =
nullptr;
638 auto BestGain = MergeGainTy();
640 for (Chain *ChainPred : HotChains) {
642 for (
auto EdgeIter : ChainPred->edges()) {
643 Chain *ChainSucc = EdgeIter.first;
644 class ChainEdge *ChainEdge = EdgeIter.second;
646 if (ChainPred == ChainSucc)
650 if (ChainPred->numBlocks() + ChainSucc->numBlocks() >=
MaxChainSize)
654 MergeGainTy CurGain =
655 getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
656 if (CurGain.score() <= EPS)
659 if (BestGain < CurGain ||
660 (
std::abs(CurGain.score() - BestGain.score()) < EPS &&
661 compareChainPairs(ChainPred, ChainSucc, BestChainPred,
664 BestChainPred = ChainPred;
665 BestChainSucc = ChainSucc;
671 if (BestGain.score() <= EPS)
675 mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
676 BestGain.mergeType());
683 void mergeColdChains() {
684 for (
size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
687 size_t NumSuccs = SuccNodes[SrcBB].size();
688 for (
size_t Idx = 0; Idx < NumSuccs; Idx++) {
689 auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
690 auto SrcChain = AllBlocks[SrcBB].CurChain;
691 auto DstChain = AllBlocks[DstBB].CurChain;
692 if (SrcChain != DstChain && !DstChain->isEntry() &&
693 SrcChain->blocks().back()->Index == SrcBB &&
694 DstChain->blocks().front()->Index == DstBB &&
695 SrcChain->isCold() == DstChain->isCold()) {
696 mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
703 double extTSPScore(
const MergedChain &MergedBlocks,
704 const std::vector<Jump *> &Jumps)
const {
708 MergedBlocks.forEach([&](
const Block *
BB) {
709 BB->EstimatedAddr = CurAddr;
714 for (
auto &Jump : Jumps) {
715 const Block *SrcBlock = Jump->Source;
716 const Block *DstBlock = Jump->Target;
717 Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
718 DstBlock->EstimatedAddr, Jump->ExecutionCount,
719 Jump->IsConditional);
730 MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
731 ChainEdge *Edge)
const {
732 if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
733 return Edge->getCachedMergeGain(ChainPred, ChainSucc);
737 auto Jumps = Edge->jumps();
738 ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
739 if (EdgePP !=
nullptr) {
740 Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
742 assert(!Jumps.empty() &&
"trying to merge chains w/o jumps");
745 MergeGainTy Gain = MergeGainTy();
749 auto tryChainMerging = [&](
size_t Offset,
750 const std::vector<MergeTypeTy> &MergeTypes) {
752 if (
Offset == 0 ||
Offset == ChainPred->blocks().size())
755 auto BB = ChainPred->blocks()[
Offset - 1];
756 if (
BB->ForcedSucc !=
nullptr)
760 for (
const auto &MergeType : MergeTypes) {
761 Gain.updateIfLessThan(
762 computeMergeGain(ChainPred, ChainSucc, Jumps,
Offset, MergeType));
767 Gain.updateIfLessThan(
768 computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
772 for (
auto &Jump : ChainSucc->blocks().front()->InJumps) {
773 const auto SrcBlock = Jump->Source;
774 if (SrcBlock->CurChain != ChainPred)
776 size_t Offset = SrcBlock->CurIndex + 1;
777 tryChainMerging(
Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
781 for (
auto &Jump : ChainSucc->blocks().back()->OutJumps) {
782 const auto DstBlock = Jump->Source;
783 if (DstBlock->CurChain != ChainPred)
785 size_t Offset = DstBlock->CurIndex;
786 tryChainMerging(
Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
796 tryChainMerging(
Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
797 MergeTypeTy::X2_X1_Y});
800 Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
808 MergeGainTy computeMergeGain(
const Chain *ChainPred,
const Chain *ChainSucc,
809 const std::vector<Jump *> &Jumps,
811 MergeTypeTy MergeType)
const {
812 auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
813 MergeOffset, MergeType);
816 if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
817 !MergedBlocks.getFirstBlock()->isEntry())
818 return MergeGainTy();
821 auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
822 return MergeGainTy(NewGainScore, MergeOffset, MergeType);
830 MergedChain mergeBlocks(
const std::vector<Block *> &
X,
831 const std::vector<Block *> &
Y,
size_t MergeOffset,
832 MergeTypeTy MergeType)
const {
834 BlockIter BeginX1 =
X.begin();
835 BlockIter EndX1 =
X.begin() + MergeOffset;
836 BlockIter BeginX2 =
X.begin() + MergeOffset;
837 BlockIter EndX2 =
X.end();
838 BlockIter BeginY =
Y.begin();
839 BlockIter EndY =
Y.end();
843 case MergeTypeTy::X_Y:
844 return MergedChain(BeginX1, EndX2, BeginY, EndY);
845 case MergeTypeTy::X1_Y_X2:
846 return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
847 case MergeTypeTy::Y_X2_X1:
848 return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
849 case MergeTypeTy::X2_X1_Y:
850 return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
857 void mergeChains(Chain *Into, Chain *
From,
size_t MergeOffset,
858 MergeTypeTy MergeType) {
859 assert(Into !=
From &&
"a chain cannot be merged with itself");
862 MergedChain MergedBlocks =
863 mergeBlocks(Into->blocks(),
From->blocks(), MergeOffset, MergeType);
864 Into->merge(
From, MergedBlocks.getBlocks());
865 Into->mergeEdges(
From);
869 ChainEdge *SelfEdge = Into->getEdge(Into);
870 if (SelfEdge !=
nullptr) {
871 MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
872 Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
879 for (
auto EdgeIter : Into->edges()) {
880 EdgeIter.second->invalidateCache();
885 void concatChains(std::vector<uint64_t> &Order) {
887 std::vector<Chain *> SortedChains;
889 for (
auto &Chain : AllChains) {
890 if (!Chain.blocks().empty()) {
891 SortedChains.push_back(&Chain);
894 double ExecutionCount = 0;
895 for (
auto *Block : Chain.blocks()) {
896 Size +=
static_cast<double>(
Block->Size);
897 ExecutionCount +=
static_cast<double>(
Block->ExecutionCount);
900 ChainDensity[&Chain] = ExecutionCount /
Size;
906 [&](
const Chain *
C1,
const Chain *C2) {
909 if (C1->isEntry() != C2->isEntry()) {
910 return C1->isEntry();
913 const double D1 = ChainDensity[
C1];
914 const double D2 = ChainDensity[C2];
916 return (D1 != D2) ? (D1 > D2) : (
C1->id() < C2->id());
920 Order.reserve(NumNodes);
921 for (Chain *Chain : SortedChains) {
922 for (Block *Block : Chain->blocks()) {
923 Order.push_back(
Block->Index);
930 const size_t NumNodes;
933 std::vector<std::vector<uint64_t>> SuccNodes;
936 std::vector<std::vector<uint64_t>> PredNodes;
939 std::vector<Block> AllBlocks;
942 std::vector<Jump> AllJumps;
945 std::vector<Chain> AllChains;
948 std::vector<ChainEdge> AllEdges;
951 std::vector<Chain *> HotChains;
957 const std::vector<uint64_t> &NodeSizes,
958 const std::vector<uint64_t> &NodeCounts,
959 const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
960 size_t NumNodes = NodeSizes.size();
963 assert(NodeCounts.size() == NodeSizes.size() &&
"Incorrect input");
964 assert(NumNodes > 2 &&
"Incorrect input");
967 auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
968 std::vector<uint64_t>
Result;
972 assert(
Result.front() == 0 &&
"Original entry point is not preserved");
973 assert(
Result.size() == NumNodes &&
"Incorrect size of reordered layout");
978 const std::vector<uint64_t> &Order,
const std::vector<uint64_t> &NodeSizes,
979 const std::vector<uint64_t> &NodeCounts,
980 const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
982 std::vector<uint64_t>
Addr(NodeSizes.size(), 0);
983 for (
size_t Idx = 1; Idx < Order.size(); Idx++) {
984 Addr[Order[Idx]] =
Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
986 std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
987 for (
auto It : EdgeCounts) {
988 auto Pred = It.first.first;
994 for (
auto It : EdgeCounts) {
995 auto Pred = It.first.first;
996 auto Succ = It.first.second;
998 bool IsConditional = OutDegree[Pred] > 1;
999 Score += ::extTSPScore(
Addr[Pred], NodeSizes[Pred],
Addr[Succ], Count,
1006 const std::vector<uint64_t> &NodeSizes,
1007 const std::vector<uint64_t> &NodeCounts,
1008 const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
1009 std::vector<uint64_t> Order(NodeSizes.size());
1010 for (
size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {