54 #define DEBUG_TYPE "loop-interchange"
56 STATISTIC(LoopsInterchanged,
"Number of loops interchanged");
60 cl::desc(
"Interchange if you gain more than this number"));
67 using CharMatrix = std::vector<std::vector<char>>;
77 #ifdef DUMP_DEP_MATRICIES
78 static void printDepMatrix(CharMatrix &DepMatrix) {
79 for (
auto &Row : DepMatrix) {
97 if (!isa<Instruction>(
I))
99 if (
auto *Ld = dyn_cast<LoadInst>(&
I)) {
102 MemInstr.push_back(&
I);
103 }
else if (
auto *St = dyn_cast<StoreInst>(&
I)) {
106 MemInstr.push_back(&
I);
112 <<
" Loads and Stores to analyze\n");
114 ValueVector::iterator
I,
IE, J, JE;
116 for (
I = MemInstr.begin(),
IE = MemInstr.end();
I !=
IE; ++
I) {
117 for (J =
I, JE = MemInstr.end(); J != JE; ++J) {
118 std::vector<char> Dep;
122 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
125 if (
auto D = DI->
depends(Src, Dst,
true)) {
126 assert(
D->isOrdered() &&
"Expected an output, flow or anti dep.");
128 D->isFlow() ?
"flow" :
D->isAnti() ?
"anti" :
"output";
129 dbgs() <<
"Found " << DepType
130 <<
" dependency between Src and Dst\n"
131 <<
" Src:" << *Src <<
"\n Dst:" << *Dst <<
'\n');
132 unsigned Levels =
D->getLevels();
134 for (
unsigned II = 1; II <= Levels; ++II) {
135 const SCEV *Distance =
D->getDistance(II);
137 dyn_cast_or_null<SCEVConstant>(Distance);
147 }
else if (
D->isScalar(II)) {
151 unsigned Dir =
D->getDirection(II);
165 while (Dep.size() !=
Level) {
169 DepMatrix.push_back(Dep);
172 <<
" dependencies inside loop\n");
186 for (
unsigned I = 0,
E = DepMatrix.size();
I <
E; ++
I)
187 std::swap(DepMatrix[
I][ToIndx], DepMatrix[
I][FromIndx]);
194 for (
unsigned i = 0;
i <= Column; ++
i) {
195 if (DepMatrix[Row][
i] ==
'<')
197 if (DepMatrix[Row][
i] ==
'>')
207 for (
unsigned i = 0;
i < Column; ++
i) {
208 if (DepMatrix[Row][
i] !=
'=' && DepMatrix[Row][
i] !=
'S' &&
209 DepMatrix[Row][
i] !=
'I')
216 unsigned OuterLoopId,
char InnerDep,
221 if (InnerDep == OuterDep)
227 if (InnerDep ==
'=' || InnerDep ==
'S' || InnerDep ==
'I')
233 if (InnerDep ==
'>') {
236 if (OuterLoopId == 0)
254 unsigned InnerLoopId,
255 unsigned OuterLoopId) {
256 unsigned NumRows = DepMatrix.size();
258 for (
unsigned Row = 0; Row < NumRows; ++Row) {
259 char InnerDep = DepMatrix[Row][InnerLoopId];
260 char OuterDep = DepMatrix[Row][OuterLoopId];
261 if (InnerDep ==
'*' || OuterDep ==
'*')
273 assert(LoopList.empty() &&
"LoopList should initially be empty!");
274 Loop *CurrentLoop = &L;
275 const std::vector<Loop *> *Vec = &CurrentLoop->
getSubLoops();
276 while (!Vec->empty()) {
280 if (Vec->size() != 1) {
285 LoopList.push_back(CurrentLoop);
286 CurrentLoop = Vec->front();
289 LoopList.push_back(CurrentLoop);
296 class LoopInterchangeLegality {
300 : OuterLoop(
Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
303 bool canInterchangeLoops(
unsigned InnerLoopId,
unsigned OuterLoopId,
304 CharMatrix &DepMatrix);
312 bool isLoopStructureUnderstood();
314 bool currentLimitations();
317 return OuterInnerReductions;
321 return InnerLoopInductions;
325 bool tightlyNested(
Loop *Outer,
Loop *Inner);
332 bool findInductionAndReductions(
Loop *L,
354 class LoopInterchangeProfitability {
358 : OuterLoop(
Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
361 bool isProfitable(
unsigned InnerLoopId,
unsigned OuterLoopId,
362 CharMatrix &DepMatrix);
365 int getInstrOrderCost();
378 class LoopInterchangeTransform {
382 const LoopInterchangeLegality &LIL)
383 : OuterLoop(
Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
387 void restructureLoops(
Loop *NewInner,
Loop *NewOuter,
390 void removeChildLoop(
Loop *OuterLoop,
Loop *InnerLoop);
393 bool adjustLoopLinks();
394 bool adjustLoopBranches();
405 const LoopInterchangeLegality &LIL;
408 struct LoopInterchange {
419 : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
426 return processLoopList(LoopList);
431 for (
unsigned I = 1;
I < LoopList.size(); ++
I)
432 if (LoopList[
I]->getParentLoop() != LoopList[
I - 1])
434 return processLoopList(LoopList);
438 for (
Loop *L : LoopList) {
440 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
459 return LoopList.
size() - 1;
463 bool Changed =
false;
464 unsigned LoopNestDepth = LoopList.size();
465 if (LoopNestDepth < 2) {
466 LLVM_DEBUG(
dbgs() <<
"Loop doesn't contain minimum nesting level.\n");
474 if (!isComputableLoopNest(LoopList)) {
475 LLVM_DEBUG(
dbgs() <<
"Not valid loop candidate for interchange\n");
479 LLVM_DEBUG(
dbgs() <<
"Processing LoopList of size = " << LoopNestDepth
482 CharMatrix DependencyMatrix;
483 Loop *OuterMostLoop = *(LoopList.begin());
485 OuterMostLoop, DI)) {
489 #ifdef DUMP_DEP_MATRICIES
491 printDepMatrix(DependencyMatrix);
501 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
506 for (
unsigned j = SelecLoopId;
j > 0;
j--) {
507 bool ChangedPerIter =
false;
508 for (
unsigned i = SelecLoopId;
i > SelecLoopId -
j;
i--) {
509 bool Interchanged = processLoop(LoopList[
i], LoopList[
i - 1],
i,
i - 1,
517 #ifdef DUMP_DEP_MATRICIES
519 printDepMatrix(DependencyMatrix);
521 ChangedPerIter |= Interchanged;
522 Changed |= Interchanged;
532 bool processLoop(
Loop *InnerLoop,
Loop *OuterLoop,
unsigned InnerLoopId,
533 unsigned OuterLoopId,
534 std::vector<std::vector<char>> &DependencyMatrix) {
536 <<
" and OuterLoopId = " << OuterLoopId <<
"\n");
537 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
538 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
539 LLVM_DEBUG(
dbgs() <<
"Not interchanging loops. Cannot prove legality.\n");
543 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
544 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
553 <<
"Loop interchanged with enclosing loop.";
556 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
562 "Inner loop not left in LCSSA form after loop interchange!");
564 "Outer loop not left in LCSSA form after loop interchange!");
572 bool LoopInterchangeLegality::containsUnsafeInstructions(
BasicBlock *
BB) {
574 return I.mayHaveSideEffects() ||
I.mayReadFromMemory();
578 bool LoopInterchangeLegality::tightlyNested(
Loop *OuterLoop,
Loop *InnerLoop) {
590 if (!OuterLoopHeaderBI)
594 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->
getHeader() &&
595 Succ != OuterLoopLatch)
598 LLVM_DEBUG(
dbgs() <<
"Checking instructions in Loop header and Loop latch\n");
601 if (containsUnsafeInstructions(OuterLoopHeader) ||
602 containsUnsafeInstructions(OuterLoopLatch))
608 if (InnerLoopPreHeader != OuterLoopHeader &&
609 containsUnsafeInstructions(InnerLoopPreHeader))
617 if (&SuccInner != OuterLoopLatch) {
619 <<
" does not lead to the outer loop latch.\n";);
625 if (containsUnsafeInstructions(InnerLoopExit))
633 bool LoopInterchangeLegality::isLoopStructureUnderstood() {
635 for (
PHINode *InnerInduction : InnerLoopInductions) {
636 unsigned Num = InnerInduction->getNumOperands();
637 for (
unsigned i = 0;
i < Num; ++
i) {
638 Value *Val = InnerInduction->getOperand(
i);
639 if (isa<Constant>(Val))
648 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
649 InnerLoopPreheader &&
669 Value *Op0 = InnerLoopCmp->getOperand(0);
670 Value *Op1 = InnerLoopCmp->getOperand(1);
682 IsPathToInnerIndVar = [
this, &IsPathToInnerIndVar](
const Value *V) ->
bool {
685 if (isa<Constant>(V))
690 if (isa<CastInst>(
I))
691 return IsPathToInnerIndVar(
I->getOperand(0));
692 if (isa<BinaryOperator>(
I))
693 return IsPathToInnerIndVar(
I->getOperand(0)) &&
694 IsPathToInnerIndVar(
I->getOperand(1));
700 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
705 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
708 }
else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
727 PHINode *PHI = dyn_cast<PHINode>(SV);
739 if (isa<Constant>(V))
744 if (PHI->getNumIncomingValues() == 1)
760 bool LoopInterchangeLegality::findInductionAndReductions(
768 Inductions.push_back(&PHI);
773 if (!OuterInnerReductions.count(&PHI)) {
775 "across the outer loop.\n");
779 assert(PHI.getNumIncomingValues() == 2 &&
780 "Phis in loop header should have exactly 2 incoming values");
789 <<
"Failed to recognize PHI as an induction or reduction.\n");
792 OuterInnerReductions.insert(&PHI);
793 OuterInnerReductions.insert(InnerRedPhi);
802 bool LoopInterchangeLegality::currentLimitations() {
812 dbgs() <<
"Loops where the latch is not the exiting block are not"
813 <<
" supported currently.\n");
818 <<
"Loops where the latch is not the exiting block cannot be"
819 " interchange currently.";
825 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
827 dbgs() <<
"Only outer loops with induction or reduction PHI nodes "
828 <<
"are supported currently.\n");
833 <<
"Only outer loops with induction or reduction PHI nodes can be"
834 " interchanged currently.";
840 if (!findInductionAndReductions(InnerLoop, Inductions,
nullptr)) {
842 dbgs() <<
"Only inner loops with induction or reduction PHI nodes "
843 <<
"are supported currently.\n");
848 <<
"Only inner loops with induction or reduction PHI nodes can be"
849 " interchange currently.";
855 if (!isLoopStructureUnderstood()) {
861 <<
"Inner loop structure not understood currently.";
869 bool LoopInterchangeLegality::findInductions(
874 Inductions.push_back(&PHI);
876 return !Inductions.empty();
888 if (PHI.getNumIncomingValues() > 1)
890 if (
any_of(PHI.users(), [&Reductions, OuterL](
User *U) {
891 PHINode *PN = dyn_cast<PHINode>(U);
893 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
911 for (
unsigned i = 0;
i < PHI.getNumIncomingValues();
i++) {
912 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(
i));
958 for (
auto *U : PHI.users()) {
967 bool LoopInterchangeLegality::canInterchangeLoops(
unsigned InnerLoopId,
968 unsigned OuterLoopId,
969 CharMatrix &DepMatrix) {
971 LLVM_DEBUG(
dbgs() <<
"Failed interchange InnerLoopId = " << InnerLoopId
972 <<
" and OuterLoopId = " << OuterLoopId
973 <<
" due to dependence\n");
978 <<
"Cannot interchange loops due to dependences.";
983 for (
auto *
BB : OuterLoop->
blocks())
985 if (
CallInst *CI = dyn_cast<CallInst>(&
I)) {
987 if (CI->onlyWritesMemory())
990 dbgs() <<
"Loops with call instructions cannot be interchanged "
996 <<
"Cannot interchange loops due to call instruction.";
1002 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1003 LLVM_DEBUG(
dbgs() <<
"Cound not find inner loop induction variables.\n");
1008 LLVM_DEBUG(
dbgs() <<
"Found unsupported PHI nodes in inner loop latch.\n");
1013 <<
"Cannot interchange loops because unsupported PHI nodes found "
1014 "in inner loop latch.";
1021 if (currentLimitations()) {
1022 LLVM_DEBUG(
dbgs() <<
"Not legal because of current transform limitation\n");
1027 if (!tightlyNested(OuterLoop, InnerLoop)) {
1033 <<
"Cannot interchange loops because they are not tightly "
1040 OuterInnerReductions)) {
1041 LLVM_DEBUG(
dbgs() <<
"Found unsupported PHI nodes in inner loop exit.\n");
1046 <<
"Found unsupported PHI node in loop exit.";
1052 LLVM_DEBUG(
dbgs() <<
"Found unsupported PHI nodes in outer loop exit.\n");
1057 <<
"Found unsupported PHI node in loop exit.";
1065 int LoopInterchangeProfitability::getInstrOrderCost() {
1066 unsigned GoodOrder, BadOrder;
1067 BadOrder = GoodOrder = 0;
1071 unsigned NumOp =
GEP->getNumOperands();
1072 bool FoundInnerInduction =
false;
1073 bool FoundOuterInduction =
false;
1074 for (
unsigned i = 0;
i < NumOp; ++
i) {
1089 if (AR->
getLoop() == InnerLoop) {
1092 FoundInnerInduction =
true;
1093 if (FoundOuterInduction) {
1103 if (AR->
getLoop() == OuterLoop) {
1106 FoundOuterInduction =
true;
1107 if (FoundInnerInduction) {
1116 return GoodOrder - BadOrder;
1120 unsigned OuterLoopId,
1121 CharMatrix &DepMatrix) {
1125 for (
auto &Row : DepMatrix) {
1126 if (Row[InnerLoopId] !=
'S' && Row[InnerLoopId] !=
'I')
1129 if (Row[OuterLoopId] !=
'=')
1135 return !DepMatrix.empty();
1138 bool LoopInterchangeProfitability::isProfitable(
unsigned InnerLoopId,
1139 unsigned OuterLoopId,
1140 CharMatrix &DepMatrix) {
1149 int Cost = getInstrOrderCost();
1163 <<
"Interchanging loops is too costly (cost="
1164 <<
ore::NV(
"Cost", Cost) <<
", threshold="
1166 <<
") and it does not improve parallelism.";
1171 void LoopInterchangeTransform::removeChildLoop(
Loop *OuterLoop,
1173 for (
Loop *L : *OuterLoop)
1174 if (L == InnerLoop) {
1175 OuterLoop->removeChildLoop(L);
1204 void LoopInterchangeTransform::restructureLoops(
1214 if (OuterLoopParent) {
1216 removeChildLoop(OuterLoopParent, NewInner);
1217 removeChildLoop(NewInner, NewOuter);
1220 removeChildLoop(NewInner, NewOuter);
1245 if (
BB == OuterHeader ||
BB == OuterLatch)
1262 bool Transformed =
false;
1267 auto &InductionPHIs = LIL.getInnerLoopInductions();
1268 if (InductionPHIs.empty()) {
1269 LLVM_DEBUG(
dbgs() <<
"Failed to find the point to split loop latch \n");
1274 for (
PHINode *CurInductionPHI : InductionPHIs) {
1275 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1276 InnerIndexVarList.push_back(
1277 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1279 InnerIndexVarList.push_back(
1280 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1293 auto MoveInstructions = [&
i, &WorkList,
this, &InductionPHIs, NewLatch]() {
1294 for (;
i < WorkList.size();
i++) {
1300 "Moving instructions with side-effects may change behavior of "
1303 Instruction *UserI = cast<Instruction>(U.getUser());
1311 for (
Value *
Op : WorkList[
i]->operands()) {
1317 WorkList.insert(OpI);
1327 WorkList.insert(CondI);
1329 for (
Instruction *InnerIndexVar : InnerIndexVarList)
1330 WorkList.insert(cast<Instruction>(InnerIndexVar));
1345 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1346 if (InnerLoopPreHeader != OuterLoopHeader) {
1350 std::prev(InnerLoopPreHeader->
end()))))
1354 Transformed |= adjustLoopLinks();
1369 ToList.splice(InsertBefore->
getIterator(), FromList, FromList.begin(),
1380 I->removeFromParent();
1395 std::vector<DominatorTree::UpdateType> &DTUpdates,
1396 bool MustUpdateOnce =
true) {
1397 assert((!MustUpdateOnce ||
1401 }) == 1) &&
"BI must jump to OldBB exactly once.");
1402 bool Changed =
false;
1410 DTUpdates.push_back(
1412 DTUpdates.push_back(
1415 assert(Changed &&
"Expected a successor to be updated");
1432 assert(
P.getNumIncomingValues() == 1 &&
1433 "Only loops with a single exit are supported!");
1436 auto IncI = cast<Instruction>(
P.getIncomingValueForBlock(InnerLatch));
1439 auto IncIInnerMost = cast<Instruction>(
followLCSSA(IncI));
1442 if (IncIInnerMost->getParent() != InnerLatch &&
1443 IncIInnerMost->
getParent() != InnerHeader)
1447 [OuterHeader, OuterExit, IncI, InnerHeader](
User *U) {
1448 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1449 IncI->getParent() == InnerHeader) ||
1450 cast<PHINode>(U)->getParent() == OuterExit;
1452 "Can only replace phis iff the uses are in the loop nest exit or "
1453 "the incoming value is defined in the inner header (it will "
1454 "dominate all loop blocks after interchanging)");
1455 P.replaceAllUsesWith(IncI);
1456 P.eraseFromParent();
1461 LcssaInnerExit.push_back(&
P);
1465 LcssaInnerLatch.push_back(&
P);
1486 if (
P.getNumIncomingValues() != 1)
1490 auto I = dyn_cast<Instruction>(
P.getIncomingValue(0));
1494 PHINode *NewPhi = dyn_cast<PHINode>(
P.clone());
1500 if (Pred == OuterLatch)
1505 P.setIncomingValue(0, NewPhi);
1515 bool LoopInterchangeTransform::adjustLoopBranches() {
1517 std::vector<DominatorTree::UpdateType> DTUpdates;
1519 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1522 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1523 InnerLoopPreHeader != InnerLoop->
getHeader() && OuterLoopPreHeader &&
1524 InnerLoopPreHeader &&
"Guaranteed by loop-simplify form");
1529 if (isa<PHINode>(OuterLoopPreHeader->
begin()) ||
1531 OuterLoopPreHeader =
1533 if (InnerLoopPreHeader == OuterLoop->getHeader())
1534 InnerLoopPreHeader =
1539 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1541 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1557 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1558 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1563 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->
getTerminator());
1565 dyn_cast<BranchInst>(OuterLoopPredecessor->
getTerminator());
1567 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1570 if (!InnerLoopHeaderSuccessor)
1578 InnerLoopPreHeader, DTUpdates,
false);
1588 InnerLoopHeaderSuccessor, DTUpdates,
1596 OuterLoopPreHeader, DTUpdates);
1599 if (InnerLoopLatchBI->
getSuccessor(0) == InnerLoopHeader)
1600 InnerLoopLatchSuccessor = InnerLoopLatchBI->
getSuccessor(1);
1602 InnerLoopLatchSuccessor = InnerLoopLatchBI->
getSuccessor(0);
1605 InnerLoopLatchSuccessor, DTUpdates);
1607 if (OuterLoopLatchBI->
getSuccessor(0) == OuterLoopHeader)
1608 OuterLoopLatchSuccessor = OuterLoopLatchBI->
getSuccessor(1);
1610 OuterLoopLatchSuccessor = OuterLoopLatchBI->
getSuccessor(0);
1613 OuterLoopLatchSuccessor, DTUpdates);
1614 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1618 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1619 OuterLoopPreHeader);
1621 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1622 OuterLoopHeader, OuterLoopLatch, InnerLoop->
getExitBlock(),
1628 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1632 if (OuterInnerReductions.contains(&PHI))
1633 InnerLoopPHIs.push_back(&PHI);
1636 if (OuterInnerReductions.contains(&PHI))
1637 OuterLoopPHIs.push_back(&PHI);
1642 for (
PHINode *PHI : OuterLoopPHIs) {
1643 LLVM_DEBUG(
dbgs() <<
"Outer loop reduction PHIs:\n"; PHI->dump(););
1645 assert(OuterInnerReductions.count(PHI) &&
"Expected a reduction PHI node");
1647 for (
PHINode *PHI : InnerLoopPHIs) {
1648 LLVM_DEBUG(
dbgs() <<
"Inner loop reduction PHIs:\n"; PHI->dump(););
1650 assert(OuterInnerReductions.count(PHI) &&
"Expected a reduction PHI node");
1667 MayNeedLCSSAPhis.push_back(&
I);
1673 bool LoopInterchangeTransform::adjustLoopLinks() {
1675 bool Changed = adjustLoopBranches();
1680 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1689 struct LoopInterchangeLegacyPass :
public LoopPass {
1707 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1708 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1709 auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1710 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1711 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1713 return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
1721 "Interchanges loops for cache reuse",
false,
false)
1730 return new LoopInterchangeLegacyPass();
1741 if (!LoopInterchange(&AR.
SE, &AR.
LI, &DI, &AR.
DT, &ORE).run(LN))