LLVM 17.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/BasicBlock.h"
29#include "llvm/IR/Constants.h"
31#include "llvm/IR/Dominators.h"
32#include "llvm/IR/Function.h"
33#include "llvm/IR/IRBuilder.h"
34#include "llvm/IR/InstrTypes.h"
35#include "llvm/IR/Instruction.h"
37#include "llvm/IR/User.h"
38#include "llvm/IR/Value.h"
40#include "llvm/Pass.h"
43#include "llvm/Support/Debug.h"
50#include <cassert>
51#include <utility>
52#include <vector>
53
54using namespace llvm;
55
56#define DEBUG_TYPE "loop-interchange"
57
58STATISTIC(LoopsInterchanged, "Number of loops interchanged");
59
61 "loop-interchange-threshold", cl::init(0), cl::Hidden,
62 cl::desc("Interchange if you gain more than this number"));
63
64namespace {
65
66using LoopVector = SmallVector<Loop *, 8>;
67
68// TODO: Check if we can use a sparse matrix here.
69using CharMatrix = std::vector<std::vector<char>>;
70
71} // end anonymous namespace
72
73// Maximum number of dependencies that can be handled in the dependency matrix.
74static const unsigned MaxMemInstrCount = 100;
75
76// Maximum loop depth supported.
77static const unsigned MaxLoopNestDepth = 10;
78
79#ifdef DUMP_DEP_MATRICIES
80static void printDepMatrix(CharMatrix &DepMatrix) {
81 for (auto &Row : DepMatrix) {
82 for (auto D : Row)
83 LLVM_DEBUG(dbgs() << D << " ");
84 LLVM_DEBUG(dbgs() << "\n");
85 }
86}
87#endif
88
89static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
90 Loop *L, DependenceInfo *DI,
91 ScalarEvolution *SE) {
92 using ValueVector = SmallVector<Value *, 16>;
93
94 ValueVector MemInstr;
95
96 // For each block.
97 for (BasicBlock *BB : L->blocks()) {
98 // Scan the BB and collect legal loads and stores.
99 for (Instruction &I : *BB) {
100 if (!isa<Instruction>(I))
101 return false;
102 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
103 if (!Ld->isSimple())
104 return false;
105 MemInstr.push_back(&I);
106 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
107 if (!St->isSimple())
108 return false;
109 MemInstr.push_back(&I);
110 }
111 }
112 }
113
114 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
115 << " Loads and Stores to analyze\n");
116
117 ValueVector::iterator I, IE, J, JE;
118
119 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
120 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
121 std::vector<char> Dep;
122 Instruction *Src = cast<Instruction>(*I);
123 Instruction *Dst = cast<Instruction>(*J);
124 // Ignore Input dependencies.
125 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
126 continue;
127 // Track Output, Flow, and Anti dependencies.
128 if (auto D = DI->depends(Src, Dst, true)) {
129 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
130 // If the direction vector is negative, normalize it to
131 // make it non-negative.
132 if (D->normalize(SE))
133 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
134 LLVM_DEBUG(StringRef DepType =
135 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
136 dbgs() << "Found " << DepType
137 << " dependency between Src and Dst\n"
138 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
139 unsigned Levels = D->getLevels();
140 char Direction;
141 for (unsigned II = 1; II <= Levels; ++II) {
142 if (D->isScalar(II)) {
143 Direction = 'S';
144 Dep.push_back(Direction);
145 } else {
146 unsigned Dir = D->getDirection(II);
147 if (Dir == Dependence::DVEntry::LT ||
149 Direction = '<';
150 else if (Dir == Dependence::DVEntry::GT ||
152 Direction = '>';
153 else if (Dir == Dependence::DVEntry::EQ)
154 Direction = '=';
155 else
156 Direction = '*';
157 Dep.push_back(Direction);
158 }
159 }
160 while (Dep.size() != Level) {
161 Dep.push_back('I');
162 }
163
164 DepMatrix.push_back(Dep);
165 if (DepMatrix.size() > MaxMemInstrCount) {
166 LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
167 << " dependencies inside loop\n");
168 return false;
169 }
170 }
171 }
172 }
173
174 return true;
175}
176
177// A loop is moved from index 'from' to an index 'to'. Update the Dependence
178// matrix by exchanging the two columns.
179static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
180 unsigned ToIndx) {
181 for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
182 std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
183}
184
185// After interchanging, check if the direction vector is valid.
186// [Theorem] A permutation of the loops in a perfect nest is legal if and only
187// if the direction matrix, after the same permutation is applied to its
188// columns, has no ">" direction as the leftmost non-"=" direction in any row.
189static bool isLexicographicallyPositive(std::vector<char> &DV) {
190 for (unsigned Level = 0; Level < DV.size(); ++Level) {
191 unsigned char Direction = DV[Level];
192 if (Direction == '<')
193 return true;
194 if (Direction == '>' || Direction == '*')
195 return false;
196 }
197 return true;
198}
199
200// Checks if it is legal to interchange 2 loops.
201static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
202 unsigned InnerLoopId,
203 unsigned OuterLoopId) {
204 unsigned NumRows = DepMatrix.size();
205 std::vector<char> Cur;
206 // For each row check if it is valid to interchange.
207 for (unsigned Row = 0; Row < NumRows; ++Row) {
208 // Create temporary DepVector check its lexicographical order
209 // before and after swapping OuterLoop vs InnerLoop
210 Cur = DepMatrix[Row];
212 return false;
213 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
215 return false;
216 }
217 return true;
218}
219
220static void populateWorklist(Loop &L, LoopVector &LoopList) {
221 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
222 << L.getHeader()->getParent()->getName() << " Loop: %"
223 << L.getHeader()->getName() << '\n');
224 assert(LoopList.empty() && "LoopList should initially be empty!");
225 Loop *CurrentLoop = &L;
226 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
227 while (!Vec->empty()) {
228 // The current loop has multiple subloops in it hence it is not tightly
229 // nested.
230 // Discard all loops above it added into Worklist.
231 if (Vec->size() != 1) {
232 LoopList = {};
233 return;
234 }
235
236 LoopList.push_back(CurrentLoop);
237 CurrentLoop = Vec->front();
238 Vec = &CurrentLoop->getSubLoops();
239 }
240 LoopList.push_back(CurrentLoop);
241}
242
243namespace {
244
245/// LoopInterchangeLegality checks if it is legal to interchange the loop.
246class LoopInterchangeLegality {
247public:
248 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
250 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
251
252 /// Check if the loops can be interchanged.
253 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
254 CharMatrix &DepMatrix);
255
256 /// Discover induction PHIs in the header of \p L. Induction
257 /// PHIs are added to \p Inductions.
258 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
259
260 /// Check if the loop structure is understood. We do not handle triangular
261 /// loops for now.
262 bool isLoopStructureUnderstood();
263
264 bool currentLimitations();
265
266 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
267 return OuterInnerReductions;
268 }
269
270 const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
271 return InnerLoopInductions;
272 }
273
274private:
275 bool tightlyNested(Loop *Outer, Loop *Inner);
276 bool containsUnsafeInstructions(BasicBlock *BB);
277
278 /// Discover induction and reduction PHIs in the header of \p L. Induction
279 /// PHIs are added to \p Inductions, reductions are added to
280 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
281 /// to be passed as \p InnerLoop.
282 bool findInductionAndReductions(Loop *L,
283 SmallVector<PHINode *, 8> &Inductions,
284 Loop *InnerLoop);
285
286 Loop *OuterLoop;
287 Loop *InnerLoop;
288
289 ScalarEvolution *SE;
290
291 /// Interface to emit optimization remarks.
293
294 /// Set of reduction PHIs taking part of a reduction across the inner and
295 /// outer loop.
296 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
297
298 /// Set of inner loop induction PHIs
299 SmallVector<PHINode *, 8> InnerLoopInductions;
300};
301
302/// LoopInterchangeProfitability checks if it is profitable to interchange the
303/// loop.
304class LoopInterchangeProfitability {
305public:
306 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
308 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
309
310 /// Check if the loop interchange is profitable.
311 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
312 unsigned InnerLoopId, unsigned OuterLoopId,
313 CharMatrix &DepMatrix,
315 std::unique_ptr<CacheCost> &CC);
316
317private:
318 int getInstrOrderCost();
319 std::optional<bool> isProfitablePerLoopCacheAnalysis(
321 std::unique_ptr<CacheCost> &CC);
322 std::optional<bool> isProfitablePerInstrOrderCost();
323 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
324 unsigned OuterLoopId,
325 CharMatrix &DepMatrix);
326 Loop *OuterLoop;
327 Loop *InnerLoop;
328
329 /// Scev analysis.
330 ScalarEvolution *SE;
331
332 /// Interface to emit optimization remarks.
334};
335
336/// LoopInterchangeTransform interchanges the loop.
337class LoopInterchangeTransform {
338public:
339 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
340 LoopInfo *LI, DominatorTree *DT,
341 const LoopInterchangeLegality &LIL)
342 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
343
344 /// Interchange OuterLoop and InnerLoop.
345 bool transform();
346 void restructureLoops(Loop *NewInner, Loop *NewOuter,
347 BasicBlock *OrigInnerPreHeader,
348 BasicBlock *OrigOuterPreHeader);
349 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
350
351private:
352 bool adjustLoopLinks();
353 bool adjustLoopBranches();
354
355 Loop *OuterLoop;
356 Loop *InnerLoop;
357
358 /// Scev analysis.
359 ScalarEvolution *SE;
360
361 LoopInfo *LI;
362 DominatorTree *DT;
363
364 const LoopInterchangeLegality &LIL;
365};
366
367struct LoopInterchange {
368 ScalarEvolution *SE = nullptr;
369 LoopInfo *LI = nullptr;
370 DependenceInfo *DI = nullptr;
371 DominatorTree *DT = nullptr;
372 std::unique_ptr<CacheCost> CC = nullptr;
373
374 /// Interface to emit optimization remarks.
376
377 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
378 DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
380 : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
381
382 bool run(Loop *L) {
383 if (L->getParentLoop())
384 return false;
385 SmallVector<Loop *, 8> LoopList;
386 populateWorklist(*L, LoopList);
387 return processLoopList(LoopList);
388 }
389
390 bool run(LoopNest &LN) {
391 SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end());
392 for (unsigned I = 1; I < LoopList.size(); ++I)
393 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
394 return false;
395 return processLoopList(LoopList);
396 }
397
398 bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
399 for (Loop *L : LoopList) {
400 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
401 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
402 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
403 return false;
404 }
405 if (L->getNumBackEdges() != 1) {
406 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
407 return false;
408 }
409 if (!L->getExitingBlock()) {
410 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
411 return false;
412 }
413 }
414 return true;
415 }
416
417 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
418 // TODO: Add a better heuristic to select the loop to be interchanged based
419 // on the dependence matrix. Currently we select the innermost loop.
420 return LoopList.size() - 1;
421 }
422
423 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
424 bool Changed = false;
425 unsigned LoopNestDepth = LoopList.size();
426 if (LoopNestDepth < 2) {
427 LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
428 return false;
429 }
430 if (LoopNestDepth > MaxLoopNestDepth) {
431 LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
432 << MaxLoopNestDepth << "\n");
433 return false;
434 }
435 if (!isComputableLoopNest(LoopList)) {
436 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
437 return false;
438 }
439
440 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
441 << "\n");
442
443 CharMatrix DependencyMatrix;
444 Loop *OuterMostLoop = *(LoopList.begin());
445 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
446 OuterMostLoop, DI, SE)) {
447 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
448 return false;
449 }
450#ifdef DUMP_DEP_MATRICIES
451 LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
452 printDepMatrix(DependencyMatrix);
453#endif
454
455 // Get the Outermost loop exit.
456 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
457 if (!LoopNestExit) {
458 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
459 return false;
460 }
461
462 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
463 // Obtain the loop vector returned from loop cache analysis beforehand,
464 // and put each <Loop, index> pair into a map for constant time query
465 // later. Indices in loop vector reprsent the optimal order of the
466 // corresponding loop, e.g., given a loopnest with depth N, index 0
467 // indicates the loop should be placed as the outermost loop and index N
468 // indicates the loop should be placed as the innermost loop.
469 //
470 // For the old pass manager CacheCost would be null.
472 if (CC != nullptr) {
473 const auto &LoopCosts = CC->getLoopCosts();
474 for (unsigned i = 0; i < LoopCosts.size(); i++) {
475 CostMap[LoopCosts[i].first] = i;
476 }
477 }
478 // We try to achieve the globally optimal memory access for the loopnest,
479 // and do interchange based on a bubble-sort fasion. We start from
480 // the innermost loop, move it outwards to the best possible position
481 // and repeat this process.
482 for (unsigned j = SelecLoopId; j > 0; j--) {
483 bool ChangedPerIter = false;
484 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
485 bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
486 DependencyMatrix, CostMap);
487 if (!Interchanged)
488 continue;
489 // Loops interchanged, update LoopList accordingly.
490 std::swap(LoopList[i - 1], LoopList[i]);
491 // Update the DependencyMatrix
492 interChangeDependencies(DependencyMatrix, i, i - 1);
493#ifdef DUMP_DEP_MATRICIES
494 LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
495 printDepMatrix(DependencyMatrix);
496#endif
497 ChangedPerIter |= Interchanged;
498 Changed |= Interchanged;
499 }
500 // Early abort if there was no interchange during an entire round of
501 // moving loops outwards.
502 if (!ChangedPerIter)
503 break;
504 }
505 return Changed;
506 }
507
508 bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
509 unsigned OuterLoopId,
510 std::vector<std::vector<char>> &DependencyMatrix,
511 const DenseMap<const Loop *, unsigned> &CostMap) {
512 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
513 << " and OuterLoopId = " << OuterLoopId << "\n");
514 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
515 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
516 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
517 return false;
518 }
519 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
520 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
521 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
522 DependencyMatrix, CostMap, CC)) {
523 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
524 return false;
525 }
526
527 ORE->emit([&]() {
528 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
529 InnerLoop->getStartLoc(),
530 InnerLoop->getHeader())
531 << "Loop interchanged with enclosing loop.";
532 });
533
534 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
535 LIT.transform();
536 LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
537 LoopsInterchanged++;
538
539 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
540 return true;
541 }
542};
543
544} // end anonymous namespace
545
546bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
547 return any_of(*BB, [](const Instruction &I) {
548 return I.mayHaveSideEffects() || I.mayReadFromMemory();
549 });
550}
551
552bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
553 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
554 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
555 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
556
557 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
558
559 // A perfectly nested loop will not have any branch in between the outer and
560 // inner block i.e. outer header will branch to either inner preheader and
561 // outerloop latch.
562 BranchInst *OuterLoopHeaderBI =
563 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
564 if (!OuterLoopHeaderBI)
565 return false;
566
567 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
568 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
569 Succ != OuterLoopLatch)
570 return false;
571
572 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
573 // We do not have any basic block in between now make sure the outer header
574 // and outer loop latch doesn't contain any unsafe instructions.
575 if (containsUnsafeInstructions(OuterLoopHeader) ||
576 containsUnsafeInstructions(OuterLoopLatch))
577 return false;
578
579 // Also make sure the inner loop preheader does not contain any unsafe
580 // instructions. Note that all instructions in the preheader will be moved to
581 // the outer loop header when interchanging.
582 if (InnerLoopPreHeader != OuterLoopHeader &&
583 containsUnsafeInstructions(InnerLoopPreHeader))
584 return false;
585
586 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
587 // Ensure the inner loop exit block flows to the outer loop latch possibly
588 // through empty blocks.
589 const BasicBlock &SuccInner =
590 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
591 if (&SuccInner != OuterLoopLatch) {
592 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
593 << " does not lead to the outer loop latch.\n";);
594 return false;
595 }
596 // The inner loop exit block does flow to the outer loop latch and not some
597 // other BBs, now make sure it contains safe instructions, since it will be
598 // moved into the (new) inner loop after interchange.
599 if (containsUnsafeInstructions(InnerLoopExit))
600 return false;
601
602 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
603 // We have a perfect loop nest.
604 return true;
605}
606
607bool LoopInterchangeLegality::isLoopStructureUnderstood() {
608 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
609 for (PHINode *InnerInduction : InnerLoopInductions) {
610 unsigned Num = InnerInduction->getNumOperands();
611 for (unsigned i = 0; i < Num; ++i) {
612 Value *Val = InnerInduction->getOperand(i);
613 if (isa<Constant>(Val))
614 continue;
615 Instruction *I = dyn_cast<Instruction>(Val);
616 if (!I)
617 return false;
618 // TODO: Handle triangular loops.
619 // e.g. for(int i=0;i<N;i++)
620 // for(int j=i;j<N;j++)
621 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
622 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
623 InnerLoopPreheader &&
624 !OuterLoop->isLoopInvariant(I)) {
625 return false;
626 }
627 }
628 }
629
630 // TODO: Handle triangular loops of another form.
631 // e.g. for(int i=0;i<N;i++)
632 // for(int j=0;j<i;j++)
633 // or,
634 // for(int i=0;i<N;i++)
635 // for(int j=0;j*i<N;j++)
636 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
637 BranchInst *InnerLoopLatchBI =
638 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
639 if (!InnerLoopLatchBI->isConditional())
640 return false;
641 if (CmpInst *InnerLoopCmp =
642 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
643 Value *Op0 = InnerLoopCmp->getOperand(0);
644 Value *Op1 = InnerLoopCmp->getOperand(1);
645
646 // LHS and RHS of the inner loop exit condition, e.g.,
647 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
648 Value *Left = nullptr;
649 Value *Right = nullptr;
650
651 // Check if V only involves inner loop induction variable.
652 // Return true if V is InnerInduction, or a cast from
653 // InnerInduction, or a binary operator that involves
654 // InnerInduction and a constant.
655 std::function<bool(Value *)> IsPathToInnerIndVar;
656 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
657 if (llvm::is_contained(InnerLoopInductions, V))
658 return true;
659 if (isa<Constant>(V))
660 return true;
661 const Instruction *I = dyn_cast<Instruction>(V);
662 if (!I)
663 return false;
664 if (isa<CastInst>(I))
665 return IsPathToInnerIndVar(I->getOperand(0));
666 if (isa<BinaryOperator>(I))
667 return IsPathToInnerIndVar(I->getOperand(0)) &&
668 IsPathToInnerIndVar(I->getOperand(1));
669 return false;
670 };
671
672 // In case of multiple inner loop indvars, it is okay if LHS and RHS
673 // are both inner indvar related variables.
674 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
675 return true;
676
677 // Otherwise we check if the cmp instruction compares an inner indvar
678 // related variable (Left) with a outer loop invariant (Right).
679 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
680 Left = Op0;
681 Right = Op1;
682 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
683 Left = Op1;
684 Right = Op0;
685 }
686
687 if (Left == nullptr)
688 return false;
689
690 const SCEV *S = SE->getSCEV(Right);
691 if (!SE->isLoopInvariant(S, OuterLoop))
692 return false;
693 }
694
695 return true;
696}
697
698// If SV is a LCSSA PHI node with a single incoming value, return the incoming
699// value.
700static Value *followLCSSA(Value *SV) {
701 PHINode *PHI = dyn_cast<PHINode>(SV);
702 if (!PHI)
703 return SV;
704
705 if (PHI->getNumIncomingValues() != 1)
706 return SV;
707 return followLCSSA(PHI->getIncomingValue(0));
708}
709
710// Check V's users to see if it is involved in a reduction in L.
712 // Reduction variables cannot be constants.
713 if (isa<Constant>(V))
714 return nullptr;
715
716 for (Value *User : V->users()) {
717 if (PHINode *PHI = dyn_cast<PHINode>(User)) {
718 if (PHI->getNumIncomingValues() == 1)
719 continue;
722 // Detect floating point reduction only when it can be reordered.
723 if (RD.getExactFPMathInst() != nullptr)
724 return nullptr;
725 return PHI;
726 }
727 return nullptr;
728 }
729 }
730
731 return nullptr;
732}
733
734bool LoopInterchangeLegality::findInductionAndReductions(
735 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
736 if (!L->getLoopLatch() || !L->getLoopPredecessor())
737 return false;
738 for (PHINode &PHI : L->getHeader()->phis()) {
741 Inductions.push_back(&PHI);
742 else {
743 // PHIs in inner loops need to be part of a reduction in the outer loop,
744 // discovered when checking the PHIs of the outer loop earlier.
745 if (!InnerLoop) {
746 if (!OuterInnerReductions.count(&PHI)) {
747 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
748 "across the outer loop.\n");
749 return false;
750 }
751 } else {
752 assert(PHI.getNumIncomingValues() == 2 &&
753 "Phis in loop header should have exactly 2 incoming values");
754 // Check if we have a PHI node in the outer loop that has a reduction
755 // result from the inner loop as an incoming value.
756 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
757 PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
758 if (!InnerRedPhi ||
759 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
761 dbgs()
762 << "Failed to recognize PHI as an induction or reduction.\n");
763 return false;
764 }
765 OuterInnerReductions.insert(&PHI);
766 OuterInnerReductions.insert(InnerRedPhi);
767 }
768 }
769 }
770 return true;
771}
772
773// This function indicates the current limitations in the transform as a result
774// of which we do not proceed.
775bool LoopInterchangeLegality::currentLimitations() {
776 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
777
778 // transform currently expects the loop latches to also be the exiting
779 // blocks.
780 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
781 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
782 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
783 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
785 dbgs() << "Loops where the latch is not the exiting block are not"
786 << " supported currently.\n");
787 ORE->emit([&]() {
788 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
789 OuterLoop->getStartLoc(),
790 OuterLoop->getHeader())
791 << "Loops where the latch is not the exiting block cannot be"
792 " interchange currently.";
793 });
794 return true;
795 }
796
797 SmallVector<PHINode *, 8> Inductions;
798 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
800 dbgs() << "Only outer loops with induction or reduction PHI nodes "
801 << "are supported currently.\n");
802 ORE->emit([&]() {
803 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
804 OuterLoop->getStartLoc(),
805 OuterLoop->getHeader())
806 << "Only outer loops with induction or reduction PHI nodes can be"
807 " interchanged currently.";
808 });
809 return true;
810 }
811
812 Inductions.clear();
813 // For multi-level loop nests, make sure that all phi nodes for inner loops
814 // at all levels can be recognized as a induction or reduction phi. Bail out
815 // if a phi node at a certain nesting level cannot be properly recognized.
816 Loop *CurLevelLoop = OuterLoop;
817 while (!CurLevelLoop->getSubLoops().empty()) {
818 // We already made sure that the loop nest is tightly nested.
819 CurLevelLoop = CurLevelLoop->getSubLoops().front();
820 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
822 dbgs() << "Only inner loops with induction or reduction PHI nodes "
823 << "are supported currently.\n");
824 ORE->emit([&]() {
825 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
826 CurLevelLoop->getStartLoc(),
827 CurLevelLoop->getHeader())
828 << "Only inner loops with induction or reduction PHI nodes can be"
829 " interchange currently.";
830 });
831 return true;
832 }
833 }
834
835 // TODO: Triangular loops are not handled for now.
836 if (!isLoopStructureUnderstood()) {
837 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
838 ORE->emit([&]() {
839 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
840 InnerLoop->getStartLoc(),
841 InnerLoop->getHeader())
842 << "Inner loop structure not understood currently.";
843 });
844 return true;
845 }
846
847 return false;
848}
849
850bool LoopInterchangeLegality::findInductions(
851 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
852 for (PHINode &PHI : L->getHeader()->phis()) {
855 Inductions.push_back(&PHI);
856 }
857 return !Inductions.empty();
858}
859
860// We currently only support LCSSA PHI nodes in the inner loop exit, if their
861// users are either reduction PHIs or PHIs outside the outer loop (which means
862// the we are only interested in the final value after the loop).
863static bool
865 SmallPtrSetImpl<PHINode *> &Reductions) {
866 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
867 for (PHINode &PHI : InnerExit->phis()) {
868 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
869 if (PHI.getNumIncomingValues() > 1)
870 return false;
871 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
872 PHINode *PN = dyn_cast<PHINode>(U);
873 return !PN ||
874 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
875 })) {
876 return false;
877 }
878 }
879 return true;
880}
881
882// We currently support LCSSA PHI nodes in the outer loop exit, if their
883// incoming values do not come from the outer loop latch or if the
884// outer loop latch has a single predecessor. In that case, the value will
885// be available if both the inner and outer loop conditions are true, which
886// will still be true after interchanging. If we have multiple predecessor,
887// that may not be the case, e.g. because the outer loop latch may be executed
888// if the inner loop is not executed.
889static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
890 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
891 for (PHINode &PHI : LoopNestExit->phis()) {
892 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
893 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
894 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
895 continue;
896
897 // The incoming value is defined in the outer loop latch. Currently we
898 // only support that in case the outer loop latch has a single predecessor.
899 // This guarantees that the outer loop latch is executed if and only if
900 // the inner loop is executed (because tightlyNested() guarantees that the
901 // outer loop header only branches to the inner loop or the outer loop
902 // latch).
903 // FIXME: We could weaken this logic and allow multiple predecessors,
904 // if the values are produced outside the loop latch. We would need
905 // additional logic to update the PHI nodes in the exit block as
906 // well.
907 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
908 return false;
909 }
910 }
911 return true;
912}
913
914// In case of multi-level nested loops, it may occur that lcssa phis exist in
915// the latch of InnerLoop, i.e., when defs of the incoming values are further
916// inside the loopnest. Sometimes those incoming values are not available
917// after interchange, since the original inner latch will become the new outer
918// latch which may have predecessor paths that do not include those incoming
919// values.
920// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
921// multi-level loop nests.
922static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
923 if (InnerLoop->getSubLoops().empty())
924 return true;
925 // If the original outer latch has only one predecessor, then values defined
926 // further inside the looploop, e.g., in the innermost loop, will be available
927 // at the new outer latch after interchange.
928 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
929 return true;
930
931 // The outer latch has more than one predecessors, i.e., the inner
932 // exit and the inner header.
933 // PHI nodes in the inner latch are lcssa phis where the incoming values
934 // are defined further inside the loopnest. Check if those phis are used
935 // in the original inner latch. If that is the case then bail out since
936 // those incoming values may not be available at the new outer latch.
937 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
938 for (PHINode &PHI : InnerLoopLatch->phis()) {
939 for (auto *U : PHI.users()) {
940 Instruction *UI = cast<Instruction>(U);
941 if (InnerLoopLatch == UI->getParent())
942 return false;
943 }
944 }
945 return true;
946}
947
948bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
949 unsigned OuterLoopId,
950 CharMatrix &DepMatrix) {
951 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
952 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
953 << " and OuterLoopId = " << OuterLoopId
954 << " due to dependence\n");
955 ORE->emit([&]() {
956 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
957 InnerLoop->getStartLoc(),
958 InnerLoop->getHeader())
959 << "Cannot interchange loops due to dependences.";
960 });
961 return false;
962 }
963 // Check if outer and inner loop contain legal instructions only.
964 for (auto *BB : OuterLoop->blocks())
966 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
967 // readnone functions do not prevent interchanging.
968 if (CI->onlyWritesMemory())
969 continue;
971 dbgs() << "Loops with call instructions cannot be interchanged "
972 << "safely.");
973 ORE->emit([&]() {
974 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
975 CI->getDebugLoc(),
976 CI->getParent())
977 << "Cannot interchange loops due to call instruction.";
978 });
979
980 return false;
981 }
982
983 if (!findInductions(InnerLoop, InnerLoopInductions)) {
984 LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n");
985 return false;
986 }
987
988 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
989 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
990 ORE->emit([&]() {
991 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
992 InnerLoop->getStartLoc(),
993 InnerLoop->getHeader())
994 << "Cannot interchange loops because unsupported PHI nodes found "
995 "in inner loop latch.";
996 });
997 return false;
998 }
999
1000 // TODO: The loops could not be interchanged due to current limitations in the
1001 // transform module.
1002 if (currentLimitations()) {
1003 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1004 return false;
1005 }
1006
1007 // Check if the loops are tightly nested.
1008 if (!tightlyNested(OuterLoop, InnerLoop)) {
1009 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1010 ORE->emit([&]() {
1011 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1012 InnerLoop->getStartLoc(),
1013 InnerLoop->getHeader())
1014 << "Cannot interchange loops because they are not tightly "
1015 "nested.";
1016 });
1017 return false;
1018 }
1019
1020 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1021 OuterInnerReductions)) {
1022 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1023 ORE->emit([&]() {
1024 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1025 InnerLoop->getStartLoc(),
1026 InnerLoop->getHeader())
1027 << "Found unsupported PHI node in loop exit.";
1028 });
1029 return false;
1030 }
1031
1032 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1033 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1034 ORE->emit([&]() {
1035 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1036 OuterLoop->getStartLoc(),
1037 OuterLoop->getHeader())
1038 << "Found unsupported PHI node in loop exit.";
1039 });
1040 return false;
1041 }
1042
1043 return true;
1044}
1045
1046int LoopInterchangeProfitability::getInstrOrderCost() {
1047 unsigned GoodOrder, BadOrder;
1048 BadOrder = GoodOrder = 0;
1049 for (BasicBlock *BB : InnerLoop->blocks()) {
1050 for (Instruction &Ins : *BB) {
1051 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1052 unsigned NumOp = GEP->getNumOperands();
1053 bool FoundInnerInduction = false;
1054 bool FoundOuterInduction = false;
1055 for (unsigned i = 0; i < NumOp; ++i) {
1056 // Skip operands that are not SCEV-able.
1057 if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1058 continue;
1059
1060 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1061 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1062 if (!AR)
1063 continue;
1064
1065 // If we find the inner induction after an outer induction e.g.
1066 // for(int i=0;i<N;i++)
1067 // for(int j=0;j<N;j++)
1068 // A[i][j] = A[i-1][j-1]+k;
1069 // then it is a good order.
1070 if (AR->getLoop() == InnerLoop) {
1071 // We found an InnerLoop induction after OuterLoop induction. It is
1072 // a good order.
1073 FoundInnerInduction = true;
1074 if (FoundOuterInduction) {
1075 GoodOrder++;
1076 break;
1077 }
1078 }
1079 // If we find the outer induction after an inner induction e.g.
1080 // for(int i=0;i<N;i++)
1081 // for(int j=0;j<N;j++)
1082 // A[j][i] = A[j-1][i-1]+k;
1083 // then it is a bad order.
1084 if (AR->getLoop() == OuterLoop) {
1085 // We found an OuterLoop induction after InnerLoop induction. It is
1086 // a bad order.
1087 FoundOuterInduction = true;
1088 if (FoundInnerInduction) {
1089 BadOrder++;
1090 break;
1091 }
1092 }
1093 }
1094 }
1095 }
1096 }
1097 return GoodOrder - BadOrder;
1098}
1099
1100std::optional<bool>
1101LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1102 const DenseMap<const Loop *, unsigned> &CostMap,
1103 std::unique_ptr<CacheCost> &CC) {
1104 // This is the new cost model returned from loop cache analysis.
1105 // A smaller index means the loop should be placed an outer loop, and vice
1106 // versa.
1107 if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
1108 unsigned InnerIndex = 0, OuterIndex = 0;
1109 InnerIndex = CostMap.find(InnerLoop)->second;
1110 OuterIndex = CostMap.find(OuterLoop)->second;
1111 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1112 << ", OuterIndex = " << OuterIndex << "\n");
1113 if (InnerIndex < OuterIndex)
1114 return std::optional<bool>(true);
1115 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1116 "numbers to each loop");
1117 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1118 return std::nullopt;
1119 return std::optional<bool>(false);
1120 }
1121 return std::nullopt;
1122}
1123
1124std::optional<bool>
1125LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1126 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1127 // good and bad order of induction variables in the instruction and allows
1128 // reordering if number of bad orders is more than good.
1129 int Cost = getInstrOrderCost();
1130 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1131 if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1132 return std::optional<bool>(true);
1133
1134 return std::nullopt;
1135}
1136
1137std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1138 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1139 for (auto &Row : DepMatrix) {
1140 // If the inner loop is loop independent or doesn't carry any dependency
1141 // it is not profitable to move this to outer position, since we are
1142 // likely able to do inner loop vectorization already.
1143 if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
1144 return std::optional<bool>(false);
1145
1146 // If the outer loop is not loop independent it is not profitable to move
1147 // this to inner position, since doing so would not enable inner loop
1148 // parallelism.
1149 if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
1150 return std::optional<bool>(false);
1151 }
1152 // If inner loop has dependence and outer loop is loop independent then it
1153 // is/ profitable to interchange to enable inner loop parallelism.
1154 // If there are no dependences, interchanging will not improve anything.
1155 return std::optional<bool>(!DepMatrix.empty());
1156}
1157
1158bool LoopInterchangeProfitability::isProfitable(
1159 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1160 unsigned OuterLoopId, CharMatrix &DepMatrix,
1161 const DenseMap<const Loop *, unsigned> &CostMap,
1162 std::unique_ptr<CacheCost> &CC) {
1163 // isProfitable() is structured to avoid endless loop interchange.
1164 // If loop cache analysis could decide the profitability then,
1165 // profitability check will stop and return the analysis result.
1166 // If cache analysis failed to analyze the loopnest (e.g.,
1167 // due to delinearization issues) then only check whether it is
1168 // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
1169 // analysis the profitability then only, isProfitableForVectorization
1170 // will decide.
1171 std::optional<bool> shouldInterchange =
1172 isProfitablePerLoopCacheAnalysis(CostMap, CC);
1173 if (!shouldInterchange.has_value()) {
1174 shouldInterchange = isProfitablePerInstrOrderCost();
1175 if (!shouldInterchange.has_value())
1176 shouldInterchange =
1177 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1178 }
1179 if (!shouldInterchange.has_value()) {
1180 ORE->emit([&]() {
1181 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1182 InnerLoop->getStartLoc(),
1183 InnerLoop->getHeader())
1184 << "Insufficient information to calculate the cost of loop for "
1185 "interchange.";
1186 });
1187 return false;
1188 } else if (!shouldInterchange.value()) {
1189 ORE->emit([&]() {
1190 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1191 InnerLoop->getStartLoc(),
1192 InnerLoop->getHeader())
1193 << "Interchanging loops is not considered to improve cache "
1194 "locality nor vectorization.";
1195 });
1196 return false;
1197 }
1198 return true;
1199}
1200
1201void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1202 Loop *InnerLoop) {
1203 for (Loop *L : *OuterLoop)
1204 if (L == InnerLoop) {
1205 OuterLoop->removeChildLoop(L);
1206 return;
1207 }
1208 llvm_unreachable("Couldn't find loop");
1209}
1210
1211/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1212/// new inner and outer loop after interchanging: NewInner is the original
1213/// outer loop and NewOuter is the original inner loop.
1214///
1215/// Before interchanging, we have the following structure
1216/// Outer preheader
1217// Outer header
1218// Inner preheader
1219// Inner header
1220// Inner body
1221// Inner latch
1222// outer bbs
1223// Outer latch
1224//
1225// After interchanging:
1226// Inner preheader
1227// Inner header
1228// Outer preheader
1229// Outer header
1230// Inner body
1231// outer bbs
1232// Outer latch
1233// Inner latch
1234void LoopInterchangeTransform::restructureLoops(
1235 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1236 BasicBlock *OrigOuterPreHeader) {
1237 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1238 // The original inner loop preheader moves from the new inner loop to
1239 // the parent loop, if there is one.
1240 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1241 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1242
1243 // Switch the loop levels.
1244 if (OuterLoopParent) {
1245 // Remove the loop from its parent loop.
1246 removeChildLoop(OuterLoopParent, NewInner);
1247 removeChildLoop(NewInner, NewOuter);
1248 OuterLoopParent->addChildLoop(NewOuter);
1249 } else {
1250 removeChildLoop(NewInner, NewOuter);
1251 LI->changeTopLevelLoop(NewInner, NewOuter);
1252 }
1253 while (!NewOuter->isInnermost())
1254 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1255 NewOuter->addChildLoop(NewInner);
1256
1257 // BBs from the original inner loop.
1258 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1259
1260 // Add BBs from the original outer loop to the original inner loop (excluding
1261 // BBs already in inner loop)
1262 for (BasicBlock *BB : NewInner->blocks())
1263 if (LI->getLoopFor(BB) == NewInner)
1264 NewOuter->addBlockEntry(BB);
1265
1266 // Now remove inner loop header and latch from the new inner loop and move
1267 // other BBs (the loop body) to the new inner loop.
1268 BasicBlock *OuterHeader = NewOuter->getHeader();
1269 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1270 for (BasicBlock *BB : OrigInnerBBs) {
1271 // Nothing will change for BBs in child loops.
1272 if (LI->getLoopFor(BB) != NewOuter)
1273 continue;
1274 // Remove the new outer loop header and latch from the new inner loop.
1275 if (BB == OuterHeader || BB == OuterLatch)
1276 NewInner->removeBlockFromLoop(BB);
1277 else
1278 LI->changeLoopFor(BB, NewInner);
1279 }
1280
1281 // The preheader of the original outer loop becomes part of the new
1282 // outer loop.
1283 NewOuter->addBlockEntry(OrigOuterPreHeader);
1284 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1285
1286 // Tell SE that we move the loops around.
1287 SE->forgetLoop(NewOuter);
1288}
1289
1290bool LoopInterchangeTransform::transform() {
1291 bool Transformed = false;
1292
1293 if (InnerLoop->getSubLoops().empty()) {
1294 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1295 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1296 auto &InductionPHIs = LIL.getInnerLoopInductions();
1297 if (InductionPHIs.empty()) {
1298 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1299 return false;
1300 }
1301
1302 SmallVector<Instruction *, 8> InnerIndexVarList;
1303 for (PHINode *CurInductionPHI : InductionPHIs) {
1304 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1305 InnerIndexVarList.push_back(
1306 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1307 else
1308 InnerIndexVarList.push_back(
1309 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1310 }
1311
1312 // Create a new latch block for the inner loop. We split at the
1313 // current latch's terminator and then move the condition and all
1314 // operands that are not either loop-invariant or the induction PHI into the
1315 // new latch block.
1316 BasicBlock *NewLatch =
1317 SplitBlock(InnerLoop->getLoopLatch(),
1318 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1319
1321 unsigned i = 0;
1322 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1323 for (; i < WorkList.size(); i++) {
1324 // Duplicate instruction and move it the new latch. Update uses that
1325 // have been moved.
1326 Instruction *NewI = WorkList[i]->clone();
1327 NewI->insertBefore(NewLatch->getFirstNonPHI());
1328 assert(!NewI->mayHaveSideEffects() &&
1329 "Moving instructions with side-effects may change behavior of "
1330 "the loop nest!");
1331 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1332 Instruction *UserI = cast<Instruction>(U.getUser());
1333 if (!InnerLoop->contains(UserI->getParent()) ||
1334 UserI->getParent() == NewLatch ||
1335 llvm::is_contained(InductionPHIs, UserI))
1336 U.set(NewI);
1337 }
1338 // Add operands of moved instruction to the worklist, except if they are
1339 // outside the inner loop or are the induction PHI.
1340 for (Value *Op : WorkList[i]->operands()) {
1341 Instruction *OpI = dyn_cast<Instruction>(Op);
1342 if (!OpI ||
1343 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1344 llvm::is_contained(InductionPHIs, OpI))
1345 continue;
1346 WorkList.insert(OpI);
1347 }
1348 }
1349 };
1350
1351 // FIXME: Should we interchange when we have a constant condition?
1352 Instruction *CondI = dyn_cast<Instruction>(
1353 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1354 ->getCondition());
1355 if (CondI)
1356 WorkList.insert(CondI);
1357 MoveInstructions();
1358 for (Instruction *InnerIndexVar : InnerIndexVarList)
1359 WorkList.insert(cast<Instruction>(InnerIndexVar));
1360 MoveInstructions();
1361 }
1362
1363 // Ensure the inner loop phi nodes have a separate basic block.
1364 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1365 if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {
1366 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1367 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1368 }
1369
1370 // Instructions in the original inner loop preheader may depend on values
1371 // defined in the outer loop header. Move them there, because the original
1372 // inner loop preheader will become the entry into the interchanged loop nest.
1373 // Currently we move all instructions and rely on LICM to move invariant
1374 // instructions outside the loop nest.
1375 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1376 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1377 if (InnerLoopPreHeader != OuterLoopHeader) {
1379 for (Instruction &I :
1380 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1381 std::prev(InnerLoopPreHeader->end()))))
1382 I.moveBefore(OuterLoopHeader->getTerminator());
1383 }
1384
1385 Transformed |= adjustLoopLinks();
1386 if (!Transformed) {
1387 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1388 return false;
1389 }
1390
1391 return true;
1392}
1393
1394/// \brief Move all instructions except the terminator from FromBB right before
1395/// InsertBefore
1396static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1397 BasicBlock *ToBB = InsertBefore->getParent();
1398
1399 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1400 FromBB->getTerminator()->getIterator());
1401}
1402
1403/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1404static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1405 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1406 // from BB1 afterwards.
1407 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1408 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1409 for (Instruction *I : TempInstrs)
1410 I->removeFromParent();
1411
1412 // Move instructions from BB2 to BB1.
1413 moveBBContents(BB2, BB1->getTerminator());
1414
1415 // Move instructions from TempInstrs to BB2.
1416 for (Instruction *I : TempInstrs)
1417 I->insertBefore(BB2->getTerminator());
1418}
1419
1420// Update BI to jump to NewBB instead of OldBB. Records updates to the
1421// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1422// \p OldBB is exactly once in BI's successor list.
1423static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1424 BasicBlock *NewBB,
1425 std::vector<DominatorTree::UpdateType> &DTUpdates,
1426 bool MustUpdateOnce = true) {
1427 assert((!MustUpdateOnce ||
1429 [OldBB](BasicBlock *BB) {
1430 return BB == OldBB;
1431 }) == 1) && "BI must jump to OldBB exactly once.");
1432 bool Changed = false;
1433 for (Use &Op : BI->operands())
1434 if (Op == OldBB) {
1435 Op.set(NewBB);
1436 Changed = true;
1437 }
1438
1439 if (Changed) {
1440 DTUpdates.push_back(
1441 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1442 DTUpdates.push_back(
1443 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1444 }
1445 assert(Changed && "Expected a successor to be updated");
1446}
1447
1448// Move Lcssa PHIs to the right place.
1449static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1450 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1451 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1452 Loop *InnerLoop, LoopInfo *LI) {
1453
1454 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1455 // defined either in the header or latch. Those blocks will become header and
1456 // latch of the new outer loop, and the only possible users can PHI nodes
1457 // in the exit block of the loop nest or the outer loop header (reduction
1458 // PHIs, in that case, the incoming value must be defined in the inner loop
1459 // header). We can just substitute the user with the incoming value and remove
1460 // the PHI.
1461 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1462 assert(P.getNumIncomingValues() == 1 &&
1463 "Only loops with a single exit are supported!");
1464
1465 // Incoming values are guaranteed be instructions currently.
1466 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1467 // In case of multi-level nested loops, follow LCSSA to find the incoming
1468 // value defined from the innermost loop.
1469 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1470 // Skip phis with incoming values from the inner loop body, excluding the
1471 // header and latch.
1472 if (IncIInnerMost->getParent() != InnerLatch &&
1473 IncIInnerMost->getParent() != InnerHeader)
1474 continue;
1475
1476 assert(all_of(P.users(),
1477 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1478 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1479 IncI->getParent() == InnerHeader) ||
1480 cast<PHINode>(U)->getParent() == OuterExit;
1481 }) &&
1482 "Can only replace phis iff the uses are in the loop nest exit or "
1483 "the incoming value is defined in the inner header (it will "
1484 "dominate all loop blocks after interchanging)");
1485 P.replaceAllUsesWith(IncI);
1486 P.eraseFromParent();
1487 }
1488
1489 SmallVector<PHINode *, 8> LcssaInnerExit;
1490 for (PHINode &P : InnerExit->phis())
1491 LcssaInnerExit.push_back(&P);
1492
1493 SmallVector<PHINode *, 8> LcssaInnerLatch;
1494 for (PHINode &P : InnerLatch->phis())
1495 LcssaInnerLatch.push_back(&P);
1496
1497 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1498 // If a PHI node has users outside of InnerExit, it has a use outside the
1499 // interchanged loop and we have to preserve it. We move these to
1500 // InnerLatch, which will become the new exit block for the innermost
1501 // loop after interchanging.
1502 for (PHINode *P : LcssaInnerExit)
1503 P->moveBefore(InnerLatch->getFirstNonPHI());
1504
1505 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1506 // and we have to move them to the new inner latch.
1507 for (PHINode *P : LcssaInnerLatch)
1508 P->moveBefore(InnerExit->getFirstNonPHI());
1509
1510 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1511 // incoming values defined in the outer loop, we have to add a new PHI
1512 // in the inner loop latch, which became the exit block of the outer loop,
1513 // after interchanging.
1514 if (OuterExit) {
1515 for (PHINode &P : OuterExit->phis()) {
1516 if (P.getNumIncomingValues() != 1)
1517 continue;
1518 // Skip Phis with incoming values defined in the inner loop. Those should
1519 // already have been updated.
1520 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1521 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1522 continue;
1523
1524 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1525 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1526 NewPhi->setIncomingBlock(0, OuterLatch);
1527 // We might have incoming edges from other BBs, i.e., the original outer
1528 // header.
1529 for (auto *Pred : predecessors(InnerLatch)) {
1530 if (Pred == OuterLatch)
1531 continue;
1532 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1533 }
1534 NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1535 P.setIncomingValue(0, NewPhi);
1536 }
1537 }
1538
1539 // Now adjust the incoming blocks for the LCSSA PHIs.
1540 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1541 // with the new latch.
1542 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1543}
1544
1545bool LoopInterchangeTransform::adjustLoopBranches() {
1546 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1547 std::vector<DominatorTree::UpdateType> DTUpdates;
1548
1549 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1550 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1551
1552 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1553 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1554 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1555 // Ensure that both preheaders do not contain PHI nodes and have single
1556 // predecessors. This allows us to move them easily. We use
1557 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1558 // preheaders do not satisfy those conditions.
1559 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1560 !OuterLoopPreHeader->getUniquePredecessor())
1561 OuterLoopPreHeader =
1562 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1563 if (InnerLoopPreHeader == OuterLoop->getHeader())
1564 InnerLoopPreHeader =
1565 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1566
1567 // Adjust the loop preheader
1568 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1569 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1570 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1571 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1572 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1573 BasicBlock *InnerLoopLatchPredecessor =
1574 InnerLoopLatch->getUniquePredecessor();
1575 BasicBlock *InnerLoopLatchSuccessor;
1576 BasicBlock *OuterLoopLatchSuccessor;
1577
1578 BranchInst *OuterLoopLatchBI =
1579 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1580 BranchInst *InnerLoopLatchBI =
1581 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1582 BranchInst *OuterLoopHeaderBI =
1583 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1584 BranchInst *InnerLoopHeaderBI =
1585 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1586
1587 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1588 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1589 !InnerLoopHeaderBI)
1590 return false;
1591
1592 BranchInst *InnerLoopLatchPredecessorBI =
1593 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1594 BranchInst *OuterLoopPredecessorBI =
1595 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1596
1597 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1598 return false;
1599 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1600 if (!InnerLoopHeaderSuccessor)
1601 return false;
1602
1603 // Adjust Loop Preheader and headers.
1604 // The branches in the outer loop predecessor and the outer loop header can
1605 // be unconditional branches or conditional branches with duplicates. Consider
1606 // this when updating the successors.
1607 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1608 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1609 // The outer loop header might or might not branch to the outer latch.
1610 // We are guaranteed to branch to the inner loop preheader.
1611 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1612 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1613 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1614 DTUpdates,
1615 /*MustUpdateOnce=*/false);
1616 }
1617 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1618 InnerLoopHeaderSuccessor, DTUpdates,
1619 /*MustUpdateOnce=*/false);
1620
1621 // Adjust reduction PHI's now that the incoming block has changed.
1622 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1623 OuterLoopHeader);
1624
1625 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1626 OuterLoopPreHeader, DTUpdates);
1627
1628 // -------------Adjust loop latches-----------
1629 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1630 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1631 else
1632 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1633
1634 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1635 InnerLoopLatchSuccessor, DTUpdates);
1636
1637 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1638 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1639 else
1640 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1641
1642 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1643 OuterLoopLatchSuccessor, DTUpdates);
1644 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1645 DTUpdates);
1646
1647 DT->applyUpdates(DTUpdates);
1648 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1649 OuterLoopPreHeader);
1650
1651 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1652 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1653 InnerLoop, LI);
1654 // For PHIs in the exit block of the outer loop, outer's latch has been
1655 // replaced by Inners'.
1656 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1657
1658 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1659 // Now update the reduction PHIs in the inner and outer loop headers.
1660 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1661 for (PHINode &PHI : InnerLoopHeader->phis())
1662 if (OuterInnerReductions.contains(&PHI))
1663 InnerLoopPHIs.push_back(&PHI);
1664
1665 for (PHINode &PHI : OuterLoopHeader->phis())
1666 if (OuterInnerReductions.contains(&PHI))
1667 OuterLoopPHIs.push_back(&PHI);
1668
1669 // Now move the remaining reduction PHIs from outer to inner loop header and
1670 // vice versa. The PHI nodes must be part of a reduction across the inner and
1671 // outer loop and all the remains to do is and updating the incoming blocks.
1672 for (PHINode *PHI : OuterLoopPHIs) {
1673 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1674 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1675 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1676 }
1677 for (PHINode *PHI : InnerLoopPHIs) {
1678 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1679 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1680 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1681 }
1682
1683 // Update the incoming blocks for moved PHI nodes.
1684 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1685 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1686 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1687 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1688
1689 // Values defined in the outer loop header could be used in the inner loop
1690 // latch. In that case, we need to create LCSSA phis for them, because after
1691 // interchanging they will be defined in the new inner loop and used in the
1692 // new outer loop.
1693 IRBuilder<> Builder(OuterLoopHeader->getContext());
1694 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1695 for (Instruction &I :
1696 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1697 MayNeedLCSSAPhis.push_back(&I);
1698 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder);
1699
1700 return true;
1701}
1702
1703bool LoopInterchangeTransform::adjustLoopLinks() {
1704 // Adjust all branches in the inner and outer loop.
1705 bool Changed = adjustLoopBranches();
1706 if (Changed) {
1707 // We have interchanged the preheaders so we need to interchange the data in
1708 // the preheaders as well. This is because the content of the inner
1709 // preheader was previously executed inside the outer loop.
1710 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1711 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1712 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1713 }
1714 return Changed;
1715}
1716
1720 LPMUpdater &U) {
1721 Function &F = *LN.getParent();
1722
1723 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1724 std::unique_ptr<CacheCost> CC =
1727 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1728 return PreservedAnalyses::all();
1729 U.markLoopNestChanged(true);
1731}
Rewrite undef for PHI
assume Assume Builder
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
#define LLVM_DEBUG(X)
Definition: Debug.h:101
Hexagon Common GEP
This file defines the interface for the loop cache analysis.
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore)
Move all instructions except the terminator from FromBB right before InsertBefore.
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static const unsigned MaxLoopNestDepth
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions)
#define DEBUG_TYPE
static const unsigned MaxMemInstrCount
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static bool isLexicographicallyPositive(std::vector< char > &DV)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator end()
Definition: BasicBlock.h:316
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:314
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:372
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:208
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:322
void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
Definition: BasicBlock.cpp:487
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:292
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:35
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:468
Conditional or Unconditional Branch instruction.
iterator_range< succ_op_iterator > successors()
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
This class represents a function call, abstracting a target machine's calling convention.
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:708
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
DependenceInfo - This class is the main dependence-analysis driver.
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2558
A struct for saving information about induction variables.
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:88
const BasicBlock * getParent() const
Definition: Instruction.h:90
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:139
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:232
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:182
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
Definition: LoopInfo.h:477
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:160
BlockT * getHeader() const
Definition: LoopInfo.h:105
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
Definition: LoopInfo.h:412
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
Definition: LoopInfo.h:440
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:107
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:48
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Definition: LoopInfo.h:114
iterator begin() const
Definition: LoopInfo.h:171
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:158
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
Definition: LoopInfo.h:421
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
Definition: LoopInfo.h:1040
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
Definition: LoopInfo.h:1030
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:992
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:69
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
This node represents a polynomial recurrence on the trip count of the specified loop.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:301
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
LLVM Value Representation.
Definition: Value.h:74
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1819
auto successors(const MachineBasicBlock *BB)
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:410
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:748
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:461
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1826
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, IRBuilderBase &Builder, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:78
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1946
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:2018
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1976
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:723