LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
64 "loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
95 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
100 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>({RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
123 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 if (!isa<Instruction>(I))
183 return false;
184 NumInsts++;
185 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
186 if (!Ld->isSimple())
187 return false;
188 MemInstr.push_back(&I);
189 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
190 if (!St->isSimple())
191 return false;
192 MemInstr.push_back(&I);
193 }
194 }
195 }
196
197 // To populate the dependence matrix, we perform dependence test for each pair
198 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
199 // that even if the number of memory instructions is small, the analysis can
200 // still be expensive if the most of the instructions in the loop are memory
201 // instructions. On the other hand, if the number of memory instructions is
202 // not small, but the loop is large (i.e., it contains many non-memory
203 // instructions), the analysis can still be affordable.
204 unsigned NumMemInstr = MemInstr.size();
205 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
206 << " Loads and Stores to analyze\n");
207 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
208 ORE->emit([&]() {
209 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
210 L->getStartLoc(), L->getHeader())
211 << "Number of loads/stores exceeded, the supported maximum can be "
212 "increased with option -loop-interchange-max-mem-instr-ratio.";
213 });
214 return false;
215 }
216 ValueVector::iterator I, IE, J, JE;
217
218 // Manage direction vectors that are already seen. Map each direction vector
219 // to an index of DepMatrix at which it is stored.
221
222 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
223 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
224 std::vector<char> Dep;
227 // Ignore Input dependencies.
228 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
229 continue;
230 // Track Output, Flow, and Anti dependencies.
231 if (auto D = DI->depends(Src, Dst)) {
232 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
233 // If the direction vector is negative, normalize it to
234 // make it non-negative.
235 if (D->normalize(SE))
236 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
237 LLVM_DEBUG(StringRef DepType =
238 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
239 dbgs() << "Found " << DepType
240 << " dependency between Src and Dst\n"
241 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
242 unsigned Levels = D->getLevels();
243 char Direction;
244 for (unsigned II = 1; II <= Levels; ++II) {
245 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
246 // or `=`, for which we don't have an equivalent representation, so
247 // that the conservative approximation is necessary. The same goes for
248 // `DVEntry::GE`.
249 // TODO: Use of fine-grained expressions allows for more accurate
250 // analysis.
251 unsigned Dir = D->getDirection(II);
252 if (Dir == Dependence::DVEntry::LT)
253 Direction = '<';
254 else if (Dir == Dependence::DVEntry::GT)
255 Direction = '>';
256 else if (Dir == Dependence::DVEntry::EQ)
257 Direction = '=';
258 else
259 Direction = '*';
260 Dep.push_back(Direction);
261 }
262
263 // If the Dependence object doesn't have any information, fill the
264 // dependency vector with '*'.
265 if (D->isConfused()) {
266 assert(Dep.empty() && "Expected empty dependency vector");
267 Dep.assign(Level, '*');
268 }
269
270 while (Dep.size() != Level) {
271 Dep.push_back('I');
272 }
273
274 // If all the elements of any direction vector have only '*', legality
275 // can't be proven. Exit early to save compile time.
276 if (all_of(Dep, equal_to('*'))) {
277 ORE->emit([&]() {
278 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
279 L->getStartLoc(), L->getHeader())
280 << "All loops have dependencies in all directions.";
281 });
282 return false;
283 }
284
285 // Test whether the dependency is forward or not.
286 bool IsKnownForward = true;
287 if (Src->getParent() != Dst->getParent()) {
288 // In general, when Src and Dst are in different BBs, the execution
289 // order of them within a single iteration is not guaranteed. Treat
290 // conservatively as not-forward dependency in this case.
291 IsKnownForward = false;
292 } else {
293 // Src and Dst are in the same BB. If they are the different
294 // instructions, Src should appear before Dst in the BB as they are
295 // stored to MemInstr in that order.
296 assert((Src == Dst || inThisOrder(Src, Dst)) &&
297 "Unexpected instructions");
298
299 // If the Dependence object is reversed (due to normalization), it
300 // represents the dependency from Dst to Src, meaning it is a backward
301 // dependency. Otherwise it should be a forward dependency.
302 bool IsReversed = D->getSrc() != Src;
303 if (IsReversed)
304 IsKnownForward = false;
305 }
306
307 // Initialize the last element. Assume forward dependencies only; it
308 // will be updated later if there is any non-forward dependency.
309 Dep.push_back('<');
310
311 // The last element should express the "summary" among one or more
312 // direction vectors whose first N elements are the same (where N is
313 // the depth of the loop nest). Hence we exclude the last element from
314 // the Seen map.
315 auto [Ite, Inserted] = Seen.try_emplace(
316 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
317
318 // Make sure we only add unique entries to the dependency matrix.
319 if (Inserted)
320 DepMatrix.push_back(Dep);
321
322 // If we cannot prove that this dependency is forward, change the last
323 // element of the corresponding entry. Since a `[... *]` dependency
324 // includes a `[... <]` dependency, we do not need to keep both and
325 // change the existing entry instead.
326 if (!IsKnownForward)
327 DepMatrix[Ite->second].back() = '*';
328 }
329 }
330 }
331
332 return true;
333}
334
335// A loop is moved from index 'from' to an index 'to'. Update the Dependence
336// matrix by exchanging the two columns.
337static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
338 unsigned ToIndx) {
339 for (auto &Row : DepMatrix)
340 std::swap(Row[ToIndx], Row[FromIndx]);
341}
342
343// Check if a direction vector is lexicographically positive. Return true if it
344// is positive, nullopt if it is "zero", otherwise false.
345// [Theorem] A permutation of the loops in a perfect nest is legal if and only
346// if the direction matrix, after the same permutation is applied to its
347// columns, has no ">" direction as the leftmost non-"=" direction in any row.
348static std::optional<bool>
349isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
350 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
351 if (Direction == '<')
352 return true;
353 if (Direction == '>' || Direction == '*')
354 return false;
355 }
356 return std::nullopt;
357}
358
359// Checks if it is legal to interchange 2 loops.
360static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
361 unsigned InnerLoopId,
362 unsigned OuterLoopId) {
363 unsigned NumRows = DepMatrix.size();
364 std::vector<char> Cur;
365 // For each row check if it is valid to interchange.
366 for (unsigned Row = 0; Row < NumRows; ++Row) {
367 // Create temporary DepVector check its lexicographical order
368 // before and after swapping OuterLoop vs InnerLoop
369 Cur = DepMatrix[Row];
370
371 // If the surrounding loops already ensure that the direction vector is
372 // lexicographically positive, nothing within the loop will be able to break
373 // the dependence. In such a case we can skip the subsequent check.
374 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
375 continue;
376
377 // Check if the direction vector is lexicographically positive (or zero)
378 // for both before/after exchanged. Ignore the last element because it
379 // doesn't affect the legality.
380 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
381 return false;
382 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
383 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
384 return false;
385 }
386 return true;
387}
388
389static void populateWorklist(Loop &L, LoopVector &LoopList) {
390 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
391 << L.getHeader()->getParent()->getName() << " Loop: %"
392 << L.getHeader()->getName() << '\n');
393 assert(LoopList.empty() && "LoopList should initially be empty!");
394 Loop *CurrentLoop = &L;
395 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
396 while (!Vec->empty()) {
397 // The current loop has multiple subloops in it hence it is not tightly
398 // nested.
399 // Discard all loops above it added into Worklist.
400 if (Vec->size() != 1) {
401 LoopList = {};
402 return;
403 }
404
405 LoopList.push_back(CurrentLoop);
406 CurrentLoop = Vec->front();
407 Vec = &CurrentLoop->getSubLoops();
408 }
409 LoopList.push_back(CurrentLoop);
410}
411
414 unsigned LoopNestDepth = LoopList.size();
415 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
416 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
417 << ", the supported range is [" << MinLoopNestDepth
418 << ", " << MaxLoopNestDepth << "].\n");
419 Loop *OuterLoop = LoopList.front();
420 ORE.emit([&]() {
421 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
422 OuterLoop->getStartLoc(),
423 OuterLoop->getHeader())
424 << "Unsupported depth of loop nest, the supported range is ["
425 << std::to_string(MinLoopNestDepth) << ", "
426 << std::to_string(MaxLoopNestDepth) << "].\n";
427 });
428 return false;
429 }
430 return true;
431}
432
434 ArrayRef<Loop *> LoopList) {
435 for (Loop *L : LoopList) {
436 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
437 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
438 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
439 return false;
440 }
441 if (L->getNumBackEdges() != 1) {
442 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
443 return false;
444 }
445 if (!L->getExitingBlock()) {
446 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
447 return false;
448 }
449 }
450 return true;
451}
452
453namespace {
454
455/// LoopInterchangeLegality checks if it is legal to interchange the loop.
456class LoopInterchangeLegality {
457public:
458 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
459 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
460 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
461
462 /// Check if the loops can be interchanged.
463 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
464 CharMatrix &DepMatrix);
465
466 /// Discover induction PHIs in the header of \p L. Induction
467 /// PHIs are added to \p Inductions.
468 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
469
470 /// Check if the loop structure is understood. We do not handle triangular
471 /// loops for now.
472 bool isLoopStructureUnderstood();
473
474 bool currentLimitations();
475
476 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
477 return OuterInnerReductions;
478 }
479
480 const ArrayRef<PHINode *> getInnerLoopInductions() const {
481 return InnerLoopInductions;
482 }
483
484 ArrayRef<Instruction *> getHasNoWrapReductions() const {
485 return HasNoWrapReductions;
486 }
487
488 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
489
490 /// Record reductions in the inner loop. Currently supported reductions:
491 /// - initialized from a constant.
492 /// - reduction PHI node has only one user.
493 /// - located in the innermost loop.
494 struct InnerReduction {
495 /// The reduction itself.
496 PHINode *Reduction;
497 Value *Init;
498 Value *Next;
499 /// The Lcssa PHI.
500 PHINode *LcssaPhi;
501 /// Store reduction result into memory object.
502 StoreInst *LcssaStore;
503 /// The memory Location.
504 Value *MemRef;
505 Type *ElemTy;
506 };
507
508 ArrayRef<InnerReduction> getInnerReductions() const {
509 return InnerReductions;
510 }
511
512private:
513 bool tightlyNested(Loop *Outer, Loop *Inner);
514 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
515
516 /// Discover induction and reduction PHIs in the header of \p L. Induction
517 /// PHIs are added to \p Inductions, reductions are added to
518 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
519 /// to be passed as \p InnerLoop.
520 bool findInductionAndReductions(Loop *L,
521 SmallVector<PHINode *, 8> &Inductions,
522 Loop *InnerLoop);
523
524 /// Detect and record the reduction of the inner loop. Add them to
525 /// InnerReductions.
526 ///
527 /// innerloop:
528 /// Re = phi<0.0, Next>
529 /// Next = Re op ...
530 /// OuterLoopLatch:
531 /// Lcssa = phi<Next> ; lcssa phi
532 /// store Lcssa, MemRef ; LcssaStore
533 ///
534 bool isInnerReduction(Loop *L, PHINode *Phi,
535 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
536
537 Loop *OuterLoop;
538 Loop *InnerLoop;
539
540 ScalarEvolution *SE;
541 DominatorTree *DT;
542
543 /// Interface to emit optimization remarks.
544 OptimizationRemarkEmitter *ORE;
545
546 /// Set of reduction PHIs taking part of a reduction across the inner and
547 /// outer loop.
548 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
549
550 /// Set of inner loop induction PHIs
551 SmallVector<PHINode *, 8> InnerLoopInductions;
552
553 /// Hold instructions that have nuw/nsw flags and involved in reductions,
554 /// like integer addition/multiplication. Those flags must be dropped when
555 /// interchanging the loops.
556 SmallVector<Instruction *, 4> HasNoWrapReductions;
557
558 /// Hold instructions that have ninf flags and involved in reductions. Those
559 /// flags must be dropped when interchanging the loops.
560 SmallVector<Instruction *, 4> HasNoInfInsts;
561
562 /// Vector of reductions in the inner loop.
563 SmallVector<InnerReduction, 8> InnerReductions;
564};
565
566/// Manages information utilized by the profitability check for cache. The main
567/// purpose of this class is to delay the computation of CacheCost until it is
568/// actually needed.
569class CacheCostManager {
570 Loop *OutermostLoop;
571 LoopStandardAnalysisResults *AR;
572 DependenceInfo *DI;
573
574 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
575 /// that the result can be nullptr.
576 std::optional<std::unique_ptr<CacheCost>> CC;
577
578 /// Maps each loop to an index representing the optimal position within the
579 /// loop-nest, as determined by the cache cost analysis.
580 DenseMap<const Loop *, unsigned> CostMap;
581
582 void computeIfUnitinialized();
583
584public:
585 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
586 DependenceInfo *DI)
587 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
588 CacheCost *getCacheCost();
589 const DenseMap<const Loop *, unsigned> &getCostMap();
590};
591
592/// LoopInterchangeProfitability checks if it is profitable to interchange the
593/// loop.
594class LoopInterchangeProfitability {
595public:
596 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
597 OptimizationRemarkEmitter *ORE)
598 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
599
600 /// Check if the loop interchange is profitable.
601 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
602 unsigned InnerLoopId, unsigned OuterLoopId,
603 CharMatrix &DepMatrix, CacheCostManager &CCM);
604
605private:
606 int getInstrOrderCost();
607 std::optional<bool> isProfitablePerLoopCacheAnalysis(
608 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
609 std::optional<bool> isProfitablePerInstrOrderCost();
610 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
611 unsigned OuterLoopId,
612 CharMatrix &DepMatrix);
613 Loop *OuterLoop;
614 Loop *InnerLoop;
615
616 /// Scev analysis.
617 ScalarEvolution *SE;
618
619 /// Interface to emit optimization remarks.
620 OptimizationRemarkEmitter *ORE;
621};
622
623/// LoopInterchangeTransform interchanges the loop.
624class LoopInterchangeTransform {
625public:
626 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
627 LoopInfo *LI, DominatorTree *DT,
628 const LoopInterchangeLegality &LIL)
629 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
630
631 /// Interchange OuterLoop and InnerLoop.
632 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
633 ArrayRef<Instruction *> DropNoInfInsts);
634 void reduction2Memory();
635 void restructureLoops(Loop *NewInner, Loop *NewOuter,
636 BasicBlock *OrigInnerPreHeader,
637 BasicBlock *OrigOuterPreHeader);
638 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
639
640private:
641 bool adjustLoopLinks();
642 bool adjustLoopBranches();
643
644 Loop *OuterLoop;
645 Loop *InnerLoop;
646
647 /// Scev analysis.
648 ScalarEvolution *SE;
649
650 LoopInfo *LI;
651 DominatorTree *DT;
652
653 const LoopInterchangeLegality &LIL;
654};
655
656struct LoopInterchange {
657 ScalarEvolution *SE = nullptr;
658 LoopInfo *LI = nullptr;
659 DependenceInfo *DI = nullptr;
660 DominatorTree *DT = nullptr;
661 LoopStandardAnalysisResults *AR = nullptr;
662
663 /// Interface to emit optimization remarks.
664 OptimizationRemarkEmitter *ORE;
665
666 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
667 DominatorTree *DT, LoopStandardAnalysisResults *AR,
668 OptimizationRemarkEmitter *ORE)
669 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
670
671 bool run(Loop *L) {
672 if (L->getParentLoop())
673 return false;
674 SmallVector<Loop *, 8> LoopList;
675 populateWorklist(*L, LoopList);
676 return processLoopList(LoopList);
677 }
678
679 bool run(LoopNest &LN) {
680 SmallVector<Loop *, 8> LoopList(LN.getLoops());
681 for (unsigned I = 1; I < LoopList.size(); ++I)
682 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
683 return false;
684 return processLoopList(LoopList);
685 }
686
687 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
688 // TODO: Add a better heuristic to select the loop to be interchanged based
689 // on the dependence matrix. Currently we select the innermost loop.
690 return LoopList.size() - 1;
691 }
692
693 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
694 bool Changed = false;
695
696 // Ensure proper loop nest depth.
697 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
698 "Unsupported depth of loop nest.");
699
700 unsigned LoopNestDepth = LoopList.size();
701
702 LLVM_DEBUG({
703 dbgs() << "Processing LoopList of size = " << LoopNestDepth
704 << " containing the following loops:\n";
705 for (auto *L : LoopList) {
706 dbgs() << " - ";
707 L->print(dbgs());
708 }
709 });
710
711 CharMatrix DependencyMatrix;
712 Loop *OuterMostLoop = *(LoopList.begin());
713 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
714 OuterMostLoop, DI, SE, ORE)) {
715 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
716 return false;
717 }
718
719 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
720 printDepMatrix(DependencyMatrix));
721
722 // Get the Outermost loop exit.
723 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
724 if (!LoopNestExit) {
725 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
726 << "' needs an unique exit block");
727 return false;
728 }
729
730 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
731 CacheCostManager CCM(LoopList[0], AR, DI);
732 // We try to achieve the globally optimal memory access for the loopnest,
733 // and do interchange based on a bubble-sort fasion. We start from
734 // the innermost loop, move it outwards to the best possible position
735 // and repeat this process.
736 for (unsigned j = SelecLoopId; j > 0; j--) {
737 bool ChangedPerIter = false;
738 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
739 bool Interchanged =
740 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
741 ChangedPerIter |= Interchanged;
742 Changed |= Interchanged;
743 }
744 // Early abort if there was no interchange during an entire round of
745 // moving loops outwards.
746 if (!ChangedPerIter)
747 break;
748 }
749 return Changed;
750 }
751
752 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
753 unsigned OuterLoopId,
754 std::vector<std::vector<char>> &DependencyMatrix,
755 CacheCostManager &CCM) {
756 Loop *OuterLoop = LoopList[OuterLoopId];
757 Loop *InnerLoop = LoopList[InnerLoopId];
758 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
759 << " and OuterLoopId = " << OuterLoopId << "\n");
760 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
761 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
762 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
763 << OuterLoop->getName() << "' and '"
764 << InnerLoop->getName() << "'\n");
765 return false;
766 }
767 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
768 << InnerLoop->getName()
769 << "' are legal to interchange\n");
770 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
771 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
772 DependencyMatrix, CCM)) {
773 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
774 << "' and '" << InnerLoop->getName()
775 << "' not profitable.\n");
776 return false;
777 }
778
779 ORE->emit([&]() {
780 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
781 InnerLoop->getStartLoc(),
782 InnerLoop->getHeader())
783 << "Loop interchanged with enclosing loop.";
784 });
785
786 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
787 LIT.transform(LIL.getHasNoWrapReductions(), LIL.getHasNoInfInsts());
788 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
789 << OuterLoop->getName() << "' and inner loop '"
790 << InnerLoop->getName() << "'\n");
791 LoopsInterchanged++;
792
793 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
794
795 // Loops interchanged, update LoopList accordingly.
796 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
797 // Update the DependencyMatrix
798 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
799
800 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
801 printDepMatrix(DependencyMatrix));
802
803 return true;
804 }
805};
806
807} // end anonymous namespace
808
809bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
810 Instruction *Skip) {
811 return any_of(*BB, [Skip](const Instruction &I) {
812 if (&I == Skip)
813 return false;
814 return I.mayHaveSideEffects() || I.mayReadFromMemory();
815 });
816}
817
818bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
819 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
820 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
821 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
822
823 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
824 << "' and '" << InnerLoop->getName()
825 << "' are tightly nested\n");
826
827 // A perfectly nested loop will not have any branch in between the outer and
828 // inner block i.e. outer header will branch to either inner preheader and
829 // outerloop latch.
830 for (BasicBlock *Succ : successors(OuterLoopHeader))
831 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
832 Succ != OuterLoopLatch)
833 return false;
834
835 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
836
837 // The inner loop reduction pattern requires storing the LCSSA PHI in
838 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
839 // that store during checks.
840 Instruction *Skip = nullptr;
841 assert(InnerReductions.size() <= 1 &&
842 "So far we only support at most one reduction.");
843 if (InnerReductions.size() == 1)
844 Skip = InnerReductions[0].LcssaStore;
845
846 // We do not have any basic block in between now make sure the outer header
847 // and outer loop latch doesn't contain any unsafe instructions.
848 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
849 containsUnsafeInstructions(OuterLoopLatch, Skip))
850 return false;
851
852 // Also make sure the inner loop preheader does not contain any unsafe
853 // instructions. Note that all instructions in the preheader will be moved to
854 // the outer loop header when interchanging.
855 if (InnerLoopPreHeader != OuterLoopHeader &&
856 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
857 return false;
858
859 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
860 // Ensure the inner loop exit block flows to the outer loop latch possibly
861 // through empty blocks.
862 const BasicBlock &SuccInner =
863 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
864 if (&SuccInner != OuterLoopLatch) {
865 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
866 << " does not lead to the outer loop latch.\n";);
867 return false;
868 }
869 // The inner loop exit block does flow to the outer loop latch and not some
870 // other BBs, now make sure it contains safe instructions, since it will be
871 // moved into the (new) inner loop after interchange.
872 if (containsUnsafeInstructions(InnerLoopExit, Skip))
873 return false;
874
875 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
876 // We have a perfect loop nest.
877 return true;
878}
879
880bool LoopInterchangeLegality::isLoopStructureUnderstood() {
881 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
882 for (PHINode *InnerInduction : InnerLoopInductions) {
883 unsigned Num = InnerInduction->getNumOperands();
884 for (unsigned i = 0; i < Num; ++i) {
885 Value *Val = InnerInduction->getOperand(i);
886 if (isa<Constant>(Val))
887 continue;
889 if (!I)
890 return false;
891 // TODO: Handle triangular loops.
892 // e.g. for(int i=0;i<N;i++)
893 // for(int j=i;j<N;j++)
894 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
895 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
896 InnerLoopPreheader &&
897 !OuterLoop->isLoopInvariant(I)) {
898 return false;
899 }
900 }
901 }
902
903 // TODO: Handle triangular loops of another form.
904 // e.g. for(int i=0;i<N;i++)
905 // for(int j=0;j<i;j++)
906 // or,
907 // for(int i=0;i<N;i++)
908 // for(int j=0;j*i<N;j++)
909 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
910 CondBrInst *InnerLoopLatchBI =
911 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
912 if (!InnerLoopLatchBI)
913 return false;
914 if (CmpInst *InnerLoopCmp =
915 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
916 Value *Op0 = InnerLoopCmp->getOperand(0);
917 Value *Op1 = InnerLoopCmp->getOperand(1);
918
919 // LHS and RHS of the inner loop exit condition, e.g.,
920 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
921 Value *Left = nullptr;
922 Value *Right = nullptr;
923
924 // Check if V only involves inner loop induction variable.
925 // Return true if V is InnerInduction, or a cast from
926 // InnerInduction, or a binary operator that involves
927 // InnerInduction and a constant.
928 std::function<bool(Value *)> IsPathToInnerIndVar;
929 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
930 if (llvm::is_contained(InnerLoopInductions, V))
931 return true;
932 if (isa<Constant>(V))
933 return true;
935 if (!I)
936 return false;
937 if (isa<CastInst>(I))
938 return IsPathToInnerIndVar(I->getOperand(0));
940 return IsPathToInnerIndVar(I->getOperand(0)) &&
941 IsPathToInnerIndVar(I->getOperand(1));
942 return false;
943 };
944
945 // In case of multiple inner loop indvars, it is okay if LHS and RHS
946 // are both inner indvar related variables.
947 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
948 return true;
949
950 // Otherwise we check if the cmp instruction compares an inner indvar
951 // related variable (Left) with a outer loop invariant (Right).
952 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
953 Left = Op0;
954 Right = Op1;
955 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
956 Left = Op1;
957 Right = Op0;
958 }
959
960 if (Left == nullptr)
961 return false;
962
963 const SCEV *S = SE->getSCEV(Right);
964 if (!SE->isLoopInvariant(S, OuterLoop))
965 return false;
966 }
967
968 return true;
969}
970
971// If SV is a LCSSA PHI node with a single incoming value, return the incoming
972// value.
973static Value *followLCSSA(Value *SV) {
975 if (!PHI)
976 return SV;
977
978 if (PHI->getNumIncomingValues() != 1)
979 return SV;
980 return followLCSSA(PHI->getIncomingValue(0));
981}
982
984 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
985 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
988 // Detect floating point reduction only when it can be reordered.
989 if (RD.getExactFPMathInst() != nullptr)
990 return false;
991
993 switch (RK) {
994 case RecurKind::Or:
995 case RecurKind::And:
996 case RecurKind::Xor:
997 case RecurKind::SMin:
998 case RecurKind::SMax:
999 case RecurKind::UMin:
1000 case RecurKind::UMax:
1001 case RecurKind::AnyOf:
1002 return true;
1003
1004 // Changing the order of floating-point operations may alter the results. If
1005 // a certain instruction has the ninf flag, it means that reordering can
1006 // produce a poison value, which may lead to undefined behavior. To prevent
1007 // this, we must drop the ninf flags if we decide to apply the
1008 // transformation.
1009 case RecurKind::FAdd:
1010 case RecurKind::FMul:
1011 case RecurKind::FMin:
1012 case RecurKind::FMax:
1017 case RecurKind::FMulAdd:
1018 for (Instruction *I : RD.getReductionOpChain(PHI, L))
1019 if (isa<FPMathOperator>(I) && I->hasNoInfs())
1020 HasNoInfInsts.push_back(I);
1021 return true;
1022
1023 // Change the order of integer addition/multiplication may change the
1024 // semantics. Consider the following case:
1025 //
1026 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1027 // int sum = 0;
1028 // for (int i = 0; i < 2; i++)
1029 // for (int j = 0; j < 2; j++)
1030 // sum += A[j][i];
1031 //
1032 // If the above loops are exchanged, the addition will cause an
1033 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1034 // addition/multiplication instructions when we actually exchanges the
1035 // loops.
1036 case RecurKind::Add:
1037 case RecurKind::Mul: {
1038 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1040
1041 // Bail out when we fail to collect reduction instructions chain.
1042 if (Ops.empty())
1043 return false;
1044
1045 for (Instruction *I : Ops) {
1046 assert(I->getOpcode() == OpCode &&
1047 "Expected the instruction to be the reduction operation");
1048 (void)OpCode;
1049
1050 // If the instruction has nuw/nsw flags, we must drop them when the
1051 // transformation is actually performed.
1052 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1053 HasNoWrapInsts.push_back(I);
1054 }
1055 return true;
1056 }
1057
1058 default:
1059 return false;
1060 }
1061 } else
1062 return false;
1063}
1064
1065// Check V's users to see if it is involved in a reduction in L.
1066static PHINode *
1068 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1069 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1070 // Reduction variables cannot be constants.
1071 if (isa<Constant>(V))
1072 return nullptr;
1073
1074 for (Value *User : V->users()) {
1076 if (PHI->getNumIncomingValues() == 1)
1077 continue;
1078
1079 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1080 return PHI;
1081 else
1082 return nullptr;
1083 }
1084 }
1085
1086 return nullptr;
1087}
1088
1089bool LoopInterchangeLegality::isInnerReduction(
1090 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1091
1092 // Only support reduction2Mem when the loop nest to be interchanged is
1093 // the innermost two loops.
1094 if (!L->isInnermost()) {
1095 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1096 ORE->emit([&]() {
1097 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1098 L->getStartLoc(), L->getHeader())
1099 << "Only supported when the loop is the innermost.";
1100 });
1101 return false;
1102 }
1103
1104 if (Phi->getNumIncomingValues() != 2)
1105 return false;
1106
1107 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1108 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1109
1110 // So far only supports constant initial value.
1111 if (!isa<Constant>(Init)) {
1112 LLVM_DEBUG(
1113 dbgs()
1114 << "Only supported for the reduction with a constant initial value.\n");
1115 ORE->emit([&]() {
1116 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1117 L->getStartLoc(), L->getHeader())
1118 << "Only supported for the reduction with a constant initial "
1119 "value.";
1120 });
1121 return false;
1122 }
1123
1124 // The reduction result must live in the inner loop.
1125 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1126 BasicBlock *BB = I->getParent();
1127 if (!L->contains(BB))
1128 return false;
1129 }
1130
1131 // The reduction should have only one user.
1132 if (!Phi->hasOneUser())
1133 return false;
1134
1135 // Check the reduction kind.
1136 if (!checkReductionKind(L, Phi, HasNoWrapInsts, HasNoInfInsts))
1137 return false;
1138
1139 // Find lcssa_phi in OuterLoop's Latch
1140 BasicBlock *ExitBlock = L->getExitBlock();
1141 if (!ExitBlock)
1142 return false;
1143
1144 PHINode *Lcssa = NULL;
1145 for (auto *U : Next->users()) {
1146 if (auto *P = dyn_cast<PHINode>(U)) {
1147 if (P == Phi)
1148 continue;
1149
1150 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1151 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1152 Lcssa = P;
1153 else
1154 return false;
1155 } else
1156 return false;
1157 }
1158 if (!Lcssa)
1159 return false;
1160
1161 if (!Lcssa->hasOneUser()) {
1162 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1163 "the outer loop.\n");
1164 ORE->emit([&]() {
1165 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1166 L->getStartLoc(), L->getHeader())
1167 << "Only supported when the reduction is used once in the outer "
1168 "loop.";
1169 });
1170 return false;
1171 }
1172
1173 StoreInst *LcssaStore =
1175 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1176 return false;
1177
1178 Value *MemRef = LcssaStore->getOperand(1);
1179 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1180
1181 // LcssaStore stores the reduction result in BB.
1182 // When the reduction is initialized from a constant value, we need to load
1183 // from the memory object into the target basic block of the inner loop. This
1184 // means the memory reference was used prematurely. So we must ensure that the
1185 // memory reference does not dominate the target basic block.
1186 // TODO: Move the memory reference definition into the loop header.
1187 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1188 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1189 "the inner loop.\n");
1190 ORE->emit([&]() {
1191 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1192 L->getStartLoc(), L->getHeader())
1193 << "Only supported when memory reference dominate the inner "
1194 "loop.";
1195 });
1196 return false;
1197 }
1198
1199 // Found a reduction in the inner loop.
1200 InnerReduction SR;
1201 SR.Reduction = Phi;
1202 SR.Init = Init;
1203 SR.Next = Next;
1204 SR.LcssaPhi = Lcssa;
1205 SR.LcssaStore = LcssaStore;
1206 SR.MemRef = MemRef;
1207 SR.ElemTy = ElemTy;
1208
1209 InnerReductions.push_back(SR);
1210 return true;
1211}
1212
1213bool LoopInterchangeLegality::findInductionAndReductions(
1214 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1215 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1216 return false;
1217 for (PHINode &PHI : L->getHeader()->phis()) {
1218 InductionDescriptor ID;
1220 Inductions.push_back(&PHI);
1221 else {
1222 // PHIs in inner loops need to be part of a reduction in the outer loop,
1223 // discovered when checking the PHIs of the outer loop earlier.
1224 if (!InnerLoop) {
1225 if (OuterInnerReductions.count(&PHI)) {
1226 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1227 } else if (EnableReduction2Memory &&
1228 isInnerReduction(L, &PHI, HasNoWrapReductions)) {
1229 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1230 << PHI << '\n');
1231 } else
1232 return false;
1233 } else {
1234 assert(PHI.getNumIncomingValues() == 2 &&
1235 "Phis in loop header should have exactly 2 incoming values");
1236 // Check if we have a PHI node in the outer loop that has a reduction
1237 // result from the inner loop as an incoming value.
1238 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1239 PHINode *InnerRedPhi = findInnerReductionPhi(
1240 InnerLoop, V, HasNoWrapReductions, HasNoInfInsts);
1241 if (!InnerRedPhi ||
1242 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1243 LLVM_DEBUG(
1244 dbgs()
1245 << "Failed to recognize PHI as an induction or reduction.\n");
1246 return false;
1247 }
1248 OuterInnerReductions.insert(&PHI);
1249 OuterInnerReductions.insert(InnerRedPhi);
1250 }
1251 }
1252 }
1253
1254 // For now we only support at most one reduction.
1255 if (InnerReductions.size() > 1) {
1256 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1257 ORE->emit([&]() {
1258 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1259 L->getStartLoc(), L->getHeader())
1260 << "Only supports at most one reduction.";
1261 });
1262 return false;
1263 }
1264
1265 return true;
1266}
1267
1268// This function indicates the current limitations in the transform as a result
1269// of which we do not proceed.
1270bool LoopInterchangeLegality::currentLimitations() {
1271 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1272
1273 // transform currently expects the loop latches to also be the exiting
1274 // blocks.
1275 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1276 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1277 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1278 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1279 LLVM_DEBUG(
1280 dbgs() << "Loops where the latch is not the exiting block are not"
1281 << " supported currently.\n");
1282 ORE->emit([&]() {
1283 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1284 OuterLoop->getStartLoc(),
1285 OuterLoop->getHeader())
1286 << "Loops where the latch is not the exiting block cannot be"
1287 " interchange currently.";
1288 });
1289 return true;
1290 }
1291
1292 SmallVector<PHINode *, 8> Inductions;
1293 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1294 LLVM_DEBUG(
1295 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1296 << "are supported currently.\n");
1297 ORE->emit([&]() {
1298 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1299 OuterLoop->getStartLoc(),
1300 OuterLoop->getHeader())
1301 << "Only outer loops with induction or reduction PHI nodes can be"
1302 " interchanged currently.";
1303 });
1304 return true;
1305 }
1306
1307 Inductions.clear();
1308 // For multi-level loop nests, make sure that all phi nodes for inner loops
1309 // at all levels can be recognized as a induction or reduction phi. Bail out
1310 // if a phi node at a certain nesting level cannot be properly recognized.
1311 Loop *CurLevelLoop = OuterLoop;
1312 while (!CurLevelLoop->getSubLoops().empty()) {
1313 // We already made sure that the loop nest is tightly nested.
1314 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1315 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1316 LLVM_DEBUG(
1317 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1318 << "are supported currently.\n");
1319 ORE->emit([&]() {
1320 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1321 CurLevelLoop->getStartLoc(),
1322 CurLevelLoop->getHeader())
1323 << "Only inner loops with induction or reduction PHI nodes can be"
1324 " interchange currently.";
1325 });
1326 return true;
1327 }
1328 }
1329
1330 // TODO: Triangular loops are not handled for now.
1331 if (!isLoopStructureUnderstood()) {
1332 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1333 ORE->emit([&]() {
1334 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1335 InnerLoop->getStartLoc(),
1336 InnerLoop->getHeader())
1337 << "Inner loop structure not understood currently.";
1338 });
1339 return true;
1340 }
1341
1342 return false;
1343}
1344
1345bool LoopInterchangeLegality::findInductions(
1346 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1347 for (PHINode &PHI : L->getHeader()->phis()) {
1348 InductionDescriptor ID;
1350 Inductions.push_back(&PHI);
1351 }
1352 return !Inductions.empty();
1353}
1354
1355/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1356/// users are either of the following:
1357///
1358/// - Reduction PHIs
1359/// - PHIs outside the outer loop
1360/// - PHIs belonging to the latch of the outer loop
1361///
1362/// These conditions mean that we are only interested in the final value after
1363/// the inner loop.
1364static bool
1367 PHINode *LcssaReduction) {
1368 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1369 for (PHINode &PHI : InnerExit->phis()) {
1370 // The reduction LCSSA PHI will have only one incoming block, which comes
1371 // from the loop latch.
1372 if (PHI.getNumIncomingValues() > 1)
1373 return false;
1374 if (&PHI == LcssaReduction)
1375 return true;
1376 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1377 PHINode *PN = dyn_cast<PHINode>(U);
1378 if (!PN)
1379 return true;
1380 if (Reductions.count(PN))
1381 return false;
1382 BasicBlock *PB = PN->getParent();
1383 if (!OuterL->contains(PB))
1384 return false;
1385 return PB != OuterL->getLoopLatch();
1386 }))
1387 return false;
1388 }
1389 return true;
1390}
1391
1392// We currently support LCSSA PHI nodes in the outer loop exit, if their
1393// incoming values do not come from the outer loop latch or if the
1394// outer loop latch has a single predecessor. In that case, the value will
1395// be available if both the inner and outer loop conditions are true, which
1396// will still be true after interchanging. If we have multiple predecessor,
1397// that may not be the case, e.g. because the outer loop latch may be executed
1398// if the inner loop is not executed.
1399static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1400 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1401 for (PHINode &PHI : LoopNestExit->phis()) {
1402 for (Value *Incoming : PHI.incoming_values()) {
1403 Instruction *IncomingI = dyn_cast<Instruction>(Incoming);
1404 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1405 continue;
1406
1407 // The incoming value is defined in the outer loop latch. Currently we
1408 // only support that in case the outer loop latch has a single predecessor.
1409 // This guarantees that the outer loop latch is executed if and only if
1410 // the inner loop is executed (because tightlyNested() guarantees that the
1411 // outer loop header only branches to the inner loop or the outer loop
1412 // latch).
1413 // FIXME: We could weaken this logic and allow multiple predecessors,
1414 // if the values are produced outside the loop latch. We would need
1415 // additional logic to update the PHI nodes in the exit block as
1416 // well.
1417 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1418 return false;
1419 }
1420 }
1421 return true;
1422}
1423
1424// In case of multi-level nested loops, it may occur that lcssa phis exist in
1425// the latch of InnerLoop, i.e., when defs of the incoming values are further
1426// inside the loopnest. Sometimes those incoming values are not available
1427// after interchange, since the original inner latch will become the new outer
1428// latch which may have predecessor paths that do not include those incoming
1429// values.
1430// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1431// multi-level loop nests.
1432static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1433 if (InnerLoop->getSubLoops().empty())
1434 return true;
1435 // If the original outer latch has only one predecessor, then values defined
1436 // further inside the looploop, e.g., in the innermost loop, will be available
1437 // at the new outer latch after interchange.
1438 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1439 return true;
1440
1441 // The outer latch has more than one predecessors, i.e., the inner
1442 // exit and the inner header.
1443 // PHI nodes in the inner latch are lcssa phis where the incoming values
1444 // are defined further inside the loopnest. Check if those phis are used
1445 // in the original inner latch. If that is the case then bail out since
1446 // those incoming values may not be available at the new outer latch.
1447 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1448 for (PHINode &PHI : InnerLoopLatch->phis()) {
1449 for (auto *U : PHI.users()) {
1451 if (InnerLoopLatch == UI->getParent())
1452 return false;
1453 }
1454 }
1455 return true;
1456}
1457
1458bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1459 unsigned OuterLoopId,
1460 CharMatrix &DepMatrix) {
1461 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1462 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1463 << " and OuterLoopId = " << OuterLoopId
1464 << " due to dependence\n");
1465 ORE->emit([&]() {
1466 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1467 InnerLoop->getStartLoc(),
1468 InnerLoop->getHeader())
1469 << "Cannot interchange loops due to dependences.";
1470 });
1471 return false;
1472 }
1473 // Check if outer and inner loop contain legal instructions only.
1474 for (auto *BB : OuterLoop->blocks())
1475 for (Instruction &I : *BB)
1476 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1477 // Functions which don't access memory do not prevent interchanging.
1478 if (CI->doesNotAccessMemory() || isa<PseudoProbeInst>(CI))
1479 continue;
1480 LLVM_DEBUG(
1481 dbgs() << "Loops with call instructions cannot be interchanged "
1482 << "safely.");
1483 ORE->emit([&]() {
1484 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1485 CI->getDebugLoc(),
1486 CI->getParent())
1487 << "Cannot interchange loops due to call instruction.";
1488 });
1489
1490 return false;
1491 }
1492
1493 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1494 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1495 return false;
1496 }
1497
1498 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1499 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1500 ORE->emit([&]() {
1501 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1502 InnerLoop->getStartLoc(),
1503 InnerLoop->getHeader())
1504 << "Cannot interchange loops because unsupported PHI nodes found "
1505 "in inner loop latch.";
1506 });
1507 return false;
1508 }
1509
1510 // TODO: The loops could not be interchanged due to current limitations in the
1511 // transform module.
1512 if (currentLimitations()) {
1513 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1514 return false;
1515 }
1516
1517 // Check if the loops are tightly nested.
1518 if (!tightlyNested(OuterLoop, InnerLoop)) {
1519 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1520 ORE->emit([&]() {
1521 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1522 InnerLoop->getStartLoc(),
1523 InnerLoop->getHeader())
1524 << "Cannot interchange loops because they are not tightly "
1525 "nested.";
1526 });
1527 return false;
1528 }
1529
1530 // The LCSSA PHI for the reduction has passed checks before; its user
1531 // is a store instruction.
1532 PHINode *LcssaReduction = nullptr;
1533 assert(InnerReductions.size() <= 1 &&
1534 "So far we only support at most one reduction.");
1535 if (InnerReductions.size() == 1)
1536 LcssaReduction = InnerReductions[0].LcssaPhi;
1537
1538 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1539 LcssaReduction)) {
1540 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1541 ORE->emit([&]() {
1542 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1543 InnerLoop->getStartLoc(),
1544 InnerLoop->getHeader())
1545 << "Found unsupported PHI node in loop exit.";
1546 });
1547 return false;
1548 }
1549
1550 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1551 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1552 ORE->emit([&]() {
1553 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1554 OuterLoop->getStartLoc(),
1555 OuterLoop->getHeader())
1556 << "Found unsupported PHI node in loop exit.";
1557 });
1558 return false;
1559 }
1560
1561 if (any_of(OuterLoop->getLoopLatch()->phis(),
1562 [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1563 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1564 "value are supported.\n");
1565 ORE->emit([&]() {
1566 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1567 OuterLoop->getStartLoc(),
1568 OuterLoop->getHeader())
1569 << "Only outer loop latch PHI nodes with one incoming value are "
1570 "supported.";
1571 });
1572 return false;
1573 }
1574
1575 return true;
1576}
1577
1578void CacheCostManager::computeIfUnitinialized() {
1579 if (CC.has_value())
1580 return;
1581
1582 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1583 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1584 // Obtain the loop vector returned from loop cache analysis beforehand,
1585 // and put each <Loop, index> pair into a map for constant time query
1586 // later. Indices in loop vector reprsent the optimal order of the
1587 // corresponding loop, e.g., given a loopnest with depth N, index 0
1588 // indicates the loop should be placed as the outermost loop and index N
1589 // indicates the loop should be placed as the innermost loop.
1590 //
1591 // For the old pass manager CacheCost would be null.
1592 if (*CC != nullptr)
1593 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1594 CostMap[Cost.first] = Idx;
1595}
1596
1597CacheCost *CacheCostManager::getCacheCost() {
1598 computeIfUnitinialized();
1599 return CC->get();
1600}
1601
1602const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1603 computeIfUnitinialized();
1604 return CostMap;
1605}
1606
1607/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1608/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1609/// return std::nullopt, which indicates we cannot determine the coefficient of
1610/// the addrec for \p L in \S.
1611/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1612/// do that.
1613std::optional<const SCEV *> getAddRecCoefficient(ScalarEvolution &SE,
1614 const SCEV *S, const Loop *L) {
1616 if (!AR) {
1617 if (SE.isLoopInvariant(S, L))
1618 return nullptr;
1619 return std::nullopt;
1620 }
1621
1622 if (!AR->isAffine()) {
1623 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1624 return std::nullopt;
1625 }
1626
1627 std::optional<const SCEV *> Coeff =
1628 getAddRecCoefficient(SE, AR->getStart(), L);
1629 if (!Coeff.has_value())
1630 return std::nullopt;
1631
1632 if (AR->getLoop() == L) {
1633 assert(!*Coeff && "Found more than one addrec for the same loop");
1634 Coeff = AR->getStepRecurrence(SE);
1635 }
1636 return Coeff;
1637}
1638
1639int LoopInterchangeProfitability::getInstrOrderCost() {
1640 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1641 for (BasicBlock *BB : InnerLoop->blocks()) {
1642 for (Instruction &Ins : *BB) {
1643 if (!isa<LoadInst, StoreInst>(&Ins))
1644 continue;
1645 const SCEV *Access = SE->getSCEV(getLoadStorePointerOperand(&Ins));
1646 const SCEV *BasePtr = SE->getPointerBase(Access);
1647 std::optional<const SCEV *> OuterCoeff =
1648 getAddRecCoefficient(*SE, Access, OuterLoop);
1649 std::optional<const SCEV *> InnerCoeff =
1650 getAddRecCoefficient(*SE, Access, InnerLoop);
1651
1652 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1653 !*InnerCoeff)
1654 continue;
1655
1656 // This heuristic assumes that a smaller step recurrence implies that the
1657 // induction variable corresponding to the loop is used in the inner
1658 // dimension of the array. Placing such a loop in the inner position would
1659 // be beneficial in terms of locality. If the array access is of the form
1660 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1661 // interchange, but we expect such cases to be rare.
1662 const SCEV *OuterStep = SE->getAbsExpr(*OuterCoeff, /*IsNSW=*/false);
1663 const SCEV *InnerStep = SE->getAbsExpr(*InnerCoeff, /*IsNSW=*/false);
1664 // If we find the inner induction after an outer induction e.g.
1665 //
1666 // for(int i=0;i<N;i++)
1667 // for(int j=0;j<N;j++)
1668 // A[i][j] = A[i-1][j-1]+k;
1669 //
1670 //
1671 // then it is a good order. If we find the outer induction after an inner
1672 // induction e.g.
1673 //
1674 // for(int i=0;i<N;i++)
1675 // for(int j=0;j<N;j++)
1676 // A[j][i] = A[j-1][i-1]+k;
1677 //
1678 // then it is a bad order.
1679 //
1680 // To avoid counting the same base pointers multiple times, we deduplicate
1681 // them by using a set of base pointers.
1682 if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, InnerStep, OuterStep))
1683 GoodBasePtrs.insert(BasePtr);
1684 else if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, OuterStep, InnerStep))
1685 BadBasePtrs.insert(BasePtr);
1686 }
1687 }
1688
1689 int GoodOrder = GoodBasePtrs.size();
1690 int BadOrder = BadBasePtrs.size();
1691 return GoodOrder - BadOrder;
1692}
1693
1694std::optional<bool>
1695LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1696 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1697 // This is the new cost model returned from loop cache analysis.
1698 // A smaller index means the loop should be placed an outer loop, and vice
1699 // versa.
1700 auto InnerLoopIt = CostMap.find(InnerLoop);
1701 if (InnerLoopIt == CostMap.end())
1702 return std::nullopt;
1703 auto OuterLoopIt = CostMap.find(OuterLoop);
1704 if (OuterLoopIt == CostMap.end())
1705 return std::nullopt;
1706
1707 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1708 return std::nullopt;
1709 unsigned InnerIndex = InnerLoopIt->second;
1710 unsigned OuterIndex = OuterLoopIt->second;
1711 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1712 << ", OuterIndex = " << OuterIndex << "\n");
1713 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1714 "numbers to each loop");
1715 return std::optional<bool>(InnerIndex < OuterIndex);
1716}
1717
1718std::optional<bool>
1719LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1720 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1721 // good and bad order of induction variables in the instruction and allows
1722 // reordering if number of bad orders is more than good.
1723 int Cost = getInstrOrderCost();
1724 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1726 return std::optional<bool>(true);
1727
1728 return std::nullopt;
1729}
1730
1731/// Return true if we can vectorize the loop specified by \p LoopId.
1732static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1733 for (const auto &Dep : DepMatrix) {
1734 char Dir = Dep[LoopId];
1735 char DepType = Dep.back();
1736 assert((DepType == '<' || DepType == '*') &&
1737 "Unexpected element in dependency vector");
1738
1739 // There are no loop-carried dependencies.
1740 if (Dir == '=' || Dir == 'I')
1741 continue;
1742
1743 // DepType being '<' means that this direction vector represents a forward
1744 // dependency. In principle, a loop with '<' direction can be vectorized in
1745 // this case.
1746 if (Dir == '<' && DepType == '<')
1747 continue;
1748
1749 // We cannot prove that the loop is vectorizable.
1750 return false;
1751 }
1752 return true;
1753}
1754
1755std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1756 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1757 // If the outer loop cannot be vectorized, it is not profitable to move this
1758 // to inner position.
1759 if (!canVectorize(DepMatrix, OuterLoopId))
1760 return false;
1761
1762 // If the inner loop cannot be vectorized but the outer loop can be, then it
1763 // is profitable to interchange to enable inner loop parallelism.
1764 if (!canVectorize(DepMatrix, InnerLoopId))
1765 return true;
1766
1767 // If both the inner and the outer loop can be vectorized, it is necessary to
1768 // check the cost of each vectorized loop for profitability decision. At this
1769 // time we do not have a cost model to estimate them, so return nullopt.
1770 // TODO: Estimate the cost of vectorized loop when both the outer and the
1771 // inner loop can be vectorized.
1772 return std::nullopt;
1773}
1774
1775bool LoopInterchangeProfitability::isProfitable(
1776 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1777 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1778 // Do not consider loops with a backedge that isn't taken, e.g. an
1779 // unconditional branch true/false, as candidates for interchange.
1780 // TODO: when interchange is forced, we should probably also allow
1781 // interchange for these loops, and thus this logic should be moved just
1782 // below the cost-model ignore check below. But this check is done first
1783 // to avoid the issue in #163954.
1784 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1785 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1786 if (InnerBTC && InnerBTC->isZero()) {
1787 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1788 "single iteration loop\n");
1789 return false;
1790 }
1791 if (OuterBTC && OuterBTC->isZero()) {
1792 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1793 "single iteration loop\n");
1794 return false;
1795 }
1796
1797 // Return true if interchange is forced and the cost-model ignored.
1798 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1799 return true;
1801 "Duplicate rules and option 'ignore' are not allowed");
1802
1803 // isProfitable() is structured to avoid endless loop interchange. If the
1804 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1805 // decide the profitability then, profitability check will stop and return the
1806 // analysis result. If it failed to determine it (e.g., cache analysis failed
1807 // to analyze the loopnest due to delinearization issues) then go ahead the
1808 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1809 // Likewise, if it failed to analysis the profitability then only, the last
1810 // rule (isProfitableForVectorization by default) will decide.
1811 std::optional<bool> shouldInterchange;
1812 for (RuleTy RT : Profitabilities) {
1813 switch (RT) {
1814 case RuleTy::PerLoopCacheAnalysis: {
1815 CacheCost *CC = CCM.getCacheCost();
1816 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1817 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1818 break;
1819 }
1820 case RuleTy::PerInstrOrderCost:
1821 shouldInterchange = isProfitablePerInstrOrderCost();
1822 break;
1823 case RuleTy::ForVectorization:
1824 shouldInterchange =
1825 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1826 break;
1827 case RuleTy::Ignore:
1828 llvm_unreachable("Option 'ignore' is not supported with other options");
1829 break;
1830 }
1831
1832 // If this rule could determine the profitability, don't call subsequent
1833 // rules.
1834 if (shouldInterchange.has_value())
1835 break;
1836 }
1837
1838 if (!shouldInterchange.has_value()) {
1839 ORE->emit([&]() {
1840 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1841 InnerLoop->getStartLoc(),
1842 InnerLoop->getHeader())
1843 << "Insufficient information to calculate the cost of loop for "
1844 "interchange.";
1845 });
1846 return false;
1847 } else if (!shouldInterchange.value()) {
1848 ORE->emit([&]() {
1849 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1850 InnerLoop->getStartLoc(),
1851 InnerLoop->getHeader())
1852 << "Interchanging loops is not considered to improve cache "
1853 "locality nor vectorization.";
1854 });
1855 return false;
1856 }
1857 return true;
1858}
1859
1860void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1861 Loop *InnerLoop) {
1862 for (Loop *L : *OuterLoop)
1863 if (L == InnerLoop) {
1864 OuterLoop->removeChildLoop(L);
1865 return;
1866 }
1867 llvm_unreachable("Couldn't find loop");
1868}
1869
1870/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1871/// new inner and outer loop after interchanging: NewInner is the original
1872/// outer loop and NewOuter is the original inner loop.
1873///
1874/// Before interchanging, we have the following structure
1875/// Outer preheader
1876// Outer header
1877// Inner preheader
1878// Inner header
1879// Inner body
1880// Inner latch
1881// outer bbs
1882// Outer latch
1883//
1884// After interchanging:
1885// Inner preheader
1886// Inner header
1887// Outer preheader
1888// Outer header
1889// Inner body
1890// outer bbs
1891// Outer latch
1892// Inner latch
1893void LoopInterchangeTransform::restructureLoops(
1894 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1895 BasicBlock *OrigOuterPreHeader) {
1896 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1897 // The original inner loop preheader moves from the new inner loop to
1898 // the parent loop, if there is one.
1899 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1900 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1901
1902 // Switch the loop levels.
1903 if (OuterLoopParent) {
1904 // Remove the loop from its parent loop.
1905 removeChildLoop(OuterLoopParent, NewInner);
1906 removeChildLoop(NewInner, NewOuter);
1907 OuterLoopParent->addChildLoop(NewOuter);
1908 } else {
1909 removeChildLoop(NewInner, NewOuter);
1910 LI->changeTopLevelLoop(NewInner, NewOuter);
1911 }
1912 while (!NewOuter->isInnermost())
1913 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1914 NewOuter->addChildLoop(NewInner);
1915
1916 // BBs from the original inner loop.
1917 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1918
1919 // Add BBs from the original outer loop to the original inner loop (excluding
1920 // BBs already in inner loop)
1921 for (BasicBlock *BB : NewInner->blocks())
1922 if (LI->getLoopFor(BB) == NewInner)
1923 NewOuter->addBlockEntry(BB);
1924
1925 // Now remove inner loop header and latch from the new inner loop and move
1926 // other BBs (the loop body) to the new inner loop.
1927 BasicBlock *OuterHeader = NewOuter->getHeader();
1928 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1929 for (BasicBlock *BB : OrigInnerBBs) {
1930 // Nothing will change for BBs in child loops.
1931 if (LI->getLoopFor(BB) != NewOuter)
1932 continue;
1933 // Remove the new outer loop header and latch from the new inner loop.
1934 if (BB == OuterHeader || BB == OuterLatch)
1935 NewInner->removeBlockFromLoop(BB);
1936 else
1937 LI->changeLoopFor(BB, NewInner);
1938 }
1939
1940 // The preheader of the original outer loop becomes part of the new
1941 // outer loop.
1942 NewOuter->addBlockEntry(OrigOuterPreHeader);
1943 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1944
1945 // Tell SE that we move the loops around.
1946 SE->forgetLoop(NewOuter);
1947}
1948
1949/// User can write, or optimizers can generate the reduction for inner loop.
1950/// To make the interchange valid, apply Reduction2Mem by moving the
1951/// initializer and store instructions into the inner loop. So far we only
1952/// handle cases where the reduction variable is initialized to a constant.
1953/// For example, below code:
1954///
1955/// loop:
1956/// re = phi<0.0, next>
1957/// next = re op ...
1958/// endloop
1959/// reduc_sum = phi<next> // lcssa phi
1960/// MEM_REF[idx] = reduc_sum // LcssaStore
1961///
1962/// is transformed into:
1963///
1964/// loop:
1965/// tmp = MEM_REF[idx];
1966/// new_var = !first_iteration ? tmp : 0.0;
1967/// next = new_var op ...
1968/// MEM_REF[idx] = next; // after moving
1969/// endloop
1970///
1971/// In this way the initial const is used in the first iteration of loop.
1972void LoopInterchangeTransform::reduction2Memory() {
1974 LIL.getInnerReductions();
1975
1976 assert(InnerReductions.size() == 1 &&
1977 "So far we only support at most one reduction.");
1978
1979 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
1980 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1981 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1982
1983 // Check if it's the first iteration.
1984 LLVMContext &Context = InnerLoopHeader->getContext();
1985 PHINode *FirstIter =
1986 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
1987 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
1988 InnerLoop->getLoopPreheader());
1989 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
1990 InnerLoop->getLoopLatch());
1991 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1992
1993 // When the reduction is initialized from a constant value, we need to add
1994 // a stmt loading from the memory object to target basic block in inner
1995 // loop.
1996 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
1997
1998 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
1999 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
2000
2001 // Replace all uses of the reduction variable with a new variable.
2002 SR.Reduction->replaceAllUsesWith(NewVar);
2003
2004 // Move store instruction into inner loop, just after reduction next's
2005 // definition.
2006 SR.LcssaStore->setOperand(0, SR.Next);
2007 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
2008}
2009
2010bool LoopInterchangeTransform::transform(
2011 ArrayRef<Instruction *> DropNoWrapInsts,
2012 ArrayRef<Instruction *> DropNoInfInsts) {
2013 bool Transformed = false;
2014
2016 LIL.getInnerReductions();
2017 if (InnerReductions.size() == 1)
2018 reduction2Memory();
2019
2020 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2021 auto &InductionPHIs = LIL.getInnerLoopInductions();
2022 if (InductionPHIs.empty()) {
2023 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
2024 return false;
2025 }
2026
2027 SmallVector<Instruction *, 8> InnerIndexVarList;
2028 for (PHINode *CurInductionPHI : InductionPHIs) {
2029 Instruction *IncomingValue = dyn_cast<Instruction>(
2030 CurInductionPHI->getIncomingValueForBlock(InnerLoop->getLoopLatch()));
2031 assert(IncomingValue &&
2032 "Incoming value from loop latch isn't an instruction");
2033 if (is_contained(InductionPHIs, IncomingValue))
2034 continue;
2035 InnerIndexVarList.push_back(IncomingValue);
2036 }
2037
2038 // Create a new latch block for the inner loop. We split at the
2039 // current latch's terminator and then move the condition and all
2040 // operands that are not either loop-invariant or the induction PHI into the
2041 // new latch block.
2042 BasicBlock *NewLatch =
2043 SplitBlock(InnerLoop->getLoopLatch(),
2044 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2045
2046 SmallSetVector<Instruction *, 4> WorkList;
2047 unsigned i = 0;
2048 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2049 for (; i < WorkList.size(); i++) {
2050 // Duplicate instruction and move it to the new latch. Update uses that
2051 // have been moved.
2052 Instruction *NewI = WorkList[i]->clone();
2053 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
2054 assert(!NewI->mayHaveSideEffects() &&
2055 "Moving instructions with side-effects may change behavior of "
2056 "the loop nest!");
2057 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
2058 Instruction *UserI = cast<Instruction>(U.getUser());
2059 if (!InnerLoop->contains(UserI->getParent()) ||
2060 UserI->getParent() == NewLatch ||
2061 llvm::is_contained(InductionPHIs, UserI))
2062 U.set(NewI);
2063 }
2064 // Add operands of moved instruction to the worklist, except if they are
2065 // outside the inner loop or are the induction PHI.
2066 for (Value *Op : WorkList[i]->operands()) {
2068 if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
2069 llvm::is_contained(InductionPHIs, OpI))
2070 continue;
2071 WorkList.insert(OpI);
2072 }
2073 }
2074 };
2075
2076 // FIXME: Should we interchange when we have a constant condition?
2079 ->getCondition());
2080 if (CondI)
2081 WorkList.insert(CondI);
2082 MoveInstructions();
2083 for (Instruction *InnerIndexVar : InnerIndexVarList)
2084 WorkList.insert(cast<Instruction>(InnerIndexVar));
2085 MoveInstructions();
2086
2087 // Ensure the inner loop phi nodes have a separate basic block.
2088 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2089 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2090 InnerLoopHeader->getTerminator()) {
2091 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2092 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2093 }
2094
2095 // Instructions in the original inner loop preheader may depend on values
2096 // defined in the outer loop header. Move them there, because the original
2097 // inner loop preheader will become the entry into the interchanged loop nest.
2098 // Currently we move all instructions and rely on LICM to move invariant
2099 // instructions outside the loop nest.
2100 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2101 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2102
2103 if (InnerLoopPreHeader != OuterLoopHeader) {
2104 // Eliminate PHIs in the inner-loop preheader.
2105 for (PHINode &P : make_early_inc_range(InnerLoopPreHeader->phis())) {
2106 assert(P.getNumIncomingValues() == 1 &&
2107 "Expected single-incoming PHIs in inner loop preheader");
2108 P.replaceAllUsesWith(P.getIncomingValue(0));
2109 P.eraseFromParent();
2110 }
2111 for (Instruction &I :
2112 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2113 std::prev(InnerLoopPreHeader->end()))))
2114 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2115 }
2116
2117 Transformed |= adjustLoopLinks();
2118 if (!Transformed) {
2119 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2120 return false;
2121 }
2122
2123 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2124 // calculations.
2125 for (Instruction *Reduction : DropNoWrapInsts) {
2126 Reduction->setHasNoSignedWrap(false);
2127 Reduction->setHasNoUnsignedWrap(false);
2128 }
2129 for (Instruction *I : DropNoInfInsts)
2130 I->setHasNoInfs(false);
2131
2132 return true;
2133}
2134
2135/// \brief Move all instructions except the terminator from FromBB right before
2136/// InsertBefore
2137static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2138 BasicBlock *ToBB = InsertBefore->getParent();
2139
2140 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2141 FromBB->getTerminator()->getIterator());
2142}
2143
2144/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2145static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2146 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2147 // from BB1 afterwards.
2148 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2149 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2150 for (Instruction *I : TempInstrs)
2151 I->removeFromParent();
2152
2153 // Move instructions from BB2 to BB1.
2154 moveBBContents(BB2, BB1->getTerminator());
2155
2156 // Move instructions from TempInstrs to BB2.
2157 for (Instruction *I : TempInstrs)
2158 I->insertBefore(BB2->getTerminator()->getIterator());
2159}
2160
2161// Update BI to jump to NewBB instead of OldBB. Records updates to the
2162// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2163// \p OldBB is exactly once in BI's successor list.
2164static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2165 BasicBlock *NewBB,
2166 std::vector<DominatorTree::UpdateType> &DTUpdates,
2167 bool MustUpdateOnce = true) {
2168 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2169 "BI must jump to OldBB exactly once.");
2170 bool Changed = false;
2171 for (Use &Op : Term->operands())
2172 if (Op == OldBB) {
2173 Op.set(NewBB);
2174 Changed = true;
2175 }
2176
2177 if (Changed) {
2178 DTUpdates.push_back(
2179 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2180 DTUpdates.push_back(
2181 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2182 }
2183 assert(Changed && "Expected a successor to be updated");
2184}
2185
2186// Move Lcssa PHIs to the right place.
2187static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2188 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2189 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2190 Loop *InnerLoop, LoopInfo *LI) {
2191
2192 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2193 // defined either in the header or latch. Those blocks will become header and
2194 // latch of the new outer loop, and the only possible users can PHI nodes
2195 // in the exit block of the loop nest or the outer loop header (reduction
2196 // PHIs, in that case, the incoming value must be defined in the inner loop
2197 // header). We can just substitute the user with the incoming value and remove
2198 // the PHI.
2199 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2200 assert(P.getNumIncomingValues() == 1 &&
2201 "Only loops with a single exit are supported!");
2202
2203 // Incoming values are guaranteed be instructions currently.
2204 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
2205 // In case of multi-level nested loops, follow LCSSA to find the incoming
2206 // value defined from the innermost loop.
2207 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
2208 // Skip phis with incoming values from the inner loop body, excluding the
2209 // header and latch.
2210 if (IncIInnerMost->getParent() != InnerLatch &&
2211 IncIInnerMost->getParent() != InnerHeader)
2212 continue;
2213
2214 assert(all_of(P.users(),
2215 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2216 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2217 IncI->getParent() == InnerHeader) ||
2218 cast<PHINode>(U)->getParent() == OuterExit;
2219 }) &&
2220 "Can only replace phis iff the uses are in the loop nest exit or "
2221 "the incoming value is defined in the inner header (it will "
2222 "dominate all loop blocks after interchanging)");
2223 P.replaceAllUsesWith(IncI);
2224 P.eraseFromParent();
2225 }
2226
2227 SmallVector<PHINode *, 8> LcssaInnerExit(
2228 llvm::make_pointer_range(InnerExit->phis()));
2229
2230 SmallVector<PHINode *, 8> LcssaInnerLatch(
2231 llvm::make_pointer_range(InnerLatch->phis()));
2232
2233 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2234 // If a PHI node has users outside of InnerExit, it has a use outside the
2235 // interchanged loop and we have to preserve it. We move these to
2236 // InnerLatch, which will become the new exit block for the innermost
2237 // loop after interchanging.
2238 for (PHINode *P : LcssaInnerExit)
2239 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2240
2241 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2242 // and we have to move them to the new inner latch.
2243 for (PHINode *P : LcssaInnerLatch)
2244 P->moveBefore(InnerExit->getFirstNonPHIIt());
2245
2246 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2247 // incoming values defined in the outer loop, we have to add a new PHI
2248 // in the inner loop latch, which became the exit block of the outer loop,
2249 // after interchanging.
2250 if (OuterExit) {
2251 for (PHINode &P : OuterExit->phis()) {
2252 if (P.getNumIncomingValues() != 1)
2253 continue;
2254 // Skip Phis with incoming values defined in the inner loop. Those should
2255 // already have been updated.
2256 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2257 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2258 continue;
2259
2260 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2261 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2262 NewPhi->setIncomingBlock(0, OuterLatch);
2263 // We might have incoming edges from other BBs, i.e., the original outer
2264 // header.
2265 for (auto *Pred : predecessors(InnerLatch)) {
2266 if (Pred == OuterLatch)
2267 continue;
2268 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2269 }
2270 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2271 P.setIncomingValue(0, NewPhi);
2272 }
2273 }
2274
2275 // Now adjust the incoming blocks for the LCSSA PHIs.
2276 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2277 // with the new latch.
2278 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2279}
2280
2281/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2282/// block: the outer loop latch block does not need to be exit block of the
2283/// inner loop. Consider a loop that was in LCSSA form, but then some
2284/// transformation like loop-unswitch comes along and creates an empty block,
2285/// where BB5 in this example is the outer loop latch block:
2286///
2287/// BB4:
2288/// br label %BB5
2289/// BB5:
2290/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2291/// br outer.header
2292///
2293/// Interchange then brings it in LCSSA form again resulting in this chain of
2294/// single-input phi nodes:
2295///
2296/// BB4:
2297/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2298/// br label %BB5
2299/// BB5:
2300/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2301///
2302/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2303/// use before the def if we don't check this. The solution is to simplify
2304/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2305///
2306static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2307 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2308 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2309
2310 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2311 if (OuterLoopLatch == InnerLoopExit)
2312 return;
2313
2314 // Collect and remove phis in non-exit blocks if they have 1 input.
2316 llvm::make_pointer_range(OuterLoopLatch->phis()));
2317 for (PHINode *Phi : Phis) {
2318 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2319 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2320 << "\n");
2321 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2322 Phi->eraseFromParent();
2323 }
2324}
2325
2326bool LoopInterchangeTransform::adjustLoopBranches() {
2327 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2328 std::vector<DominatorTree::UpdateType> DTUpdates;
2329
2330 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2331 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2332
2333 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2334 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2335 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2336
2337 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2338
2339 // Ensure that both preheaders do not contain PHI nodes and have single
2340 // predecessors. This allows us to move them easily. We use
2341 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2342 // preheaders do not satisfy those conditions.
2343 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2344 !OuterLoopPreHeader->getUniquePredecessor())
2345 OuterLoopPreHeader =
2346 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2347 if (InnerLoopPreHeader == OuterLoop->getHeader())
2348 InnerLoopPreHeader =
2349 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2350
2351 // Adjust the loop preheader
2352 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2353 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2354 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2355 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2356 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2357 BasicBlock *InnerLoopLatchPredecessor =
2358 InnerLoopLatch->getUniquePredecessor();
2359 BasicBlock *InnerLoopLatchSuccessor;
2360 BasicBlock *OuterLoopLatchSuccessor;
2361
2362 CondBrInst *OuterLoopLatchBI =
2363 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2364 CondBrInst *InnerLoopLatchBI =
2365 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2366 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2367 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2368
2369 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2370 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2371 !InnerLoopHeaderBI)
2372 return false;
2373
2374 Instruction *InnerLoopLatchPredecessorBI =
2375 InnerLoopLatchPredecessor->getTerminator();
2376 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2377
2378 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2379 return false;
2380 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2381 if (!InnerLoopHeaderSuccessor)
2382 return false;
2383
2384 // Adjust Loop Preheader and headers.
2385 // The branches in the outer loop predecessor and the outer loop header can
2386 // be unconditional branches or conditional branches with duplicates. Consider
2387 // this when updating the successors.
2388 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2389 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2390 // The outer loop header might or might not branch to the outer latch.
2391 // We are guaranteed to branch to the inner loop preheader.
2392 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2393 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2394 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2395 DTUpdates,
2396 /*MustUpdateOnce=*/false);
2397 }
2398 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2399 InnerLoopHeaderSuccessor, DTUpdates,
2400 /*MustUpdateOnce=*/false);
2401
2402 // Adjust reduction PHI's now that the incoming block has changed.
2403 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2404 OuterLoopHeader);
2405
2406 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2407 OuterLoopPreHeader, DTUpdates);
2408
2409 // -------------Adjust loop latches-----------
2410 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2411 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2412 else
2413 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2414
2415 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2416 InnerLoopLatchSuccessor, DTUpdates);
2417
2418 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2419 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2420 else
2421 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2422
2423 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2424 OuterLoopLatchSuccessor, DTUpdates);
2425 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2426 DTUpdates);
2427
2428 DT->applyUpdates(DTUpdates);
2429 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2430 OuterLoopPreHeader);
2431
2432 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2433 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2434 InnerLoop, LI);
2435 // For PHIs in the exit block of the outer loop, outer's latch has been
2436 // replaced by Inners'.
2437 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2438
2439 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2440 // Now update the reduction PHIs in the inner and outer loop headers.
2441 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2442 for (PHINode &PHI : InnerLoopHeader->phis())
2443 if (OuterInnerReductions.contains(&PHI))
2444 InnerLoopPHIs.push_back(&PHI);
2445
2446 for (PHINode &PHI : OuterLoopHeader->phis())
2447 if (OuterInnerReductions.contains(&PHI))
2448 OuterLoopPHIs.push_back(&PHI);
2449
2450 // Now move the remaining reduction PHIs from outer to inner loop header and
2451 // vice versa. The PHI nodes must be part of a reduction across the inner and
2452 // outer loop and all the remains to do is and updating the incoming blocks.
2453 for (PHINode *PHI : OuterLoopPHIs) {
2454 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2455 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2456 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2457 }
2458 for (PHINode *PHI : InnerLoopPHIs) {
2459 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2460 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2461 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2462 }
2463
2464 // Update the incoming blocks for moved PHI nodes.
2465 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2466 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2467 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2468 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2469
2470 // Values defined in the outer loop header could be used in the inner loop
2471 // latch. In that case, we need to create LCSSA phis for them, because after
2472 // interchanging they will be defined in the new inner loop and used in the
2473 // new outer loop.
2474 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2475 for (Instruction &I :
2476 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2477 MayNeedLCSSAPhis.push_back(&I);
2478 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2479
2480 return true;
2481}
2482
2483bool LoopInterchangeTransform::adjustLoopLinks() {
2484 // Adjust all branches in the inner and outer loop.
2485 bool Changed = adjustLoopBranches();
2486 if (Changed) {
2487 // We have interchanged the preheaders so we need to interchange the data in
2488 // the preheaders as well. This is because the content of the inner
2489 // preheader was previously executed inside the outer loop.
2490 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2491 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2492 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2493 }
2494 return Changed;
2495}
2496
2500 LPMUpdater &U) {
2501 Function &F = *LN.getParent();
2502 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2503
2505
2506 // Ensure minimum depth of the loop nest to do the interchange.
2507 if (!hasSupportedLoopDepth(LoopList, ORE))
2508 return PreservedAnalyses::all();
2509 // Ensure computable loop nest.
2510 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2511 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2512 return PreservedAnalyses::all();
2513 }
2514
2515 ORE.emit([&]() {
2516 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2519 << "Computed dependence info, invoking the transform.";
2520 });
2521
2522 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2523 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2524 return PreservedAnalyses::all();
2525 U.markLoopNestChanged(true);
2527}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
DXIL Resource Access
#define DEBUG_TYPE
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:348
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:253
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
std::optional< const SCEV * > getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L)
If \S contains an affine addrec for L, return the step recurrence of it.
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static cl::opt< unsigned int > MaxMemInstrRatio("loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden, cl::desc("Maximum number of load/store instructions squared in relation to " "the total number of instructions. Higher value may lead to more " "interchanges at the cost of compile-time"))
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
We currently only support LCSSA PHI nodes in the inner loop exit if their users are either of the fol...
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
static LLVM_ABI std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:85
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, ArrayRef< const SCEVPredicate * > NoWrapPreds={}, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
void changeLoopFor(const BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
StringRef getName() const
Definition LoopInfo.h:407
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getAbsExpr(const SCEV *Op, bool IsNSW)
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
LLVM_ABI PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...