LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
64 "loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
95 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
100 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>({RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
123 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 if (!isa<Instruction>(I))
183 return false;
184 NumInsts++;
185 if (!isa<Instruction>(I))
186 return false;
187 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
188 if (!Ld->isSimple())
189 return false;
190 MemInstr.push_back(&I);
191 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
192 if (!St->isSimple())
193 return false;
194 MemInstr.push_back(&I);
195 }
196 }
197 }
198
199 // To populate the dependence matrix, we perform dependence test for each pair
200 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
201 // that even if the number of memory instructions is small, the analysis can
202 // still be expensive if the most of the instructions in the loop are memory
203 // instructions. On the other hand, if the number of memory instructions is
204 // not small, but the loop is large (i.e., it contains many non-memory
205 // instructions), the analysis can still be affordable.
206 unsigned NumMemInstr = MemInstr.size();
207 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
208 << " Loads and Stores to analyze\n");
209 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
210 ORE->emit([&]() {
211 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
212 L->getStartLoc(), L->getHeader())
213 << "Number of loads/stores exceeded, the supported maximum can be "
214 "increased with option -loop-interchange-max-mem-instr-ratio.";
215 });
216 return false;
217 }
218 ValueVector::iterator I, IE, J, JE;
219
220 // Manage direction vectors that are already seen. Map each direction vector
221 // to an index of DepMatrix at which it is stored.
223
224 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
225 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
226 std::vector<char> Dep;
229 // Ignore Input dependencies.
230 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
231 continue;
232 // Track Output, Flow, and Anti dependencies.
233 if (auto D = DI->depends(Src, Dst)) {
234 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
235 // If the direction vector is negative, normalize it to
236 // make it non-negative.
237 if (D->normalize(SE))
238 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
239 LLVM_DEBUG(StringRef DepType =
240 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
241 dbgs() << "Found " << DepType
242 << " dependency between Src and Dst\n"
243 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
244 unsigned Levels = D->getLevels();
245 char Direction;
246 for (unsigned II = 1; II <= Levels; ++II) {
247 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
248 // or `=`, for which we don't have an equivalent representation, so
249 // that the conservative approximation is necessary. The same goes for
250 // `DVEntry::GE`.
251 // TODO: Use of fine-grained expressions allows for more accurate
252 // analysis.
253 unsigned Dir = D->getDirection(II);
254 if (Dir == Dependence::DVEntry::LT)
255 Direction = '<';
256 else if (Dir == Dependence::DVEntry::GT)
257 Direction = '>';
258 else if (Dir == Dependence::DVEntry::EQ)
259 Direction = '=';
260 else
261 Direction = '*';
262 Dep.push_back(Direction);
263 }
264
265 // If the Dependence object doesn't have any information, fill the
266 // dependency vector with '*'.
267 if (D->isConfused()) {
268 assert(Dep.empty() && "Expected empty dependency vector");
269 Dep.assign(Level, '*');
270 }
271
272 while (Dep.size() != Level) {
273 Dep.push_back('I');
274 }
275
276 // If all the elements of any direction vector have only '*', legality
277 // can't be proven. Exit early to save compile time.
278 if (all_of(Dep, equal_to('*'))) {
279 ORE->emit([&]() {
280 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
281 L->getStartLoc(), L->getHeader())
282 << "All loops have dependencies in all directions.";
283 });
284 return false;
285 }
286
287 // Test whether the dependency is forward or not.
288 bool IsKnownForward = true;
289 if (Src->getParent() != Dst->getParent()) {
290 // In general, when Src and Dst are in different BBs, the execution
291 // order of them within a single iteration is not guaranteed. Treat
292 // conservatively as not-forward dependency in this case.
293 IsKnownForward = false;
294 } else {
295 // Src and Dst are in the same BB. If they are the different
296 // instructions, Src should appear before Dst in the BB as they are
297 // stored to MemInstr in that order.
298 assert((Src == Dst || inThisOrder(Src, Dst)) &&
299 "Unexpected instructions");
300
301 // If the Dependence object is reversed (due to normalization), it
302 // represents the dependency from Dst to Src, meaning it is a backward
303 // dependency. Otherwise it should be a forward dependency.
304 bool IsReversed = D->getSrc() != Src;
305 if (IsReversed)
306 IsKnownForward = false;
307 }
308
309 // Initialize the last element. Assume forward dependencies only; it
310 // will be updated later if there is any non-forward dependency.
311 Dep.push_back('<');
312
313 // The last element should express the "summary" among one or more
314 // direction vectors whose first N elements are the same (where N is
315 // the depth of the loop nest). Hence we exclude the last element from
316 // the Seen map.
317 auto [Ite, Inserted] = Seen.try_emplace(
318 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
319
320 // Make sure we only add unique entries to the dependency matrix.
321 if (Inserted)
322 DepMatrix.push_back(Dep);
323
324 // If we cannot prove that this dependency is forward, change the last
325 // element of the corresponding entry. Since a `[... *]` dependency
326 // includes a `[... <]` dependency, we do not need to keep both and
327 // change the existing entry instead.
328 if (!IsKnownForward)
329 DepMatrix[Ite->second].back() = '*';
330 }
331 }
332 }
333
334 return true;
335}
336
337// A loop is moved from index 'from' to an index 'to'. Update the Dependence
338// matrix by exchanging the two columns.
339static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
340 unsigned ToIndx) {
341 for (auto &Row : DepMatrix)
342 std::swap(Row[ToIndx], Row[FromIndx]);
343}
344
345// Check if a direction vector is lexicographically positive. Return true if it
346// is positive, nullopt if it is "zero", otherwise false.
347// [Theorem] A permutation of the loops in a perfect nest is legal if and only
348// if the direction matrix, after the same permutation is applied to its
349// columns, has no ">" direction as the leftmost non-"=" direction in any row.
350static std::optional<bool>
351isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
352 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
353 if (Direction == '<')
354 return true;
355 if (Direction == '>' || Direction == '*')
356 return false;
357 }
358 return std::nullopt;
359}
360
361// Checks if it is legal to interchange 2 loops.
362static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
363 unsigned InnerLoopId,
364 unsigned OuterLoopId) {
365 unsigned NumRows = DepMatrix.size();
366 std::vector<char> Cur;
367 // For each row check if it is valid to interchange.
368 for (unsigned Row = 0; Row < NumRows; ++Row) {
369 // Create temporary DepVector check its lexicographical order
370 // before and after swapping OuterLoop vs InnerLoop
371 Cur = DepMatrix[Row];
372
373 // If the surrounding loops already ensure that the direction vector is
374 // lexicographically positive, nothing within the loop will be able to break
375 // the dependence. In such a case we can skip the subsequent check.
376 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
377 continue;
378
379 // Check if the direction vector is lexicographically positive (or zero)
380 // for both before/after exchanged. Ignore the last element because it
381 // doesn't affect the legality.
382 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
383 return false;
384 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
385 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
386 return false;
387 }
388 return true;
389}
390
391static void populateWorklist(Loop &L, LoopVector &LoopList) {
392 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
393 << L.getHeader()->getParent()->getName() << " Loop: %"
394 << L.getHeader()->getName() << '\n');
395 assert(LoopList.empty() && "LoopList should initially be empty!");
396 Loop *CurrentLoop = &L;
397 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
398 while (!Vec->empty()) {
399 // The current loop has multiple subloops in it hence it is not tightly
400 // nested.
401 // Discard all loops above it added into Worklist.
402 if (Vec->size() != 1) {
403 LoopList = {};
404 return;
405 }
406
407 LoopList.push_back(CurrentLoop);
408 CurrentLoop = Vec->front();
409 Vec = &CurrentLoop->getSubLoops();
410 }
411 LoopList.push_back(CurrentLoop);
412}
413
416 unsigned LoopNestDepth = LoopList.size();
417 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
418 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
419 << ", the supported range is [" << MinLoopNestDepth
420 << ", " << MaxLoopNestDepth << "].\n");
421 Loop *OuterLoop = LoopList.front();
422 ORE.emit([&]() {
423 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
424 OuterLoop->getStartLoc(),
425 OuterLoop->getHeader())
426 << "Unsupported depth of loop nest, the supported range is ["
427 << std::to_string(MinLoopNestDepth) << ", "
428 << std::to_string(MaxLoopNestDepth) << "].\n";
429 });
430 return false;
431 }
432 return true;
433}
434
436 ArrayRef<Loop *> LoopList) {
437 for (Loop *L : LoopList) {
438 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
439 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
440 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
441 return false;
442 }
443 if (L->getNumBackEdges() != 1) {
444 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
445 return false;
446 }
447 if (!L->getExitingBlock()) {
448 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
449 return false;
450 }
451 }
452 return true;
453}
454
455namespace {
456
457/// LoopInterchangeLegality checks if it is legal to interchange the loop.
458class LoopInterchangeLegality {
459public:
460 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
461 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
462 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
463
464 /// Check if the loops can be interchanged.
465 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
466 CharMatrix &DepMatrix);
467
468 /// Discover induction PHIs in the header of \p L. Induction
469 /// PHIs are added to \p Inductions.
470 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
471
472 /// Check if the loop structure is understood. We do not handle triangular
473 /// loops for now.
474 bool isLoopStructureUnderstood();
475
476 bool currentLimitations();
477
478 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
479 return OuterInnerReductions;
480 }
481
482 const ArrayRef<PHINode *> getInnerLoopInductions() const {
483 return InnerLoopInductions;
484 }
485
486 ArrayRef<Instruction *> getHasNoWrapReductions() const {
487 return HasNoWrapReductions;
488 }
489
490 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
491
492 /// Record reductions in the inner loop. Currently supported reductions:
493 /// - initialized from a constant.
494 /// - reduction PHI node has only one user.
495 /// - located in the innermost loop.
496 struct InnerReduction {
497 /// The reduction itself.
498 PHINode *Reduction;
499 Value *Init;
500 Value *Next;
501 /// The Lcssa PHI.
502 PHINode *LcssaPhi;
503 /// Store reduction result into memory object.
504 StoreInst *LcssaStore;
505 /// The memory Location.
506 Value *MemRef;
507 Type *ElemTy;
508 };
509
510 ArrayRef<InnerReduction> getInnerReductions() const {
511 return InnerReductions;
512 }
513
514private:
515 bool tightlyNested(Loop *Outer, Loop *Inner);
516 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
517
518 /// Discover induction and reduction PHIs in the header of \p L. Induction
519 /// PHIs are added to \p Inductions, reductions are added to
520 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
521 /// to be passed as \p InnerLoop.
522 bool findInductionAndReductions(Loop *L,
523 SmallVector<PHINode *, 8> &Inductions,
524 Loop *InnerLoop);
525
526 /// Detect and record the reduction of the inner loop. Add them to
527 /// InnerReductions.
528 ///
529 /// innerloop:
530 /// Re = phi<0.0, Next>
531 /// Next = Re op ...
532 /// OuterLoopLatch:
533 /// Lcssa = phi<Next> ; lcssa phi
534 /// store Lcssa, MemRef ; LcssaStore
535 ///
536 bool isInnerReduction(Loop *L, PHINode *Phi,
537 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
538
539 Loop *OuterLoop;
540 Loop *InnerLoop;
541
542 ScalarEvolution *SE;
543 DominatorTree *DT;
544
545 /// Interface to emit optimization remarks.
546 OptimizationRemarkEmitter *ORE;
547
548 /// Set of reduction PHIs taking part of a reduction across the inner and
549 /// outer loop.
550 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
551
552 /// Set of inner loop induction PHIs
553 SmallVector<PHINode *, 8> InnerLoopInductions;
554
555 /// Hold instructions that have nuw/nsw flags and involved in reductions,
556 /// like integer addition/multiplication. Those flags must be dropped when
557 /// interchanging the loops.
558 SmallVector<Instruction *, 4> HasNoWrapReductions;
559
560 /// Hold instructions that have ninf flags and involved in reductions. Those
561 /// flags must be dropped when interchanging the loops.
562 SmallVector<Instruction *, 4> HasNoInfInsts;
563
564 /// Vector of reductions in the inner loop.
565 SmallVector<InnerReduction, 8> InnerReductions;
566};
567
568/// Manages information utilized by the profitability check for cache. The main
569/// purpose of this class is to delay the computation of CacheCost until it is
570/// actually needed.
571class CacheCostManager {
572 Loop *OutermostLoop;
573 LoopStandardAnalysisResults *AR;
574 DependenceInfo *DI;
575
576 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
577 /// that the result can be nullptr.
578 std::optional<std::unique_ptr<CacheCost>> CC;
579
580 /// Maps each loop to an index representing the optimal position within the
581 /// loop-nest, as determined by the cache cost analysis.
582 DenseMap<const Loop *, unsigned> CostMap;
583
584 void computeIfUnitinialized();
585
586public:
587 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
588 DependenceInfo *DI)
589 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
590 CacheCost *getCacheCost();
591 const DenseMap<const Loop *, unsigned> &getCostMap();
592};
593
594/// LoopInterchangeProfitability checks if it is profitable to interchange the
595/// loop.
596class LoopInterchangeProfitability {
597public:
598 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
599 OptimizationRemarkEmitter *ORE)
600 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
601
602 /// Check if the loop interchange is profitable.
603 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
604 unsigned InnerLoopId, unsigned OuterLoopId,
605 CharMatrix &DepMatrix, CacheCostManager &CCM);
606
607private:
608 int getInstrOrderCost();
609 std::optional<bool> isProfitablePerLoopCacheAnalysis(
610 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
611 std::optional<bool> isProfitablePerInstrOrderCost();
612 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
613 unsigned OuterLoopId,
614 CharMatrix &DepMatrix);
615 Loop *OuterLoop;
616 Loop *InnerLoop;
617
618 /// Scev analysis.
619 ScalarEvolution *SE;
620
621 /// Interface to emit optimization remarks.
622 OptimizationRemarkEmitter *ORE;
623};
624
625/// LoopInterchangeTransform interchanges the loop.
626class LoopInterchangeTransform {
627public:
628 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
629 LoopInfo *LI, DominatorTree *DT,
630 const LoopInterchangeLegality &LIL)
631 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
632
633 /// Interchange OuterLoop and InnerLoop.
634 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
635 ArrayRef<Instruction *> DropNoInfInsts);
636 void reduction2Memory();
637 void restructureLoops(Loop *NewInner, Loop *NewOuter,
638 BasicBlock *OrigInnerPreHeader,
639 BasicBlock *OrigOuterPreHeader);
640 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
641
642private:
643 bool adjustLoopLinks();
644 bool adjustLoopBranches();
645
646 Loop *OuterLoop;
647 Loop *InnerLoop;
648
649 /// Scev analysis.
650 ScalarEvolution *SE;
651
652 LoopInfo *LI;
653 DominatorTree *DT;
654
655 const LoopInterchangeLegality &LIL;
656};
657
658struct LoopInterchange {
659 ScalarEvolution *SE = nullptr;
660 LoopInfo *LI = nullptr;
661 DependenceInfo *DI = nullptr;
662 DominatorTree *DT = nullptr;
663 LoopStandardAnalysisResults *AR = nullptr;
664
665 /// Interface to emit optimization remarks.
666 OptimizationRemarkEmitter *ORE;
667
668 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
669 DominatorTree *DT, LoopStandardAnalysisResults *AR,
670 OptimizationRemarkEmitter *ORE)
671 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
672
673 bool run(Loop *L) {
674 if (L->getParentLoop())
675 return false;
676 SmallVector<Loop *, 8> LoopList;
677 populateWorklist(*L, LoopList);
678 return processLoopList(LoopList);
679 }
680
681 bool run(LoopNest &LN) {
682 SmallVector<Loop *, 8> LoopList(LN.getLoops());
683 for (unsigned I = 1; I < LoopList.size(); ++I)
684 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
685 return false;
686 return processLoopList(LoopList);
687 }
688
689 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
690 // TODO: Add a better heuristic to select the loop to be interchanged based
691 // on the dependence matrix. Currently we select the innermost loop.
692 return LoopList.size() - 1;
693 }
694
695 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
696 bool Changed = false;
697
698 // Ensure proper loop nest depth.
699 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
700 "Unsupported depth of loop nest.");
701
702 unsigned LoopNestDepth = LoopList.size();
703
704 LLVM_DEBUG({
705 dbgs() << "Processing LoopList of size = " << LoopNestDepth
706 << " containing the following loops:\n";
707 for (auto *L : LoopList) {
708 dbgs() << " - ";
709 L->print(dbgs());
710 }
711 });
712
713 CharMatrix DependencyMatrix;
714 Loop *OuterMostLoop = *(LoopList.begin());
715 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
716 OuterMostLoop, DI, SE, ORE)) {
717 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
718 return false;
719 }
720
721 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
722 printDepMatrix(DependencyMatrix));
723
724 // Get the Outermost loop exit.
725 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
726 if (!LoopNestExit) {
727 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
728 << "' needs an unique exit block");
729 return false;
730 }
731
732 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
733 CacheCostManager CCM(LoopList[0], AR, DI);
734 // We try to achieve the globally optimal memory access for the loopnest,
735 // and do interchange based on a bubble-sort fasion. We start from
736 // the innermost loop, move it outwards to the best possible position
737 // and repeat this process.
738 for (unsigned j = SelecLoopId; j > 0; j--) {
739 bool ChangedPerIter = false;
740 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
741 bool Interchanged =
742 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
743 ChangedPerIter |= Interchanged;
744 Changed |= Interchanged;
745 }
746 // Early abort if there was no interchange during an entire round of
747 // moving loops outwards.
748 if (!ChangedPerIter)
749 break;
750 }
751 return Changed;
752 }
753
754 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
755 unsigned OuterLoopId,
756 std::vector<std::vector<char>> &DependencyMatrix,
757 CacheCostManager &CCM) {
758 Loop *OuterLoop = LoopList[OuterLoopId];
759 Loop *InnerLoop = LoopList[InnerLoopId];
760 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
761 << " and OuterLoopId = " << OuterLoopId << "\n");
762 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
763 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
764 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
765 << OuterLoop->getName() << "' and '"
766 << InnerLoop->getName() << "'\n");
767 return false;
768 }
769 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
770 << InnerLoop->getName()
771 << "' are legal to interchange\n");
772 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
773 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
774 DependencyMatrix, CCM)) {
775 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
776 << "' and '" << InnerLoop->getName()
777 << "' not profitable.\n");
778 return false;
779 }
780
781 ORE->emit([&]() {
782 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
783 InnerLoop->getStartLoc(),
784 InnerLoop->getHeader())
785 << "Loop interchanged with enclosing loop.";
786 });
787
788 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
789 LIT.transform(LIL.getHasNoWrapReductions(), LIL.getHasNoInfInsts());
790 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
791 << OuterLoop->getName() << "' and inner loop '"
792 << InnerLoop->getName() << "'\n");
793 LoopsInterchanged++;
794
795 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
796
797 // Loops interchanged, update LoopList accordingly.
798 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
799 // Update the DependencyMatrix
800 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
801
802 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
803 printDepMatrix(DependencyMatrix));
804
805 return true;
806 }
807};
808
809} // end anonymous namespace
810
811bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
812 Instruction *Skip) {
813 return any_of(*BB, [Skip](const Instruction &I) {
814 if (&I == Skip)
815 return false;
816 return I.mayHaveSideEffects() || I.mayReadFromMemory();
817 });
818}
819
820bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
821 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
822 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
823 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
824
825 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
826 << "' and '" << InnerLoop->getName()
827 << "' are tightly nested\n");
828
829 // A perfectly nested loop will not have any branch in between the outer and
830 // inner block i.e. outer header will branch to either inner preheader and
831 // outerloop latch.
832 for (BasicBlock *Succ : successors(OuterLoopHeader))
833 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
834 Succ != OuterLoopLatch)
835 return false;
836
837 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
838
839 // The inner loop reduction pattern requires storing the LCSSA PHI in
840 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
841 // that store during checks.
842 Instruction *Skip = nullptr;
843 assert(InnerReductions.size() <= 1 &&
844 "So far we only support at most one reduction.");
845 if (InnerReductions.size() == 1)
846 Skip = InnerReductions[0].LcssaStore;
847
848 // We do not have any basic block in between now make sure the outer header
849 // and outer loop latch doesn't contain any unsafe instructions.
850 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
851 containsUnsafeInstructions(OuterLoopLatch, Skip))
852 return false;
853
854 // Also make sure the inner loop preheader does not contain any unsafe
855 // instructions. Note that all instructions in the preheader will be moved to
856 // the outer loop header when interchanging.
857 if (InnerLoopPreHeader != OuterLoopHeader &&
858 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
859 return false;
860
861 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
862 // Ensure the inner loop exit block flows to the outer loop latch possibly
863 // through empty blocks.
864 const BasicBlock &SuccInner =
865 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
866 if (&SuccInner != OuterLoopLatch) {
867 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
868 << " does not lead to the outer loop latch.\n";);
869 return false;
870 }
871 // The inner loop exit block does flow to the outer loop latch and not some
872 // other BBs, now make sure it contains safe instructions, since it will be
873 // moved into the (new) inner loop after interchange.
874 if (containsUnsafeInstructions(InnerLoopExit, Skip))
875 return false;
876
877 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
878 // We have a perfect loop nest.
879 return true;
880}
881
882bool LoopInterchangeLegality::isLoopStructureUnderstood() {
883 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
884 for (PHINode *InnerInduction : InnerLoopInductions) {
885 unsigned Num = InnerInduction->getNumOperands();
886 for (unsigned i = 0; i < Num; ++i) {
887 Value *Val = InnerInduction->getOperand(i);
888 if (isa<Constant>(Val))
889 continue;
891 if (!I)
892 return false;
893 // TODO: Handle triangular loops.
894 // e.g. for(int i=0;i<N;i++)
895 // for(int j=i;j<N;j++)
896 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
897 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
898 InnerLoopPreheader &&
899 !OuterLoop->isLoopInvariant(I)) {
900 return false;
901 }
902 }
903 }
904
905 // TODO: Handle triangular loops of another form.
906 // e.g. for(int i=0;i<N;i++)
907 // for(int j=0;j<i;j++)
908 // or,
909 // for(int i=0;i<N;i++)
910 // for(int j=0;j*i<N;j++)
911 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
912 CondBrInst *InnerLoopLatchBI =
913 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
914 if (!InnerLoopLatchBI)
915 return false;
916 if (CmpInst *InnerLoopCmp =
917 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
918 Value *Op0 = InnerLoopCmp->getOperand(0);
919 Value *Op1 = InnerLoopCmp->getOperand(1);
920
921 // LHS and RHS of the inner loop exit condition, e.g.,
922 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
923 Value *Left = nullptr;
924 Value *Right = nullptr;
925
926 // Check if V only involves inner loop induction variable.
927 // Return true if V is InnerInduction, or a cast from
928 // InnerInduction, or a binary operator that involves
929 // InnerInduction and a constant.
930 std::function<bool(Value *)> IsPathToInnerIndVar;
931 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
932 if (llvm::is_contained(InnerLoopInductions, V))
933 return true;
934 if (isa<Constant>(V))
935 return true;
937 if (!I)
938 return false;
939 if (isa<CastInst>(I))
940 return IsPathToInnerIndVar(I->getOperand(0));
942 return IsPathToInnerIndVar(I->getOperand(0)) &&
943 IsPathToInnerIndVar(I->getOperand(1));
944 return false;
945 };
946
947 // In case of multiple inner loop indvars, it is okay if LHS and RHS
948 // are both inner indvar related variables.
949 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
950 return true;
951
952 // Otherwise we check if the cmp instruction compares an inner indvar
953 // related variable (Left) with a outer loop invariant (Right).
954 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
955 Left = Op0;
956 Right = Op1;
957 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
958 Left = Op1;
959 Right = Op0;
960 }
961
962 if (Left == nullptr)
963 return false;
964
965 const SCEV *S = SE->getSCEV(Right);
966 if (!SE->isLoopInvariant(S, OuterLoop))
967 return false;
968 }
969
970 return true;
971}
972
973// If SV is a LCSSA PHI node with a single incoming value, return the incoming
974// value.
975static Value *followLCSSA(Value *SV) {
977 if (!PHI)
978 return SV;
979
980 if (PHI->getNumIncomingValues() != 1)
981 return SV;
982 return followLCSSA(PHI->getIncomingValue(0));
983}
984
986 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
987 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
990 // Detect floating point reduction only when it can be reordered.
991 if (RD.getExactFPMathInst() != nullptr)
992 return false;
993
995 switch (RK) {
996 case RecurKind::Or:
997 case RecurKind::And:
998 case RecurKind::Xor:
999 case RecurKind::SMin:
1000 case RecurKind::SMax:
1001 case RecurKind::UMin:
1002 case RecurKind::UMax:
1003 case RecurKind::AnyOf:
1004 return true;
1005
1006 // Changing the order of floating-point operations may alter the results. If
1007 // a certain instruction has the ninf flag, it means that reordering can
1008 // produce a poison value, which may lead to undefined behavior. To prevent
1009 // this, we must drop the ninf flags if we decide to apply the
1010 // transformation.
1011 case RecurKind::FAdd:
1012 case RecurKind::FMul:
1013 case RecurKind::FMin:
1014 case RecurKind::FMax:
1019 case RecurKind::FMulAdd:
1020 for (Instruction *I : RD.getReductionOpChain(PHI, L))
1021 if (isa<FPMathOperator>(I) && I->hasNoInfs())
1022 HasNoInfInsts.push_back(I);
1023 return true;
1024
1025 // Change the order of integer addition/multiplication may change the
1026 // semantics. Consider the following case:
1027 //
1028 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1029 // int sum = 0;
1030 // for (int i = 0; i < 2; i++)
1031 // for (int j = 0; j < 2; j++)
1032 // sum += A[j][i];
1033 //
1034 // If the above loops are exchanged, the addition will cause an
1035 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1036 // addition/multiplication instructions when we actually exchanges the
1037 // loops.
1038 case RecurKind::Add:
1039 case RecurKind::Mul: {
1040 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1042
1043 // Bail out when we fail to collect reduction instructions chain.
1044 if (Ops.empty())
1045 return false;
1046
1047 for (Instruction *I : Ops) {
1048 assert(I->getOpcode() == OpCode &&
1049 "Expected the instruction to be the reduction operation");
1050 (void)OpCode;
1051
1052 // If the instruction has nuw/nsw flags, we must drop them when the
1053 // transformation is actually performed.
1054 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1055 HasNoWrapInsts.push_back(I);
1056 }
1057 return true;
1058 }
1059
1060 default:
1061 return false;
1062 }
1063 } else
1064 return false;
1065}
1066
1067// Check V's users to see if it is involved in a reduction in L.
1068static PHINode *
1070 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1071 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1072 // Reduction variables cannot be constants.
1073 if (isa<Constant>(V))
1074 return nullptr;
1075
1076 for (Value *User : V->users()) {
1078 if (PHI->getNumIncomingValues() == 1)
1079 continue;
1080
1081 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1082 return PHI;
1083 else
1084 return nullptr;
1085 }
1086 }
1087
1088 return nullptr;
1089}
1090
1091bool LoopInterchangeLegality::isInnerReduction(
1092 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1093
1094 // Only support reduction2Mem when the loop nest to be interchanged is
1095 // the innermost two loops.
1096 if (!L->isInnermost()) {
1097 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1098 ORE->emit([&]() {
1099 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1100 L->getStartLoc(), L->getHeader())
1101 << "Only supported when the loop is the innermost.";
1102 });
1103 return false;
1104 }
1105
1106 if (Phi->getNumIncomingValues() != 2)
1107 return false;
1108
1109 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1110 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1111
1112 // So far only supports constant initial value.
1113 if (!isa<Constant>(Init)) {
1114 LLVM_DEBUG(
1115 dbgs()
1116 << "Only supported for the reduction with a constant initial value.\n");
1117 ORE->emit([&]() {
1118 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1119 L->getStartLoc(), L->getHeader())
1120 << "Only supported for the reduction with a constant initial "
1121 "value.";
1122 });
1123 return false;
1124 }
1125
1126 // The reduction result must live in the inner loop.
1127 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1128 BasicBlock *BB = I->getParent();
1129 if (!L->contains(BB))
1130 return false;
1131 }
1132
1133 // The reduction should have only one user.
1134 if (!Phi->hasOneUser())
1135 return false;
1136
1137 // Check the reduction kind.
1138 if (!checkReductionKind(L, Phi, HasNoWrapInsts, HasNoInfInsts))
1139 return false;
1140
1141 // Find lcssa_phi in OuterLoop's Latch
1142 BasicBlock *ExitBlock = L->getExitBlock();
1143 if (!ExitBlock)
1144 return false;
1145
1146 PHINode *Lcssa = NULL;
1147 for (auto *U : Next->users()) {
1148 if (auto *P = dyn_cast<PHINode>(U)) {
1149 if (P == Phi)
1150 continue;
1151
1152 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1153 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1154 Lcssa = P;
1155 else
1156 return false;
1157 } else
1158 return false;
1159 }
1160 if (!Lcssa)
1161 return false;
1162
1163 if (!Lcssa->hasOneUser()) {
1164 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1165 "the outer loop.\n");
1166 ORE->emit([&]() {
1167 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1168 L->getStartLoc(), L->getHeader())
1169 << "Only supported when the reduction is used once in the outer "
1170 "loop.";
1171 });
1172 return false;
1173 }
1174
1175 StoreInst *LcssaStore =
1177 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1178 return false;
1179
1180 Value *MemRef = LcssaStore->getOperand(1);
1181 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1182
1183 // LcssaStore stores the reduction result in BB.
1184 // When the reduction is initialized from a constant value, we need to load
1185 // from the memory object into the target basic block of the inner loop. This
1186 // means the memory reference was used prematurely. So we must ensure that the
1187 // memory reference does not dominate the target basic block.
1188 // TODO: Move the memory reference definition into the loop header.
1189 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1190 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1191 "the inner loop.\n");
1192 ORE->emit([&]() {
1193 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1194 L->getStartLoc(), L->getHeader())
1195 << "Only supported when memory reference dominate the inner "
1196 "loop.";
1197 });
1198 return false;
1199 }
1200
1201 // Found a reduction in the inner loop.
1202 InnerReduction SR;
1203 SR.Reduction = Phi;
1204 SR.Init = Init;
1205 SR.Next = Next;
1206 SR.LcssaPhi = Lcssa;
1207 SR.LcssaStore = LcssaStore;
1208 SR.MemRef = MemRef;
1209 SR.ElemTy = ElemTy;
1210
1211 InnerReductions.push_back(SR);
1212 return true;
1213}
1214
1215bool LoopInterchangeLegality::findInductionAndReductions(
1216 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1217 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1218 return false;
1219 for (PHINode &PHI : L->getHeader()->phis()) {
1220 InductionDescriptor ID;
1222 Inductions.push_back(&PHI);
1223 else {
1224 // PHIs in inner loops need to be part of a reduction in the outer loop,
1225 // discovered when checking the PHIs of the outer loop earlier.
1226 if (!InnerLoop) {
1227 if (OuterInnerReductions.count(&PHI)) {
1228 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1229 } else if (EnableReduction2Memory &&
1230 isInnerReduction(L, &PHI, HasNoWrapReductions)) {
1231 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1232 << PHI << '\n');
1233 } else
1234 return false;
1235 } else {
1236 assert(PHI.getNumIncomingValues() == 2 &&
1237 "Phis in loop header should have exactly 2 incoming values");
1238 // Check if we have a PHI node in the outer loop that has a reduction
1239 // result from the inner loop as an incoming value.
1240 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1241 PHINode *InnerRedPhi = findInnerReductionPhi(
1242 InnerLoop, V, HasNoWrapReductions, HasNoInfInsts);
1243 if (!InnerRedPhi ||
1244 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1245 LLVM_DEBUG(
1246 dbgs()
1247 << "Failed to recognize PHI as an induction or reduction.\n");
1248 return false;
1249 }
1250 OuterInnerReductions.insert(&PHI);
1251 OuterInnerReductions.insert(InnerRedPhi);
1252 }
1253 }
1254 }
1255
1256 // For now we only support at most one reduction.
1257 if (InnerReductions.size() > 1) {
1258 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1259 ORE->emit([&]() {
1260 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1261 L->getStartLoc(), L->getHeader())
1262 << "Only supports at most one reduction.";
1263 });
1264 return false;
1265 }
1266
1267 return true;
1268}
1269
1270// This function indicates the current limitations in the transform as a result
1271// of which we do not proceed.
1272bool LoopInterchangeLegality::currentLimitations() {
1273 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1274
1275 // transform currently expects the loop latches to also be the exiting
1276 // blocks.
1277 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1278 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1279 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1280 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1281 LLVM_DEBUG(
1282 dbgs() << "Loops where the latch is not the exiting block are not"
1283 << " supported currently.\n");
1284 ORE->emit([&]() {
1285 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1286 OuterLoop->getStartLoc(),
1287 OuterLoop->getHeader())
1288 << "Loops where the latch is not the exiting block cannot be"
1289 " interchange currently.";
1290 });
1291 return true;
1292 }
1293
1294 SmallVector<PHINode *, 8> Inductions;
1295 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1296 LLVM_DEBUG(
1297 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1298 << "are supported currently.\n");
1299 ORE->emit([&]() {
1300 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1301 OuterLoop->getStartLoc(),
1302 OuterLoop->getHeader())
1303 << "Only outer loops with induction or reduction PHI nodes can be"
1304 " interchanged currently.";
1305 });
1306 return true;
1307 }
1308
1309 Inductions.clear();
1310 // For multi-level loop nests, make sure that all phi nodes for inner loops
1311 // at all levels can be recognized as a induction or reduction phi. Bail out
1312 // if a phi node at a certain nesting level cannot be properly recognized.
1313 Loop *CurLevelLoop = OuterLoop;
1314 while (!CurLevelLoop->getSubLoops().empty()) {
1315 // We already made sure that the loop nest is tightly nested.
1316 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1317 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1318 LLVM_DEBUG(
1319 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1320 << "are supported currently.\n");
1321 ORE->emit([&]() {
1322 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1323 CurLevelLoop->getStartLoc(),
1324 CurLevelLoop->getHeader())
1325 << "Only inner loops with induction or reduction PHI nodes can be"
1326 " interchange currently.";
1327 });
1328 return true;
1329 }
1330 }
1331
1332 // TODO: Triangular loops are not handled for now.
1333 if (!isLoopStructureUnderstood()) {
1334 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1335 ORE->emit([&]() {
1336 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1337 InnerLoop->getStartLoc(),
1338 InnerLoop->getHeader())
1339 << "Inner loop structure not understood currently.";
1340 });
1341 return true;
1342 }
1343
1344 return false;
1345}
1346
1347bool LoopInterchangeLegality::findInductions(
1348 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1349 for (PHINode &PHI : L->getHeader()->phis()) {
1350 InductionDescriptor ID;
1352 Inductions.push_back(&PHI);
1353 }
1354 return !Inductions.empty();
1355}
1356
1357/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1358/// users are either of the following:
1359///
1360/// - Reduction PHIs
1361/// - PHIs outside the outer loop
1362/// - PHIs belonging to the latch of the outer loop
1363///
1364/// These conditions mean that we are only interested in the final value after
1365/// the inner loop.
1366static bool
1369 PHINode *LcssaReduction) {
1370 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1371 for (PHINode &PHI : InnerExit->phis()) {
1372 // The reduction LCSSA PHI will have only one incoming block, which comes
1373 // from the loop latch.
1374 if (PHI.getNumIncomingValues() > 1)
1375 return false;
1376 if (&PHI == LcssaReduction)
1377 return true;
1378 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1379 PHINode *PN = dyn_cast<PHINode>(U);
1380 if (!PN)
1381 return true;
1382 if (Reductions.count(PN))
1383 return false;
1384 BasicBlock *PB = PN->getParent();
1385 if (!OuterL->contains(PB))
1386 return false;
1387 return PB != OuterL->getLoopLatch();
1388 }))
1389 return false;
1390 }
1391 return true;
1392}
1393
1394// We currently support LCSSA PHI nodes in the outer loop exit, if their
1395// incoming values do not come from the outer loop latch or if the
1396// outer loop latch has a single predecessor. In that case, the value will
1397// be available if both the inner and outer loop conditions are true, which
1398// will still be true after interchanging. If we have multiple predecessor,
1399// that may not be the case, e.g. because the outer loop latch may be executed
1400// if the inner loop is not executed.
1401static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1402 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1403 for (PHINode &PHI : LoopNestExit->phis()) {
1404 for (Value *Incoming : PHI.incoming_values()) {
1405 Instruction *IncomingI = dyn_cast<Instruction>(Incoming);
1406 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1407 continue;
1408
1409 // The incoming value is defined in the outer loop latch. Currently we
1410 // only support that in case the outer loop latch has a single predecessor.
1411 // This guarantees that the outer loop latch is executed if and only if
1412 // the inner loop is executed (because tightlyNested() guarantees that the
1413 // outer loop header only branches to the inner loop or the outer loop
1414 // latch).
1415 // FIXME: We could weaken this logic and allow multiple predecessors,
1416 // if the values are produced outside the loop latch. We would need
1417 // additional logic to update the PHI nodes in the exit block as
1418 // well.
1419 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1420 return false;
1421 }
1422 }
1423 return true;
1424}
1425
1426// In case of multi-level nested loops, it may occur that lcssa phis exist in
1427// the latch of InnerLoop, i.e., when defs of the incoming values are further
1428// inside the loopnest. Sometimes those incoming values are not available
1429// after interchange, since the original inner latch will become the new outer
1430// latch which may have predecessor paths that do not include those incoming
1431// values.
1432// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1433// multi-level loop nests.
1434static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1435 if (InnerLoop->getSubLoops().empty())
1436 return true;
1437 // If the original outer latch has only one predecessor, then values defined
1438 // further inside the looploop, e.g., in the innermost loop, will be available
1439 // at the new outer latch after interchange.
1440 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1441 return true;
1442
1443 // The outer latch has more than one predecessors, i.e., the inner
1444 // exit and the inner header.
1445 // PHI nodes in the inner latch are lcssa phis where the incoming values
1446 // are defined further inside the loopnest. Check if those phis are used
1447 // in the original inner latch. If that is the case then bail out since
1448 // those incoming values may not be available at the new outer latch.
1449 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1450 for (PHINode &PHI : InnerLoopLatch->phis()) {
1451 for (auto *U : PHI.users()) {
1453 if (InnerLoopLatch == UI->getParent())
1454 return false;
1455 }
1456 }
1457 return true;
1458}
1459
1460bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1461 unsigned OuterLoopId,
1462 CharMatrix &DepMatrix) {
1463 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1464 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1465 << " and OuterLoopId = " << OuterLoopId
1466 << " due to dependence\n");
1467 ORE->emit([&]() {
1468 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1469 InnerLoop->getStartLoc(),
1470 InnerLoop->getHeader())
1471 << "Cannot interchange loops due to dependences.";
1472 });
1473 return false;
1474 }
1475 // Check if outer and inner loop contain legal instructions only.
1476 for (auto *BB : OuterLoop->blocks())
1477 for (Instruction &I : *BB)
1478 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1479 // readnone functions do not prevent interchanging.
1480 if (CI->onlyWritesMemory() || isa<PseudoProbeInst>(CI))
1481 continue;
1482 LLVM_DEBUG(
1483 dbgs() << "Loops with call instructions cannot be interchanged "
1484 << "safely.");
1485 ORE->emit([&]() {
1486 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1487 CI->getDebugLoc(),
1488 CI->getParent())
1489 << "Cannot interchange loops due to call instruction.";
1490 });
1491
1492 return false;
1493 }
1494
1495 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1496 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1497 return false;
1498 }
1499
1500 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1501 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1502 ORE->emit([&]() {
1503 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1504 InnerLoop->getStartLoc(),
1505 InnerLoop->getHeader())
1506 << "Cannot interchange loops because unsupported PHI nodes found "
1507 "in inner loop latch.";
1508 });
1509 return false;
1510 }
1511
1512 // TODO: The loops could not be interchanged due to current limitations in the
1513 // transform module.
1514 if (currentLimitations()) {
1515 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1516 return false;
1517 }
1518
1519 // Check if the loops are tightly nested.
1520 if (!tightlyNested(OuterLoop, InnerLoop)) {
1521 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1522 ORE->emit([&]() {
1523 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1524 InnerLoop->getStartLoc(),
1525 InnerLoop->getHeader())
1526 << "Cannot interchange loops because they are not tightly "
1527 "nested.";
1528 });
1529 return false;
1530 }
1531
1532 // The LCSSA PHI for the reduction has passed checks before; its user
1533 // is a store instruction.
1534 PHINode *LcssaReduction = nullptr;
1535 assert(InnerReductions.size() <= 1 &&
1536 "So far we only support at most one reduction.");
1537 if (InnerReductions.size() == 1)
1538 LcssaReduction = InnerReductions[0].LcssaPhi;
1539
1540 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1541 LcssaReduction)) {
1542 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1543 ORE->emit([&]() {
1544 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1545 InnerLoop->getStartLoc(),
1546 InnerLoop->getHeader())
1547 << "Found unsupported PHI node in loop exit.";
1548 });
1549 return false;
1550 }
1551
1552 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1553 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1554 ORE->emit([&]() {
1555 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1556 OuterLoop->getStartLoc(),
1557 OuterLoop->getHeader())
1558 << "Found unsupported PHI node in loop exit.";
1559 });
1560 return false;
1561 }
1562
1563 if (any_of(OuterLoop->getLoopLatch()->phis(),
1564 [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1565 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1566 "value are supported.\n");
1567 ORE->emit([&]() {
1568 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1569 OuterLoop->getStartLoc(),
1570 OuterLoop->getHeader())
1571 << "Only outer loop latch PHI nodes with one incoming value are "
1572 "supported.";
1573 });
1574 return false;
1575 }
1576
1577 return true;
1578}
1579
1580void CacheCostManager::computeIfUnitinialized() {
1581 if (CC.has_value())
1582 return;
1583
1584 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1585 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1586 // Obtain the loop vector returned from loop cache analysis beforehand,
1587 // and put each <Loop, index> pair into a map for constant time query
1588 // later. Indices in loop vector reprsent the optimal order of the
1589 // corresponding loop, e.g., given a loopnest with depth N, index 0
1590 // indicates the loop should be placed as the outermost loop and index N
1591 // indicates the loop should be placed as the innermost loop.
1592 //
1593 // For the old pass manager CacheCost would be null.
1594 if (*CC != nullptr)
1595 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1596 CostMap[Cost.first] = Idx;
1597}
1598
1599CacheCost *CacheCostManager::getCacheCost() {
1600 computeIfUnitinialized();
1601 return CC->get();
1602}
1603
1604const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1605 computeIfUnitinialized();
1606 return CostMap;
1607}
1608
1609/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1610/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1611/// return std::nullopt, which indicates we cannot determine the coefficient of
1612/// the addrec for \p L in \S.
1613/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1614/// do that.
1615std::optional<const SCEV *> getAddRecCoefficient(ScalarEvolution &SE,
1616 const SCEV *S, const Loop *L) {
1618 if (!AR) {
1619 if (SE.isLoopInvariant(S, L))
1620 return nullptr;
1621 return std::nullopt;
1622 }
1623
1624 if (!AR->isAffine()) {
1625 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1626 return std::nullopt;
1627 }
1628
1629 std::optional<const SCEV *> Coeff =
1630 getAddRecCoefficient(SE, AR->getStart(), L);
1631 if (!Coeff.has_value())
1632 return std::nullopt;
1633
1634 if (AR->getLoop() == L) {
1635 assert(!*Coeff && "Found more than one addrec for the same loop");
1636 Coeff = AR->getStepRecurrence(SE);
1637 }
1638 return Coeff;
1639}
1640
1641int LoopInterchangeProfitability::getInstrOrderCost() {
1642 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1643 for (BasicBlock *BB : InnerLoop->blocks()) {
1644 for (Instruction &Ins : *BB) {
1645 if (!isa<LoadInst, StoreInst>(&Ins))
1646 continue;
1647 const SCEV *Access = SE->getSCEV(getLoadStorePointerOperand(&Ins));
1648 const SCEV *BasePtr = SE->getPointerBase(Access);
1649 std::optional<const SCEV *> OuterCoeff =
1650 getAddRecCoefficient(*SE, Access, OuterLoop);
1651 std::optional<const SCEV *> InnerCoeff =
1652 getAddRecCoefficient(*SE, Access, InnerLoop);
1653
1654 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1655 !*InnerCoeff)
1656 continue;
1657
1658 // This heuristic assumes that a smaller step recurrence implies that the
1659 // induction variable corresponding to the loop is used in the inner
1660 // dimension of the array. Placing such a loop in the inner position would
1661 // be beneficial in terms of locality. If the array access is of the form
1662 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1663 // interchange, but we expect such cases to be rare.
1664 const SCEV *OuterStep = SE->getAbsExpr(*OuterCoeff, /*IsNSW=*/false);
1665 const SCEV *InnerStep = SE->getAbsExpr(*InnerCoeff, /*IsNSW=*/false);
1666 // If we find the inner induction after an outer induction e.g.
1667 //
1668 // for(int i=0;i<N;i++)
1669 // for(int j=0;j<N;j++)
1670 // A[i][j] = A[i-1][j-1]+k;
1671 //
1672 //
1673 // then it is a good order. If we find the outer induction after an inner
1674 // induction e.g.
1675 //
1676 // for(int i=0;i<N;i++)
1677 // for(int j=0;j<N;j++)
1678 // A[j][i] = A[j-1][i-1]+k;
1679 //
1680 // then it is a bad order.
1681 //
1682 // To avoid counting the same base pointers multiple times, we deduplicate
1683 // them by using a set of base pointers.
1684 if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, InnerStep, OuterStep))
1685 GoodBasePtrs.insert(BasePtr);
1686 else if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, OuterStep, InnerStep))
1687 BadBasePtrs.insert(BasePtr);
1688 }
1689 }
1690
1691 int GoodOrder = GoodBasePtrs.size();
1692 int BadOrder = BadBasePtrs.size();
1693 return GoodOrder - BadOrder;
1694}
1695
1696std::optional<bool>
1697LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1698 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1699 // This is the new cost model returned from loop cache analysis.
1700 // A smaller index means the loop should be placed an outer loop, and vice
1701 // versa.
1702 auto InnerLoopIt = CostMap.find(InnerLoop);
1703 if (InnerLoopIt == CostMap.end())
1704 return std::nullopt;
1705 auto OuterLoopIt = CostMap.find(OuterLoop);
1706 if (OuterLoopIt == CostMap.end())
1707 return std::nullopt;
1708
1709 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1710 return std::nullopt;
1711 unsigned InnerIndex = InnerLoopIt->second;
1712 unsigned OuterIndex = OuterLoopIt->second;
1713 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1714 << ", OuterIndex = " << OuterIndex << "\n");
1715 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1716 "numbers to each loop");
1717 return std::optional<bool>(InnerIndex < OuterIndex);
1718}
1719
1720std::optional<bool>
1721LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1722 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1723 // good and bad order of induction variables in the instruction and allows
1724 // reordering if number of bad orders is more than good.
1725 int Cost = getInstrOrderCost();
1726 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1728 return std::optional<bool>(true);
1729
1730 return std::nullopt;
1731}
1732
1733/// Return true if we can vectorize the loop specified by \p LoopId.
1734static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1735 for (const auto &Dep : DepMatrix) {
1736 char Dir = Dep[LoopId];
1737 char DepType = Dep.back();
1738 assert((DepType == '<' || DepType == '*') &&
1739 "Unexpected element in dependency vector");
1740
1741 // There are no loop-carried dependencies.
1742 if (Dir == '=' || Dir == 'I')
1743 continue;
1744
1745 // DepType being '<' means that this direction vector represents a forward
1746 // dependency. In principle, a loop with '<' direction can be vectorized in
1747 // this case.
1748 if (Dir == '<' && DepType == '<')
1749 continue;
1750
1751 // We cannot prove that the loop is vectorizable.
1752 return false;
1753 }
1754 return true;
1755}
1756
1757std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1758 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1759 // If the outer loop cannot be vectorized, it is not profitable to move this
1760 // to inner position.
1761 if (!canVectorize(DepMatrix, OuterLoopId))
1762 return false;
1763
1764 // If the inner loop cannot be vectorized but the outer loop can be, then it
1765 // is profitable to interchange to enable inner loop parallelism.
1766 if (!canVectorize(DepMatrix, InnerLoopId))
1767 return true;
1768
1769 // If both the inner and the outer loop can be vectorized, it is necessary to
1770 // check the cost of each vectorized loop for profitability decision. At this
1771 // time we do not have a cost model to estimate them, so return nullopt.
1772 // TODO: Estimate the cost of vectorized loop when both the outer and the
1773 // inner loop can be vectorized.
1774 return std::nullopt;
1775}
1776
1777bool LoopInterchangeProfitability::isProfitable(
1778 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1779 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1780 // Do not consider loops with a backedge that isn't taken, e.g. an
1781 // unconditional branch true/false, as candidates for interchange.
1782 // TODO: when interchange is forced, we should probably also allow
1783 // interchange for these loops, and thus this logic should be moved just
1784 // below the cost-model ignore check below. But this check is done first
1785 // to avoid the issue in #163954.
1786 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1787 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1788 if (InnerBTC && InnerBTC->isZero()) {
1789 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1790 "single iteration loop\n");
1791 return false;
1792 }
1793 if (OuterBTC && OuterBTC->isZero()) {
1794 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1795 "single iteration loop\n");
1796 return false;
1797 }
1798
1799 // Return true if interchange is forced and the cost-model ignored.
1800 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1801 return true;
1803 "Duplicate rules and option 'ignore' are not allowed");
1804
1805 // isProfitable() is structured to avoid endless loop interchange. If the
1806 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1807 // decide the profitability then, profitability check will stop and return the
1808 // analysis result. If it failed to determine it (e.g., cache analysis failed
1809 // to analyze the loopnest due to delinearization issues) then go ahead the
1810 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1811 // Likewise, if it failed to analysis the profitability then only, the last
1812 // rule (isProfitableForVectorization by default) will decide.
1813 std::optional<bool> shouldInterchange;
1814 for (RuleTy RT : Profitabilities) {
1815 switch (RT) {
1816 case RuleTy::PerLoopCacheAnalysis: {
1817 CacheCost *CC = CCM.getCacheCost();
1818 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1819 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1820 break;
1821 }
1822 case RuleTy::PerInstrOrderCost:
1823 shouldInterchange = isProfitablePerInstrOrderCost();
1824 break;
1825 case RuleTy::ForVectorization:
1826 shouldInterchange =
1827 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1828 break;
1829 case RuleTy::Ignore:
1830 llvm_unreachable("Option 'ignore' is not supported with other options");
1831 break;
1832 }
1833
1834 // If this rule could determine the profitability, don't call subsequent
1835 // rules.
1836 if (shouldInterchange.has_value())
1837 break;
1838 }
1839
1840 if (!shouldInterchange.has_value()) {
1841 ORE->emit([&]() {
1842 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1843 InnerLoop->getStartLoc(),
1844 InnerLoop->getHeader())
1845 << "Insufficient information to calculate the cost of loop for "
1846 "interchange.";
1847 });
1848 return false;
1849 } else if (!shouldInterchange.value()) {
1850 ORE->emit([&]() {
1851 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1852 InnerLoop->getStartLoc(),
1853 InnerLoop->getHeader())
1854 << "Interchanging loops is not considered to improve cache "
1855 "locality nor vectorization.";
1856 });
1857 return false;
1858 }
1859 return true;
1860}
1861
1862void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1863 Loop *InnerLoop) {
1864 for (Loop *L : *OuterLoop)
1865 if (L == InnerLoop) {
1866 OuterLoop->removeChildLoop(L);
1867 return;
1868 }
1869 llvm_unreachable("Couldn't find loop");
1870}
1871
1872/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1873/// new inner and outer loop after interchanging: NewInner is the original
1874/// outer loop and NewOuter is the original inner loop.
1875///
1876/// Before interchanging, we have the following structure
1877/// Outer preheader
1878// Outer header
1879// Inner preheader
1880// Inner header
1881// Inner body
1882// Inner latch
1883// outer bbs
1884// Outer latch
1885//
1886// After interchanging:
1887// Inner preheader
1888// Inner header
1889// Outer preheader
1890// Outer header
1891// Inner body
1892// outer bbs
1893// Outer latch
1894// Inner latch
1895void LoopInterchangeTransform::restructureLoops(
1896 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1897 BasicBlock *OrigOuterPreHeader) {
1898 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1899 // The original inner loop preheader moves from the new inner loop to
1900 // the parent loop, if there is one.
1901 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1902 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1903
1904 // Switch the loop levels.
1905 if (OuterLoopParent) {
1906 // Remove the loop from its parent loop.
1907 removeChildLoop(OuterLoopParent, NewInner);
1908 removeChildLoop(NewInner, NewOuter);
1909 OuterLoopParent->addChildLoop(NewOuter);
1910 } else {
1911 removeChildLoop(NewInner, NewOuter);
1912 LI->changeTopLevelLoop(NewInner, NewOuter);
1913 }
1914 while (!NewOuter->isInnermost())
1915 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1916 NewOuter->addChildLoop(NewInner);
1917
1918 // BBs from the original inner loop.
1919 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1920
1921 // Add BBs from the original outer loop to the original inner loop (excluding
1922 // BBs already in inner loop)
1923 for (BasicBlock *BB : NewInner->blocks())
1924 if (LI->getLoopFor(BB) == NewInner)
1925 NewOuter->addBlockEntry(BB);
1926
1927 // Now remove inner loop header and latch from the new inner loop and move
1928 // other BBs (the loop body) to the new inner loop.
1929 BasicBlock *OuterHeader = NewOuter->getHeader();
1930 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1931 for (BasicBlock *BB : OrigInnerBBs) {
1932 // Nothing will change for BBs in child loops.
1933 if (LI->getLoopFor(BB) != NewOuter)
1934 continue;
1935 // Remove the new outer loop header and latch from the new inner loop.
1936 if (BB == OuterHeader || BB == OuterLatch)
1937 NewInner->removeBlockFromLoop(BB);
1938 else
1939 LI->changeLoopFor(BB, NewInner);
1940 }
1941
1942 // The preheader of the original outer loop becomes part of the new
1943 // outer loop.
1944 NewOuter->addBlockEntry(OrigOuterPreHeader);
1945 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1946
1947 // Tell SE that we move the loops around.
1948 SE->forgetLoop(NewOuter);
1949}
1950
1951/// User can write, or optimizers can generate the reduction for inner loop.
1952/// To make the interchange valid, apply Reduction2Mem by moving the
1953/// initializer and store instructions into the inner loop. So far we only
1954/// handle cases where the reduction variable is initialized to a constant.
1955/// For example, below code:
1956///
1957/// loop:
1958/// re = phi<0.0, next>
1959/// next = re op ...
1960/// endloop
1961/// reduc_sum = phi<next> // lcssa phi
1962/// MEM_REF[idx] = reduc_sum // LcssaStore
1963///
1964/// is transformed into:
1965///
1966/// loop:
1967/// tmp = MEM_REF[idx];
1968/// new_var = !first_iteration ? tmp : 0.0;
1969/// next = new_var op ...
1970/// MEM_REF[idx] = next; // after moving
1971/// endloop
1972///
1973/// In this way the initial const is used in the first iteration of loop.
1974void LoopInterchangeTransform::reduction2Memory() {
1976 LIL.getInnerReductions();
1977
1978 assert(InnerReductions.size() == 1 &&
1979 "So far we only support at most one reduction.");
1980
1981 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
1982 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1983 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1984
1985 // Check if it's the first iteration.
1986 LLVMContext &Context = InnerLoopHeader->getContext();
1987 PHINode *FirstIter =
1988 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
1989 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
1990 InnerLoop->getLoopPreheader());
1991 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
1992 InnerLoop->getLoopLatch());
1993 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1994
1995 // When the reduction is initialized from a constant value, we need to add
1996 // a stmt loading from the memory object to target basic block in inner
1997 // loop.
1998 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
1999
2000 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
2001 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
2002
2003 // Replace all uses of the reduction variable with a new variable.
2004 SR.Reduction->replaceAllUsesWith(NewVar);
2005
2006 // Move store instruction into inner loop, just after reduction next's
2007 // definition.
2008 SR.LcssaStore->setOperand(0, SR.Next);
2009 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
2010}
2011
2012bool LoopInterchangeTransform::transform(
2013 ArrayRef<Instruction *> DropNoWrapInsts,
2014 ArrayRef<Instruction *> DropNoInfInsts) {
2015 bool Transformed = false;
2016
2018 LIL.getInnerReductions();
2019 if (InnerReductions.size() == 1)
2020 reduction2Memory();
2021
2022 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2023 auto &InductionPHIs = LIL.getInnerLoopInductions();
2024 if (InductionPHIs.empty()) {
2025 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
2026 return false;
2027 }
2028
2029 SmallVector<Instruction *, 8> InnerIndexVarList;
2030 for (PHINode *CurInductionPHI : InductionPHIs) {
2031 Instruction *IncomingValue = dyn_cast<Instruction>(
2032 CurInductionPHI->getIncomingValueForBlock(InnerLoop->getLoopLatch()));
2033 assert(IncomingValue &&
2034 "Incoming value from loop latch isn't an instruction");
2035 if (is_contained(InductionPHIs, IncomingValue))
2036 continue;
2037 InnerIndexVarList.push_back(IncomingValue);
2038 }
2039
2040 // Create a new latch block for the inner loop. We split at the
2041 // current latch's terminator and then move the condition and all
2042 // operands that are not either loop-invariant or the induction PHI into the
2043 // new latch block.
2044 BasicBlock *NewLatch =
2045 SplitBlock(InnerLoop->getLoopLatch(),
2046 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2047
2048 SmallSetVector<Instruction *, 4> WorkList;
2049 unsigned i = 0;
2050 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2051 for (; i < WorkList.size(); i++) {
2052 // Duplicate instruction and move it to the new latch. Update uses that
2053 // have been moved.
2054 Instruction *NewI = WorkList[i]->clone();
2055 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
2056 assert(!NewI->mayHaveSideEffects() &&
2057 "Moving instructions with side-effects may change behavior of "
2058 "the loop nest!");
2059 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
2060 Instruction *UserI = cast<Instruction>(U.getUser());
2061 if (!InnerLoop->contains(UserI->getParent()) ||
2062 UserI->getParent() == NewLatch ||
2063 llvm::is_contained(InductionPHIs, UserI))
2064 U.set(NewI);
2065 }
2066 // Add operands of moved instruction to the worklist, except if they are
2067 // outside the inner loop or are the induction PHI.
2068 for (Value *Op : WorkList[i]->operands()) {
2070 if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
2071 llvm::is_contained(InductionPHIs, OpI))
2072 continue;
2073 WorkList.insert(OpI);
2074 }
2075 }
2076 };
2077
2078 // FIXME: Should we interchange when we have a constant condition?
2081 ->getCondition());
2082 if (CondI)
2083 WorkList.insert(CondI);
2084 MoveInstructions();
2085 for (Instruction *InnerIndexVar : InnerIndexVarList)
2086 WorkList.insert(cast<Instruction>(InnerIndexVar));
2087 MoveInstructions();
2088
2089 // Ensure the inner loop phi nodes have a separate basic block.
2090 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2091 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2092 InnerLoopHeader->getTerminator()) {
2093 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2094 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2095 }
2096
2097 // Instructions in the original inner loop preheader may depend on values
2098 // defined in the outer loop header. Move them there, because the original
2099 // inner loop preheader will become the entry into the interchanged loop nest.
2100 // Currently we move all instructions and rely on LICM to move invariant
2101 // instructions outside the loop nest.
2102 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2103 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2104
2105 if (InnerLoopPreHeader != OuterLoopHeader) {
2106 // Eliminate PHIs in the inner-loop preheader.
2107 for (PHINode &P : make_early_inc_range(InnerLoopPreHeader->phis())) {
2108 assert(P.getNumIncomingValues() == 1 &&
2109 "Expected single-incoming PHIs in inner loop preheader");
2110 P.replaceAllUsesWith(P.getIncomingValue(0));
2111 P.eraseFromParent();
2112 }
2113 for (Instruction &I :
2114 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2115 std::prev(InnerLoopPreHeader->end()))))
2116 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2117 }
2118
2119 Transformed |= adjustLoopLinks();
2120 if (!Transformed) {
2121 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2122 return false;
2123 }
2124
2125 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2126 // calculations.
2127 for (Instruction *Reduction : DropNoWrapInsts) {
2128 Reduction->setHasNoSignedWrap(false);
2129 Reduction->setHasNoUnsignedWrap(false);
2130 }
2131 for (Instruction *I : DropNoInfInsts)
2132 I->setHasNoInfs(false);
2133
2134 return true;
2135}
2136
2137/// \brief Move all instructions except the terminator from FromBB right before
2138/// InsertBefore
2139static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2140 BasicBlock *ToBB = InsertBefore->getParent();
2141
2142 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2143 FromBB->getTerminator()->getIterator());
2144}
2145
2146/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2147static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2148 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2149 // from BB1 afterwards.
2150 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2151 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2152 for (Instruction *I : TempInstrs)
2153 I->removeFromParent();
2154
2155 // Move instructions from BB2 to BB1.
2156 moveBBContents(BB2, BB1->getTerminator());
2157
2158 // Move instructions from TempInstrs to BB2.
2159 for (Instruction *I : TempInstrs)
2160 I->insertBefore(BB2->getTerminator()->getIterator());
2161}
2162
2163// Update BI to jump to NewBB instead of OldBB. Records updates to the
2164// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2165// \p OldBB is exactly once in BI's successor list.
2166static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2167 BasicBlock *NewBB,
2168 std::vector<DominatorTree::UpdateType> &DTUpdates,
2169 bool MustUpdateOnce = true) {
2170 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2171 "BI must jump to OldBB exactly once.");
2172 bool Changed = false;
2173 for (Use &Op : Term->operands())
2174 if (Op == OldBB) {
2175 Op.set(NewBB);
2176 Changed = true;
2177 }
2178
2179 if (Changed) {
2180 DTUpdates.push_back(
2181 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2182 DTUpdates.push_back(
2183 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2184 }
2185 assert(Changed && "Expected a successor to be updated");
2186}
2187
2188// Move Lcssa PHIs to the right place.
2189static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2190 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2191 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2192 Loop *InnerLoop, LoopInfo *LI) {
2193
2194 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2195 // defined either in the header or latch. Those blocks will become header and
2196 // latch of the new outer loop, and the only possible users can PHI nodes
2197 // in the exit block of the loop nest or the outer loop header (reduction
2198 // PHIs, in that case, the incoming value must be defined in the inner loop
2199 // header). We can just substitute the user with the incoming value and remove
2200 // the PHI.
2201 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2202 assert(P.getNumIncomingValues() == 1 &&
2203 "Only loops with a single exit are supported!");
2204
2205 // Incoming values are guaranteed be instructions currently.
2206 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
2207 // In case of multi-level nested loops, follow LCSSA to find the incoming
2208 // value defined from the innermost loop.
2209 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
2210 // Skip phis with incoming values from the inner loop body, excluding the
2211 // header and latch.
2212 if (IncIInnerMost->getParent() != InnerLatch &&
2213 IncIInnerMost->getParent() != InnerHeader)
2214 continue;
2215
2216 assert(all_of(P.users(),
2217 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2218 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2219 IncI->getParent() == InnerHeader) ||
2220 cast<PHINode>(U)->getParent() == OuterExit;
2221 }) &&
2222 "Can only replace phis iff the uses are in the loop nest exit or "
2223 "the incoming value is defined in the inner header (it will "
2224 "dominate all loop blocks after interchanging)");
2225 P.replaceAllUsesWith(IncI);
2226 P.eraseFromParent();
2227 }
2228
2229 SmallVector<PHINode *, 8> LcssaInnerExit(
2230 llvm::make_pointer_range(InnerExit->phis()));
2231
2232 SmallVector<PHINode *, 8> LcssaInnerLatch(
2233 llvm::make_pointer_range(InnerLatch->phis()));
2234
2235 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2236 // If a PHI node has users outside of InnerExit, it has a use outside the
2237 // interchanged loop and we have to preserve it. We move these to
2238 // InnerLatch, which will become the new exit block for the innermost
2239 // loop after interchanging.
2240 for (PHINode *P : LcssaInnerExit)
2241 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2242
2243 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2244 // and we have to move them to the new inner latch.
2245 for (PHINode *P : LcssaInnerLatch)
2246 P->moveBefore(InnerExit->getFirstNonPHIIt());
2247
2248 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2249 // incoming values defined in the outer loop, we have to add a new PHI
2250 // in the inner loop latch, which became the exit block of the outer loop,
2251 // after interchanging.
2252 if (OuterExit) {
2253 for (PHINode &P : OuterExit->phis()) {
2254 if (P.getNumIncomingValues() != 1)
2255 continue;
2256 // Skip Phis with incoming values defined in the inner loop. Those should
2257 // already have been updated.
2258 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2259 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2260 continue;
2261
2262 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2263 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2264 NewPhi->setIncomingBlock(0, OuterLatch);
2265 // We might have incoming edges from other BBs, i.e., the original outer
2266 // header.
2267 for (auto *Pred : predecessors(InnerLatch)) {
2268 if (Pred == OuterLatch)
2269 continue;
2270 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2271 }
2272 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2273 P.setIncomingValue(0, NewPhi);
2274 }
2275 }
2276
2277 // Now adjust the incoming blocks for the LCSSA PHIs.
2278 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2279 // with the new latch.
2280 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2281}
2282
2283/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2284/// block: the outer loop latch block does not need to be exit block of the
2285/// inner loop. Consider a loop that was in LCSSA form, but then some
2286/// transformation like loop-unswitch comes along and creates an empty block,
2287/// where BB5 in this example is the outer loop latch block:
2288///
2289/// BB4:
2290/// br label %BB5
2291/// BB5:
2292/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2293/// br outer.header
2294///
2295/// Interchange then brings it in LCSSA form again resulting in this chain of
2296/// single-input phi nodes:
2297///
2298/// BB4:
2299/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2300/// br label %BB5
2301/// BB5:
2302/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2303///
2304/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2305/// use before the def if we don't check this. The solution is to simplify
2306/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2307///
2308static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2309 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2310 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2311
2312 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2313 if (OuterLoopLatch == InnerLoopExit)
2314 return;
2315
2316 // Collect and remove phis in non-exit blocks if they have 1 input.
2318 llvm::make_pointer_range(OuterLoopLatch->phis()));
2319 for (PHINode *Phi : Phis) {
2320 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2321 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2322 << "\n");
2323 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2324 Phi->eraseFromParent();
2325 }
2326}
2327
2328bool LoopInterchangeTransform::adjustLoopBranches() {
2329 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2330 std::vector<DominatorTree::UpdateType> DTUpdates;
2331
2332 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2333 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2334
2335 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2336 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2337 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2338
2339 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2340
2341 // Ensure that both preheaders do not contain PHI nodes and have single
2342 // predecessors. This allows us to move them easily. We use
2343 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2344 // preheaders do not satisfy those conditions.
2345 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2346 !OuterLoopPreHeader->getUniquePredecessor())
2347 OuterLoopPreHeader =
2348 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2349 if (InnerLoopPreHeader == OuterLoop->getHeader())
2350 InnerLoopPreHeader =
2351 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2352
2353 // Adjust the loop preheader
2354 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2355 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2356 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2357 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2358 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2359 BasicBlock *InnerLoopLatchPredecessor =
2360 InnerLoopLatch->getUniquePredecessor();
2361 BasicBlock *InnerLoopLatchSuccessor;
2362 BasicBlock *OuterLoopLatchSuccessor;
2363
2364 CondBrInst *OuterLoopLatchBI =
2365 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2366 CondBrInst *InnerLoopLatchBI =
2367 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2368 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2369 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2370
2371 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2372 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2373 !InnerLoopHeaderBI)
2374 return false;
2375
2376 Instruction *InnerLoopLatchPredecessorBI =
2377 InnerLoopLatchPredecessor->getTerminator();
2378 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2379
2380 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2381 return false;
2382 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2383 if (!InnerLoopHeaderSuccessor)
2384 return false;
2385
2386 // Adjust Loop Preheader and headers.
2387 // The branches in the outer loop predecessor and the outer loop header can
2388 // be unconditional branches or conditional branches with duplicates. Consider
2389 // this when updating the successors.
2390 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2391 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2392 // The outer loop header might or might not branch to the outer latch.
2393 // We are guaranteed to branch to the inner loop preheader.
2394 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2395 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2396 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2397 DTUpdates,
2398 /*MustUpdateOnce=*/false);
2399 }
2400 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2401 InnerLoopHeaderSuccessor, DTUpdates,
2402 /*MustUpdateOnce=*/false);
2403
2404 // Adjust reduction PHI's now that the incoming block has changed.
2405 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2406 OuterLoopHeader);
2407
2408 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2409 OuterLoopPreHeader, DTUpdates);
2410
2411 // -------------Adjust loop latches-----------
2412 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2413 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2414 else
2415 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2416
2417 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2418 InnerLoopLatchSuccessor, DTUpdates);
2419
2420 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2421 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2422 else
2423 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2424
2425 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2426 OuterLoopLatchSuccessor, DTUpdates);
2427 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2428 DTUpdates);
2429
2430 DT->applyUpdates(DTUpdates);
2431 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2432 OuterLoopPreHeader);
2433
2434 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2435 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2436 InnerLoop, LI);
2437 // For PHIs in the exit block of the outer loop, outer's latch has been
2438 // replaced by Inners'.
2439 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2440
2441 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2442 // Now update the reduction PHIs in the inner and outer loop headers.
2443 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2444 for (PHINode &PHI : InnerLoopHeader->phis())
2445 if (OuterInnerReductions.contains(&PHI))
2446 InnerLoopPHIs.push_back(&PHI);
2447
2448 for (PHINode &PHI : OuterLoopHeader->phis())
2449 if (OuterInnerReductions.contains(&PHI))
2450 OuterLoopPHIs.push_back(&PHI);
2451
2452 // Now move the remaining reduction PHIs from outer to inner loop header and
2453 // vice versa. The PHI nodes must be part of a reduction across the inner and
2454 // outer loop and all the remains to do is and updating the incoming blocks.
2455 for (PHINode *PHI : OuterLoopPHIs) {
2456 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2457 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2458 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2459 }
2460 for (PHINode *PHI : InnerLoopPHIs) {
2461 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2462 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2463 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2464 }
2465
2466 // Update the incoming blocks for moved PHI nodes.
2467 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2468 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2469 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2470 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2471
2472 // Values defined in the outer loop header could be used in the inner loop
2473 // latch. In that case, we need to create LCSSA phis for them, because after
2474 // interchanging they will be defined in the new inner loop and used in the
2475 // new outer loop.
2476 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2477 for (Instruction &I :
2478 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2479 MayNeedLCSSAPhis.push_back(&I);
2480 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2481
2482 return true;
2483}
2484
2485bool LoopInterchangeTransform::adjustLoopLinks() {
2486 // Adjust all branches in the inner and outer loop.
2487 bool Changed = adjustLoopBranches();
2488 if (Changed) {
2489 // We have interchanged the preheaders so we need to interchange the data in
2490 // the preheaders as well. This is because the content of the inner
2491 // preheader was previously executed inside the outer loop.
2492 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2493 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2494 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2495 }
2496 return Changed;
2497}
2498
2502 LPMUpdater &U) {
2503 Function &F = *LN.getParent();
2504 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2505
2507
2508 // Ensure minimum depth of the loop nest to do the interchange.
2509 if (!hasSupportedLoopDepth(LoopList, ORE))
2510 return PreservedAnalyses::all();
2511 // Ensure computable loop nest.
2512 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2513 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2514 return PreservedAnalyses::all();
2515 }
2516
2517 ORE.emit([&]() {
2518 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2521 << "Computed dependence info, invoking the transform.";
2522 });
2523
2524 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2525 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2526 return PreservedAnalyses::all();
2527 U.markLoopNestChanged(true);
2529}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
DXIL Resource Access
#define DEBUG_TYPE
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:348
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:253
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
std::optional< const SCEV * > getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L)
If \S contains an affine addrec for L, return the step recurrence of it.
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static cl::opt< unsigned int > MaxMemInstrRatio("loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden, cl::desc("Maximum number of load/store instructions squared in relation to " "the total number of instructions. Higher value may lead to more " "interchanges at the cost of compile-time"))
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
We currently only support LCSSA PHI nodes in the inner loop exit if their users are either of the fol...
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
void changeLoopFor(const BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
StringRef getName() const
Definition LoopInfo.h:407
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getAbsExpr(const SCEV *Op, bool IsNSW)
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...