LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
64 "loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
95 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
100 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>({RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
123 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 NumInsts++;
183 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
184 if (!Ld->isSimple())
185 return false;
186 MemInstr.push_back(&I);
187 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
188 if (!St->isSimple())
189 return false;
190 MemInstr.push_back(&I);
191 }
192 }
193 }
194
195 // To populate the dependence matrix, we perform dependence test for each pair
196 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
197 // that even if the number of memory instructions is small, the analysis can
198 // still be expensive if the most of the instructions in the loop are memory
199 // instructions. On the other hand, if the number of memory instructions is
200 // not small, but the loop is large (i.e., it contains many non-memory
201 // instructions), the analysis can still be affordable.
202 unsigned NumMemInstr = MemInstr.size();
203 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
204 << " Loads and Stores to analyze\n");
205 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
206 ORE->emit([&]() {
207 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
208 L->getStartLoc(), L->getHeader())
209 << "Number of loads/stores exceeded, the supported maximum can be "
210 "increased with option -loop-interchange-max-mem-instr-ratio.";
211 });
212 return false;
213 }
214 ValueVector::iterator I, IE, J, JE;
215
216 // Manage direction vectors that are already seen. Map each direction vector
217 // to an index of DepMatrix at which it is stored.
219
220 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
221 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
222 std::vector<char> Dep;
225 // Ignore Input dependencies.
226 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
227 continue;
228 // Track Output, Flow, and Anti dependencies.
229 if (auto D = DI->depends(Src, Dst)) {
230 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
231 // If the direction vector is negative, normalize it to
232 // make it non-negative.
233 if (D->normalize(SE))
234 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
235 LLVM_DEBUG(StringRef DepType =
236 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
237 dbgs() << "Found " << DepType
238 << " dependency between Src and Dst\n"
239 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
240 unsigned Levels = D->getLevels();
241 char Direction;
242 for (unsigned II = 1; II <= Levels; ++II) {
243 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
244 // or `=`, for which we don't have an equivalent representation, so
245 // that the conservative approximation is necessary. The same goes for
246 // `DVEntry::GE`.
247 // TODO: Use of fine-grained expressions allows for more accurate
248 // analysis.
249 unsigned Dir = D->getDirection(II);
250 if (Dir == Dependence::DVEntry::LT)
251 Direction = '<';
252 else if (Dir == Dependence::DVEntry::GT)
253 Direction = '>';
254 else if (Dir == Dependence::DVEntry::EQ)
255 Direction = '=';
256 else
257 Direction = '*';
258 Dep.push_back(Direction);
259 }
260
261 // If the Dependence object doesn't have any information, fill the
262 // dependency vector with '*'.
263 if (D->isConfused()) {
264 assert(Dep.empty() && "Expected empty dependency vector");
265 Dep.assign(Level, '*');
266 }
267
268 while (Dep.size() != Level) {
269 Dep.push_back('I');
270 }
271
272 // If all the elements of any direction vector have only '*', legality
273 // can't be proven. Exit early to save compile time.
274 if (all_of(Dep, equal_to('*'))) {
275 ORE->emit([&]() {
276 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
277 L->getStartLoc(), L->getHeader())
278 << "All loops have dependencies in all directions.";
279 });
280 return false;
281 }
282
283 // Test whether the dependency is forward or not.
284 bool IsKnownForward = true;
285 if (Src->getParent() != Dst->getParent()) {
286 // In general, when Src and Dst are in different BBs, the execution
287 // order of them within a single iteration is not guaranteed. Treat
288 // conservatively as not-forward dependency in this case.
289 IsKnownForward = false;
290 } else {
291 // Src and Dst are in the same BB. If they are the different
292 // instructions, Src should appear before Dst in the BB as they are
293 // stored to MemInstr in that order.
294 assert((Src == Dst || inThisOrder(Src, Dst)) &&
295 "Unexpected instructions");
296
297 // If the Dependence object is reversed (due to normalization), it
298 // represents the dependency from Dst to Src, meaning it is a backward
299 // dependency. Otherwise it should be a forward dependency.
300 bool IsReversed = D->getSrc() != Src;
301 if (IsReversed)
302 IsKnownForward = false;
303 }
304
305 // Initialize the last element. Assume forward dependencies only; it
306 // will be updated later if there is any non-forward dependency.
307 Dep.push_back('<');
308
309 // The last element should express the "summary" among one or more
310 // direction vectors whose first N elements are the same (where N is
311 // the depth of the loop nest). Hence we exclude the last element from
312 // the Seen map.
313 auto [Ite, Inserted] = Seen.try_emplace(
314 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
315
316 // Make sure we only add unique entries to the dependency matrix.
317 if (Inserted)
318 DepMatrix.push_back(Dep);
319
320 // If we cannot prove that this dependency is forward, change the last
321 // element of the corresponding entry. Since a `[... *]` dependency
322 // includes a `[... <]` dependency, we do not need to keep both and
323 // change the existing entry instead.
324 if (!IsKnownForward)
325 DepMatrix[Ite->second].back() = '*';
326 }
327 }
328 }
329
330 return true;
331}
332
333// A loop is moved from index 'from' to an index 'to'. Update the Dependence
334// matrix by exchanging the two columns.
335static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
336 unsigned ToIndx) {
337 for (auto &Row : DepMatrix)
338 std::swap(Row[ToIndx], Row[FromIndx]);
339}
340
341// Check if a direction vector is lexicographically positive. Return true if it
342// is positive, nullopt if it is "zero", otherwise false.
343// [Theorem] A permutation of the loops in a perfect nest is legal if and only
344// if the direction matrix, after the same permutation is applied to its
345// columns, has no ">" direction as the leftmost non-"=" direction in any row.
346static std::optional<bool>
347isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
348 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
349 if (Direction == '<')
350 return true;
351 if (Direction == '>' || Direction == '*')
352 return false;
353 }
354 return std::nullopt;
355}
356
357// Checks if it is legal to interchange 2 loops.
358static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
359 unsigned InnerLoopId,
360 unsigned OuterLoopId) {
361 unsigned NumRows = DepMatrix.size();
362 std::vector<char> Cur;
363 // For each row check if it is valid to interchange.
364 for (unsigned Row = 0; Row < NumRows; ++Row) {
365 // Create temporary DepVector check its lexicographical order
366 // before and after swapping OuterLoop vs InnerLoop
367 Cur = DepMatrix[Row];
368
369 // If the surrounding loops already ensure that the direction vector is
370 // lexicographically positive, nothing within the loop will be able to break
371 // the dependence. In such a case we can skip the subsequent check.
372 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
373 continue;
374
375 // Check if the direction vector is lexicographically positive (or zero)
376 // for both before/after exchanged. Ignore the last element because it
377 // doesn't affect the legality.
378 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
379 return false;
380 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
381 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
382 return false;
383 }
384 return true;
385}
386
387static void populateWorklist(Loop &L, LoopVector &LoopList) {
388 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
389 << L.getHeader()->getParent()->getName() << " Loop: %"
390 << L.getHeader()->getName() << '\n');
391 assert(LoopList.empty() && "LoopList should initially be empty!");
392 Loop *CurrentLoop = &L;
393 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
394 while (!Vec->empty()) {
395 // The current loop has multiple subloops in it hence it is not tightly
396 // nested.
397 // Discard all loops above it added into Worklist.
398 if (Vec->size() != 1) {
399 LoopList = {};
400 return;
401 }
402
403 LoopList.push_back(CurrentLoop);
404 CurrentLoop = Vec->front();
405 Vec = &CurrentLoop->getSubLoops();
406 }
407 LoopList.push_back(CurrentLoop);
408}
409
412 unsigned LoopNestDepth = LoopList.size();
413 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
414 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
415 << ", the supported range is [" << MinLoopNestDepth
416 << ", " << MaxLoopNestDepth << "].\n");
417 Loop *OuterLoop = LoopList.front();
418 ORE.emit([&]() {
419 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
420 OuterLoop->getStartLoc(),
421 OuterLoop->getHeader())
422 << "Unsupported depth of loop nest, the supported range is ["
423 << std::to_string(MinLoopNestDepth) << ", "
424 << std::to_string(MaxLoopNestDepth) << "].\n";
425 });
426 return false;
427 }
428 return true;
429}
430
432 ArrayRef<Loop *> LoopList) {
433 for (Loop *L : LoopList) {
434 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
435 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
436 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
437 return false;
438 }
439 if (L->getNumBackEdges() != 1) {
440 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
441 return false;
442 }
443 if (!L->getExitingBlock()) {
444 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
445 return false;
446 }
447 }
448 return true;
449}
450
451namespace {
452
453/// LoopInterchangeLegality checks if it is legal to interchange the loop.
454class LoopInterchangeLegality {
455public:
456 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
457 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
458 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
459
460 /// Check if the loops can be interchanged.
461 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
462 CharMatrix &DepMatrix);
463
464 /// Check if the loop structure is understood. We do not handle triangular
465 /// loops for now.
466 bool isLoopStructureUnderstood();
467
468 bool currentLimitations();
469
470 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
471 return OuterInnerReductions;
472 }
473
474 const ArrayRef<PHINode *> getInnerLoopInductions() const {
475 return InnerLoopInductions;
476 }
477
478 ArrayRef<Instruction *> getHasNoWrapReductions() const {
479 return HasNoWrapReductions;
480 }
481
482 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Traverse all PHI nodes in the header of each loop in the loop nest
511 /// starting from \p OuterLoop, and perform the following checks:
512 ///
513 /// - Identify induction variables in the child loop of \p OuterLoop.
514 /// - Check for reductions across the inner loop and \p OuterLoop.
515 /// - Detect unsupported PHI nodes.
516 ///
517 /// Return false if any unsupported PHI node is found or if no induction
518 /// variable is found in the child loop of \p OuterLoop. Otherwise return
519 /// true.
520 bool checkInductionsAndReductions(Loop *OuterLoop);
521
522 /// Detect and record the reduction of the inner loop. Add them to
523 /// InnerReductions.
524 ///
525 /// innerloop:
526 /// Re = phi<0.0, Next>
527 /// Next = Re op ...
528 /// OuterLoopLatch:
529 /// Lcssa = phi<Next> ; lcssa phi
530 /// store Lcssa, MemRef ; LcssaStore
531 ///
532 bool isInnerReduction(Loop *L, PHINode *Phi,
533 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
534
535 Loop *OuterLoop;
536 Loop *InnerLoop;
537
538 ScalarEvolution *SE;
539 DominatorTree *DT;
540
541 /// Interface to emit optimization remarks.
542 OptimizationRemarkEmitter *ORE;
543
544 /// Set of reduction PHIs taking part of a reduction across the inner and
545 /// outer loop.
546 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
547
548 /// Set of inner loop induction PHIs
549 SmallVector<PHINode *, 8> InnerLoopInductions;
550
551 /// Hold instructions that have nuw/nsw flags and involved in reductions,
552 /// like integer addition/multiplication. Those flags must be dropped when
553 /// interchanging the loops.
554 SmallVector<Instruction *, 4> HasNoWrapReductions;
555
556 /// Hold instructions that have ninf flags and involved in reductions. Those
557 /// flags must be dropped when interchanging the loops.
558 SmallVector<Instruction *, 4> HasNoInfInsts;
559
560 /// Vector of reductions in the inner loop.
561 SmallVector<InnerReduction, 8> InnerReductions;
562};
563
564/// Manages information utilized by the profitability check for cache. The main
565/// purpose of this class is to delay the computation of CacheCost until it is
566/// actually needed.
567class CacheCostManager {
568 Loop *OutermostLoop;
569 LoopStandardAnalysisResults *AR;
570 DependenceInfo *DI;
571
572 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
573 /// that the result can be nullptr.
574 std::optional<std::unique_ptr<CacheCost>> CC;
575
576 /// Maps each loop to an index representing the optimal position within the
577 /// loop-nest, as determined by the cache cost analysis.
578 DenseMap<const Loop *, unsigned> CostMap;
579
580 void computeIfUnitinialized();
581
582public:
583 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
584 DependenceInfo *DI)
585 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
586 CacheCost *getCacheCost();
587 const DenseMap<const Loop *, unsigned> &getCostMap();
588};
589
590/// LoopInterchangeProfitability checks if it is profitable to interchange the
591/// loop.
592class LoopInterchangeProfitability {
593public:
594 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
595 OptimizationRemarkEmitter *ORE)
596 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
597
598 /// Check if the loop interchange is profitable.
599 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
600 unsigned InnerLoopId, unsigned OuterLoopId,
601 CharMatrix &DepMatrix, CacheCostManager &CCM);
602
603private:
604 int getInstrOrderCost();
605 std::optional<bool> isProfitablePerLoopCacheAnalysis(
606 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
607 std::optional<bool> isProfitablePerInstrOrderCost();
608 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
609 unsigned OuterLoopId,
610 CharMatrix &DepMatrix);
611 Loop *OuterLoop;
612 Loop *InnerLoop;
613
614 /// Scev analysis.
615 ScalarEvolution *SE;
616
617 /// Interface to emit optimization remarks.
618 OptimizationRemarkEmitter *ORE;
619};
620
621/// LoopInterchangeTransform interchanges the loop.
622class LoopInterchangeTransform {
623public:
624 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
625 LoopInfo *LI, DominatorTree *DT,
626 const LoopInterchangeLegality &LIL)
627 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
628
629 /// Interchange OuterLoop and InnerLoop.
630 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
631 ArrayRef<Instruction *> DropNoInfInsts);
632 void reduction2Memory();
633 void restructureLoops(Loop *NewInner, Loop *NewOuter,
634 BasicBlock *OrigInnerPreHeader,
635 BasicBlock *OrigOuterPreHeader);
636 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
637
638private:
639 bool adjustLoopLinks();
640 bool adjustLoopBranches();
641
642 Loop *OuterLoop;
643 Loop *InnerLoop;
644
645 /// Scev analysis.
646 ScalarEvolution *SE;
647
648 LoopInfo *LI;
649 DominatorTree *DT;
650
651 const LoopInterchangeLegality &LIL;
652};
653
654struct LoopInterchange {
655 ScalarEvolution *SE = nullptr;
656 LoopInfo *LI = nullptr;
657 DependenceInfo *DI = nullptr;
658 DominatorTree *DT = nullptr;
659 LoopStandardAnalysisResults *AR = nullptr;
660
661 /// Interface to emit optimization remarks.
662 OptimizationRemarkEmitter *ORE;
663
664 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
665 DominatorTree *DT, LoopStandardAnalysisResults *AR,
666 OptimizationRemarkEmitter *ORE)
667 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
668
669 bool run(Loop *L) {
670 if (L->getParentLoop())
671 return false;
672 SmallVector<Loop *, 8> LoopList;
673 populateWorklist(*L, LoopList);
674 return processLoopList(LoopList);
675 }
676
677 bool run(LoopNest &LN) {
678 SmallVector<Loop *, 8> LoopList(LN.getLoops());
679 for (unsigned I = 1; I < LoopList.size(); ++I)
680 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
681 return false;
682 return processLoopList(LoopList);
683 }
684
685 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
686 // TODO: Add a better heuristic to select the loop to be interchanged based
687 // on the dependence matrix. Currently we select the innermost loop.
688 return LoopList.size() - 1;
689 }
690
691 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
692 bool Changed = false;
693
694 // Ensure proper loop nest depth.
695 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
696 "Unsupported depth of loop nest.");
697
698 unsigned LoopNestDepth = LoopList.size();
699
700 LLVM_DEBUG({
701 dbgs() << "Processing LoopList of size = " << LoopNestDepth
702 << " containing the following loops:\n";
703 for (auto *L : LoopList) {
704 dbgs() << " - ";
705 L->print(dbgs());
706 }
707 });
708
709 CharMatrix DependencyMatrix;
710 Loop *OuterMostLoop = *(LoopList.begin());
711 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
712 OuterMostLoop, DI, SE, ORE)) {
713 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
714 return false;
715 }
716
717 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
718 printDepMatrix(DependencyMatrix));
719
720 // Get the Outermost loop exit.
721 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
722 if (!LoopNestExit) {
723 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
724 << "' needs an unique exit block");
725 return false;
726 }
727
728 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
729 CacheCostManager CCM(LoopList[0], AR, DI);
730 // We try to achieve the globally optimal memory access for the loopnest,
731 // and do interchange based on a bubble-sort fasion. We start from
732 // the innermost loop, move it outwards to the best possible position
733 // and repeat this process.
734 for (unsigned j = SelecLoopId; j > 0; j--) {
735 bool ChangedPerIter = false;
736 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
737 bool Interchanged =
738 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
739 ChangedPerIter |= Interchanged;
740 Changed |= Interchanged;
741 }
742 // Early abort if there was no interchange during an entire round of
743 // moving loops outwards.
744 if (!ChangedPerIter)
745 break;
746 }
747 return Changed;
748 }
749
750 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
751 unsigned OuterLoopId,
752 std::vector<std::vector<char>> &DependencyMatrix,
753 CacheCostManager &CCM) {
754 Loop *OuterLoop = LoopList[OuterLoopId];
755 Loop *InnerLoop = LoopList[InnerLoopId];
756 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
757 << " and OuterLoopId = " << OuterLoopId << "\n");
758 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
759 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
760 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
761 << OuterLoop->getName() << "' and '"
762 << InnerLoop->getName() << "'\n");
763 return false;
764 }
765 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
766 << InnerLoop->getName()
767 << "' are legal to interchange\n");
768 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
769 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
770 DependencyMatrix, CCM)) {
771 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
772 << "' and '" << InnerLoop->getName()
773 << "' not profitable.\n");
774 return false;
775 }
776
777 ORE->emit([&]() {
778 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
779 InnerLoop->getStartLoc(),
780 InnerLoop->getHeader())
781 << "Loop interchanged with enclosing loop.";
782 });
783
784 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
785 LIT.transform(LIL.getHasNoWrapReductions(), LIL.getHasNoInfInsts());
786 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
787 << OuterLoop->getName() << "' and inner loop '"
788 << InnerLoop->getName() << "'\n");
789 LoopsInterchanged++;
790
791 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
792
793 // Loops interchanged, update LoopList accordingly.
794 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
795 // Update the DependencyMatrix
796 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
797
798 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
799 printDepMatrix(DependencyMatrix));
800
801 return true;
802 }
803};
804
805} // end anonymous namespace
806
807bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
808 Instruction *Skip) {
809 return any_of(*BB, [Skip](const Instruction &I) {
810 if (&I == Skip)
811 return false;
812 return I.mayHaveSideEffects() || I.mayReadFromMemory();
813 });
814}
815
816bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
817 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
818 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
819 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
820
821 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
822 << "' and '" << InnerLoop->getName()
823 << "' are tightly nested\n");
824
825 // In a perfectly nested loop the outer header branches only into the inner
826 // loop. If it can also reach the outer latch, it conditionally guards the
827 // inner loop (an imperfect nest), so the inner loop runs on only a subset of
828 // the outer iterations. Interchanging such a nest would run the inner loop on
829 // every outer iteration, including the guarded-off ones, which is illegal
830 // when the inner loop relies on the guard to terminate (e.g. an eq/ne exit
831 // whose trip count is degenerate once the guard is false). Reject by allowing
832 // the outer header to branch only into the inner loop.
833 //
834 // TODO: This is conservative. A guarded nest is still safe to interchange
835 // when the inner loop has a computable trip count that is empty exactly when
836 // the guard is false, e.g.:
837 // for (i = 0; i < N; i++)
838 // if (M > 0) // loop-invariant guard
839 // for (j = 0; j < M; j++) // empty when M <= 0
840 // A[j][i] = ...;
841 // Interchanging is legal here because the inner loop runs zero times on the
842 // guarded-off iterations.
843 for (BasicBlock *Succ : successors(OuterLoopHeader))
844 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader())
845 return false;
846
847 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
848
849 // The inner loop reduction pattern requires storing the LCSSA PHI in
850 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
851 // that store during checks.
852 Instruction *Skip = nullptr;
853 assert(InnerReductions.size() <= 1 &&
854 "So far we only support at most one reduction.");
855 if (InnerReductions.size() == 1)
856 Skip = InnerReductions[0].LcssaStore;
857
858 // We do not have any basic block in between now make sure the outer header
859 // and outer loop latch doesn't contain any unsafe instructions.
860 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
861 containsUnsafeInstructions(OuterLoopLatch, Skip))
862 return false;
863
864 // Also make sure the inner loop preheader does not contain any unsafe
865 // instructions. Note that all instructions in the preheader will be moved to
866 // the outer loop header when interchanging.
867 if (InnerLoopPreHeader != OuterLoopHeader &&
868 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
869 return false;
870
871 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
872 // Ensure the inner loop exit block flows to the outer loop latch possibly
873 // through empty blocks.
874 const BasicBlock &SuccInner =
875 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
876 if (&SuccInner != OuterLoopLatch) {
877 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
878 << " does not lead to the outer loop latch.\n";);
879 return false;
880 }
881 // The inner loop exit block does flow to the outer loop latch and not some
882 // other BBs, now make sure it contains safe instructions, since it will be
883 // moved into the (new) inner loop after interchange.
884 if (containsUnsafeInstructions(InnerLoopExit, Skip))
885 return false;
886
887 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
888 // We have a perfect loop nest.
889 return true;
890}
891
892bool LoopInterchangeLegality::isLoopStructureUnderstood() {
893 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
894 for (PHINode *InnerInduction : InnerLoopInductions) {
895 unsigned Num = InnerInduction->getNumOperands();
896 for (unsigned i = 0; i < Num; ++i) {
897 Value *Val = InnerInduction->getOperand(i);
898 if (isa<Constant>(Val))
899 continue;
901 if (!I)
902 return false;
903 // TODO: Handle triangular loops.
904 // e.g. for(int i=0;i<N;i++)
905 // for(int j=i;j<N;j++)
906 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
907 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
908 InnerLoopPreheader &&
909 !OuterLoop->isLoopInvariant(I)) {
910 return false;
911 }
912 }
913 }
914
915 // TODO: Handle triangular loops of another form.
916 // e.g. for(int i=0;i<N;i++)
917 // for(int j=0;j<i;j++)
918 // or,
919 // for(int i=0;i<N;i++)
920 // for(int j=0;j*i<N;j++)
921 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
922 CondBrInst *InnerLoopLatchBI =
923 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
924 if (!InnerLoopLatchBI)
925 return false;
926
927 CmpInst *InnerLoopCmp = dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition());
928 if (!InnerLoopCmp)
929 return false;
930
931 Value *Op0 = InnerLoopCmp->getOperand(0);
932 Value *Op1 = InnerLoopCmp->getOperand(1);
933
934 // LHS and RHS of the inner loop exit condition, e.g.,
935 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
936 Value *Left = nullptr;
937 Value *Right = nullptr;
938
939 // Check if V only involves inner loop induction variable.
940 // Return true if V is InnerInduction, or a cast from
941 // InnerInduction, or a binary operator that involves
942 // InnerInduction and a constant.
943 std::function<bool(Value *)> IsPathToInnerIndVar;
944 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
945 if (llvm::is_contained(InnerLoopInductions, V))
946 return true;
947 if (isa<Constant>(V))
948 return true;
950 if (!I)
951 return false;
952 if (isa<CastInst>(I))
953 return IsPathToInnerIndVar(I->getOperand(0));
955 return IsPathToInnerIndVar(I->getOperand(0)) &&
956 IsPathToInnerIndVar(I->getOperand(1));
957 return false;
958 };
959
960 // In case of multiple inner loop indvars, it is okay if LHS and RHS
961 // are both inner indvar related variables.
962 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
963 return true;
964
965 // Otherwise we check if the cmp instruction compares an inner indvar
966 // related variable (Left) with a outer loop invariant (Right).
967 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
968 Left = Op0;
969 Right = Op1;
970 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
971 Left = Op1;
972 Right = Op0;
973 }
974
975 if (Left == nullptr)
976 return false;
977
978 const SCEV *S = SE->getSCEV(Right);
979 if (!SE->isLoopInvariant(S, OuterLoop))
980 return false;
981
982 return true;
983}
984
985// If SV is a LCSSA PHI node with a single incoming value, return the incoming
986// value.
987static Value *followLCSSA(Value *SV) {
989 if (!PHI)
990 return SV;
991
992 if (PHI->getNumIncomingValues() != 1)
993 return SV;
994 return followLCSSA(PHI->getIncomingValue(0));
995}
996
998 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
999 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1002 // Detect floating point reduction only when it can be reordered.
1003 if (RD.getExactFPMathInst() != nullptr)
1004 return false;
1005
1006 RecurKind RK = RD.getRecurrenceKind();
1007 switch (RK) {
1008 case RecurKind::Or:
1009 case RecurKind::And:
1010 case RecurKind::Xor:
1011 case RecurKind::SMin:
1012 case RecurKind::SMax:
1013 case RecurKind::UMin:
1014 case RecurKind::UMax:
1015 return true;
1016
1017 // Interchanging the loops that contain AnyOf reduction is not always legal.
1018 // Especially, when the result value of the AnyOf is not loop-invariant with
1019 // respect to the outer loop, interchanging may change the semantics. The
1020 // following is an example of such case:
1021 // int A = {{ 1, 0 }, { 0, 1 }};
1022 // int red = 0;
1023 // for (int i = 0; i < 2; i++)
1024 // for (int j = 0; j < 2; j++)
1025 // red = (A[j][i] == 0) ? i + 1 : red;
1026 //
1027 // TODO: We may be able to support interchanging loops with AnyOf reduction
1028 // by checking the operand of the reduction is loop-invariant with respect
1029 // to the outer loop as well.
1030 case RecurKind::AnyOf:
1031 return false;
1032
1033 // Changing the order of floating-point operations may alter the results. If
1034 // a certain instruction has the ninf flag, it means that reordering can
1035 // produce a poison value, which may lead to undefined behavior. To prevent
1036 // this, we must drop the ninf flags if we decide to apply the
1037 // transformation.
1038 case RecurKind::FAdd:
1039 case RecurKind::FMul:
1040 case RecurKind::FMin:
1041 case RecurKind::FMax:
1046 case RecurKind::FMulAdd:
1047 for (Instruction *I : RD.getReductionOpChain(PHI, L))
1048 if (isa<FPMathOperator>(I) && I->hasNoInfs())
1049 HasNoInfInsts.push_back(I);
1050 return true;
1051
1052 // Change the order of integer addition/multiplication may change the
1053 // semantics. Consider the following case:
1054 //
1055 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1056 // int sum = 0;
1057 // for (int i = 0; i < 2; i++)
1058 // for (int j = 0; j < 2; j++)
1059 // sum += A[j][i];
1060 //
1061 // If the above loops are exchanged, the addition will cause an
1062 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1063 // addition/multiplication instructions when we actually exchanges the
1064 // loops.
1065 case RecurKind::Add:
1066 case RecurKind::Mul: {
1067 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1069
1070 // Bail out when we fail to collect reduction instructions chain.
1071 if (Ops.empty())
1072 return false;
1073
1074 for (Instruction *I : Ops) {
1075 assert(I->getOpcode() == OpCode &&
1076 "Expected the instruction to be the reduction operation");
1077 (void)OpCode;
1078
1079 // If the instruction has nuw/nsw flags, we must drop them when the
1080 // transformation is actually performed.
1081 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1082 HasNoWrapInsts.push_back(I);
1083 }
1084 return true;
1085 }
1086
1087 default:
1088 return false;
1089 }
1090 } else
1091 return false;
1092}
1093
1094// Check V's users to see if it is involved in a reduction in L.
1095static PHINode *
1097 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1098 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1099 // Reduction variables cannot be constants.
1100 if (isa<Constant>(V))
1101 return nullptr;
1102
1103 for (Value *User : V->users()) {
1105 if (PHI->getNumIncomingValues() == 1)
1106 continue;
1107
1108 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1109 return PHI;
1110 else
1111 return nullptr;
1112 }
1113 }
1114
1115 return nullptr;
1116}
1117
1118bool LoopInterchangeLegality::isInnerReduction(
1119 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1120
1121 // Only support reduction2Mem when the loop nest to be interchanged is
1122 // the innermost two loops.
1123 if (!L->isInnermost()) {
1124 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1125 ORE->emit([&]() {
1126 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1127 L->getStartLoc(), L->getHeader())
1128 << "Only supported when the loop is the innermost.";
1129 });
1130 return false;
1131 }
1132
1133 if (Phi->getNumIncomingValues() != 2)
1134 return false;
1135
1136 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1137 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1138
1139 // So far only supports constant initial value.
1140 if (!isa<Constant>(Init)) {
1141 LLVM_DEBUG(
1142 dbgs()
1143 << "Only supported for the reduction with a constant initial value.\n");
1144 ORE->emit([&]() {
1145 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1146 L->getStartLoc(), L->getHeader())
1147 << "Only supported for the reduction with a constant initial "
1148 "value.";
1149 });
1150 return false;
1151 }
1152
1153 // The reduction result must live in the inner loop.
1154 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1155 BasicBlock *BB = I->getParent();
1156 if (!L->contains(BB))
1157 return false;
1158 }
1159
1160 // The reduction should have only one user.
1161 if (!Phi->hasOneUser())
1162 return false;
1163
1164 // Check the reduction kind.
1165 if (!checkReductionKind(L, Phi, HasNoWrapInsts, HasNoInfInsts))
1166 return false;
1167
1168 // Find lcssa_phi in OuterLoop's Latch
1169 BasicBlock *ExitBlock = L->getExitBlock();
1170 if (!ExitBlock)
1171 return false;
1172
1173 PHINode *Lcssa = NULL;
1174 for (auto *U : Next->users()) {
1175 if (auto *P = dyn_cast<PHINode>(U)) {
1176 if (P == Phi)
1177 continue;
1178
1179 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1180 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1181 Lcssa = P;
1182 else
1183 return false;
1184 } else
1185 return false;
1186 }
1187 if (!Lcssa)
1188 return false;
1189
1190 if (!Lcssa->hasOneUser()) {
1191 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1192 "the outer loop.\n");
1193 ORE->emit([&]() {
1194 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1195 L->getStartLoc(), L->getHeader())
1196 << "Only supported when the reduction is used once in the outer "
1197 "loop.";
1198 });
1199 return false;
1200 }
1201
1202 StoreInst *LcssaStore =
1204 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1205 return false;
1206
1207 Value *MemRef = LcssaStore->getOperand(1);
1208 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1209
1210 // LcssaStore stores the reduction result in BB.
1211 // When the reduction is initialized from a constant value, we need to load
1212 // from the memory object into the target basic block of the inner loop. This
1213 // means the memory reference was used prematurely. So we must ensure that the
1214 // memory reference does not dominate the target basic block.
1215 // TODO: Move the memory reference definition into the loop header.
1216 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1217 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1218 "the inner loop.\n");
1219 ORE->emit([&]() {
1220 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1221 L->getStartLoc(), L->getHeader())
1222 << "Only supported when memory reference dominate the inner "
1223 "loop.";
1224 });
1225 return false;
1226 }
1227
1228 // Found a reduction in the inner loop.
1229 InnerReduction SR;
1230 SR.Reduction = Phi;
1231 SR.Init = Init;
1232 SR.Next = Next;
1233 SR.LcssaPhi = Lcssa;
1234 SR.LcssaStore = LcssaStore;
1235 SR.MemRef = MemRef;
1236 SR.ElemTy = ElemTy;
1237
1238 InnerReductions.push_back(SR);
1239 return true;
1240}
1241
1242bool LoopInterchangeLegality::checkInductionsAndReductions(Loop *OuterLoop) {
1243 auto ChildLoop = [](Loop *L) {
1244 assert(L->getSubLoops().size() <= 1 &&
1245 "Expect at most one child loop for now.");
1246 return L->getSubLoops().empty() ? nullptr : L->getSubLoops().front();
1247 };
1248
1249 Loop *InnerLoop = ChildLoop(OuterLoop);
1250 for (Loop *CurLoop = OuterLoop; CurLoop; CurLoop = ChildLoop(CurLoop)) {
1251 for (PHINode &PHI : CurLoop->getHeader()->phis()) {
1252 InductionDescriptor ID;
1253 if (InductionDescriptor::isInductionPHI(&PHI, CurLoop, SE, ID)) {
1254 if (CurLoop == InnerLoop) {
1255 const SCEV *Step = ID.getStep();
1256 if (!SE->isLoopInvariant(Step, OuterLoop))
1257 return false;
1258 InnerLoopInductions.push_back(&PHI);
1259 }
1260 continue;
1261 }
1262
1263 if (CurLoop == OuterLoop) {
1264 // PHIs in inner loops need to be part of a reduction in the outer loop,
1265 if (PHI.getNumIncomingValues() != 2) {
1266 LLVM_DEBUG(dbgs() << "Only PHI nodes in the outer loop header with 2 "
1267 "incoming values are supported.\n");
1268 return false;
1269 }
1270 // Check if we have a PHI node in the outer loop that has a reduction
1271 // result from the inner loop as an incoming value.
1272 Value *V = followLCSSA(
1273 PHI.getIncomingValueForBlock(OuterLoop->getLoopLatch()));
1274 PHINode *InnerRedPhi = findInnerReductionPhi(
1275 InnerLoop, V, HasNoWrapReductions, HasNoInfInsts);
1276
1277 // Reject if PHI has users other than InnerRedPhi. The typical case is
1278 // as follows:
1279 //
1280 // o.header:
1281 // %red.o = phi [ 0, ... ], [ %red.next, %o.latch ]
1282 // br label %i.header
1283 //
1284 // i.header:
1285 // %red.i = phi [ %red.o, %o.header ], [ %red.next, %i.latch ]
1286 // br label %i.body
1287 //
1288 // i.body:
1289 // store %red.o to %mem
1290 // ...
1291 //
1292 if (!InnerRedPhi ||
1293 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI) ||
1294 !all_of(PHI.users(),
1295 [InnerRedPhi](User *U) { return U == InnerRedPhi; })) {
1296 LLVM_DEBUG(
1297 dbgs()
1298 << "Failed to recognize PHI as an induction or reduction.\n");
1299 ORE->emit([&]() {
1300 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1301 OuterLoop->getStartLoc(),
1302 OuterLoop->getHeader())
1303 << "Only outer loops with induction or reduction PHI nodes "
1304 "can be interchanged currently.";
1305 });
1306 return false;
1307 }
1308
1309 OuterInnerReductions.insert(&PHI);
1310 OuterInnerReductions.insert(InnerRedPhi);
1311 } else {
1312 if (OuterInnerReductions.count(&PHI)) {
1313 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1314 } else if (EnableReduction2Memory &&
1315 isInnerReduction(CurLoop, &PHI, HasNoWrapReductions)) {
1316 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1317 << PHI << '\n');
1318 } else {
1319 ORE->emit([&]() {
1320 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1321 CurLoop->getStartLoc(),
1322 CurLoop->getHeader())
1323 << "Only inner loops with induction or reduction PHI nodes "
1324 "can be interchanged currently.";
1325 });
1326 return false;
1327 }
1328 }
1329 }
1330
1331 // For now we only support at most one reduction.
1332 if (InnerReductions.size() > 1) {
1333 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1334 ORE->emit([&]() {
1335 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1336 CurLoop->getStartLoc(),
1337 CurLoop->getHeader())
1338 << "Only supports at most one reduction.";
1339 });
1340 return false;
1341 }
1342 }
1343
1344 return !InnerLoopInductions.empty();
1345}
1346
1347// This function indicates the current limitations in the transform as a result
1348// of which we do not proceed.
1349bool LoopInterchangeLegality::currentLimitations() {
1350 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1351
1352 // transform currently expects the loop latches to also be the exiting
1353 // blocks.
1354 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1355 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1356 !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
1357 !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1358 LLVM_DEBUG(
1359 dbgs() << "Loops where the latch is not the exiting block are not"
1360 << " supported currently.\n");
1361 ORE->emit([&]() {
1362 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1363 OuterLoop->getStartLoc(),
1364 OuterLoop->getHeader())
1365 << "Loops where the latch is not the exiting block cannot be"
1366 " interchange currently.";
1367 });
1368 return true;
1369 }
1370
1371 // TODO: Triangular loops are not handled for now.
1372 if (!isLoopStructureUnderstood()) {
1373 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1374 ORE->emit([&]() {
1375 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1376 InnerLoop->getStartLoc(),
1377 InnerLoop->getHeader())
1378 << "Inner loop structure not understood currently.";
1379 });
1380 return true;
1381 }
1382
1383 // Currently, we do not support loops that have a predecessor entering the
1384 // loop via an indirectbr.
1385 for (Loop *L : {OuterLoop, InnerLoop}) {
1386 BasicBlock *Header = L->getHeader();
1387 for (BasicBlock *Pred : predecessors(Header)) {
1388 if (L->contains(Pred))
1389 continue;
1390 if (isa<IndirectBrInst>(Pred->getTerminator())) {
1391 LLVM_DEBUG(
1392 dbgs() << "Indirect branch found in the loop predecessor.\n");
1393 ORE->emit([&]() {
1394 return OptimizationRemarkMissed(DEBUG_TYPE, "IndirectBranchPreheader",
1395 L->getStartLoc(), L->getHeader())
1396 << "Indirect branch found in the loop predecessor.";
1397 });
1398 return true;
1399 }
1400 }
1401 }
1402
1403 // Currently, we do not support loops where the inner loop header has
1404 // duplicate successors.
1405 SmallPtrSet<BasicBlock *, 2> InnerLoopHeaderSuccs;
1406 for (BasicBlock *Succ : successors(InnerLoop->getHeader()))
1407 if (!InnerLoopHeaderSuccs.insert(Succ).second)
1408 return true;
1409
1410 return false;
1411}
1412
1413/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1414/// users are either of the following:
1415///
1416/// - Reduction PHIs
1417/// - PHIs outside the outer loop
1418/// - PHIs belonging to the latch of the outer loop
1419///
1420/// These conditions mean that we are only interested in the final value after
1421/// the inner loop.
1422static bool
1425 PHINode *LcssaReduction) {
1426 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1427 for (PHINode &PHI : InnerExit->phis()) {
1428 // The reduction LCSSA PHI will have only one incoming block, which comes
1429 // from the loop latch.
1430 if (PHI.getNumIncomingValues() > 1)
1431 return false;
1432 // The reduction LCSSA PHI's store user is rewritten by reduction2Memory();
1433 // skip its user-check but keep validating the remaining LCSSA PHIs.
1434 if (&PHI == LcssaReduction)
1435 continue;
1436 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1437 PHINode *PN = dyn_cast<PHINode>(U);
1438 if (!PN)
1439 return true;
1440 if (Reductions.count(PN))
1441 return false;
1442 BasicBlock *PB = PN->getParent();
1443 if (!OuterL->contains(PB))
1444 return false;
1445 return PB != OuterL->getLoopLatch();
1446 }))
1447 return false;
1448 }
1449 return true;
1450}
1451
1452// We currently support LCSSA PHI nodes in the outer loop exit, if their
1453// incoming values do not come from the outer loop latch or if the
1454// outer loop latch has a single predecessor. In that case, the value will
1455// be available if both the inner and outer loop conditions are true, which
1456// will still be true after interchanging. If we have multiple predecessor,
1457// that may not be the case, e.g. because the outer loop latch may be executed
1458// if the inner loop is not executed.
1459static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1460 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1461 for (PHINode &PHI : LoopNestExit->phis()) {
1462 for (Value *Incoming : PHI.incoming_values()) {
1463 Instruction *IncomingI = dyn_cast<Instruction>(Incoming);
1464 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1465 continue;
1466
1467 // The incoming value is defined in the outer loop latch. Currently we
1468 // only support that in case the outer loop latch has a single predecessor.
1469 // This guarantees that the outer loop latch is executed if and only if
1470 // the inner loop is executed (because tightlyNested() guarantees that the
1471 // outer loop header only branches to the inner loop or the outer loop
1472 // latch).
1473 // FIXME: We could weaken this logic and allow multiple predecessors,
1474 // if the values are produced outside the loop latch. We would need
1475 // additional logic to update the PHI nodes in the exit block as
1476 // well.
1477 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1478 return false;
1479 }
1480 }
1481 return true;
1482}
1483
1484/// The transform clones the inner latch's exit condition into the new latch
1485/// (see MoveInstructions in LoopInterchangeTransform::transform), but it does
1486/// not relocate PHI nodes. So if a PHI in the inner latch feeds that condition,
1487/// a later interchange can leave the cloned PHI with a stale incoming block,
1488/// producing invalid IR. Reject that case here.
1489///
1490/// For example, %p is a PHI in the inner latch and the inner loop's exit test
1491/// reads %p, so %p feeds the condition that would be cloned:
1492///
1493/// inner.latch:
1494/// %p = phi i64 [ %v, %subloop.latch ]
1495/// %ec = icmp eq i64 %iv, %p ; inner exit test reads %p
1496/// br i1 %ec, label %exit, label %inner.header
1497///
1498/// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1499/// multi-level loop nests.
1500static bool areInnerLoopLatchPHIsSupported(Loop *InnerLoop) {
1501 if (InnerLoop->getSubLoops().empty())
1502 return true;
1503
1504 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1505 auto *LatchBI = dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
1506 if (!LatchBI)
1507 return true;
1508 auto *CondI = dyn_cast<Instruction>(LatchBI->getCondition());
1509 if (!CondI)
1510 return true;
1511
1512 // Bail if a phi in the inner latch feeds the exit condition, walking operands
1513 // within the inner loop.
1515 Worklist.insert(CondI);
1516 for (unsigned I = 0; I < Worklist.size(); ++I) {
1517 Instruction *Cur = Worklist[I];
1518 if (isa<PHINode>(Cur) && Cur->getParent() == InnerLoopLatch)
1519 return false;
1520 for (Value *Op : Cur->operands())
1521 if (auto *OpI = dyn_cast<Instruction>(Op))
1522 if (InnerLoop->contains(OpI))
1523 Worklist.insert(OpI);
1524 }
1525 return true;
1526}
1527
1528bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1529 unsigned OuterLoopId,
1530 CharMatrix &DepMatrix) {
1531 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1532 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1533 << " and OuterLoopId = " << OuterLoopId
1534 << " due to dependence\n");
1535 ORE->emit([&]() {
1536 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1537 InnerLoop->getStartLoc(),
1538 InnerLoop->getHeader())
1539 << "Cannot interchange loops due to dependences.";
1540 });
1541 return false;
1542 }
1543 // Check if outer and inner loop contain legal instructions only.
1544 for (auto *BB : OuterLoop->blocks())
1545 for (Instruction &I : *BB) {
1546 // Loads and stores are checked separately, so we can skip them here.
1548 continue;
1549
1550 // We cannot ignore potential memory reads, e.g., loads inside the called
1551 // function.
1552 if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
1553 continue;
1554
1555 LLVM_DEBUG(
1556 dbgs()
1557 << "Loops contain instructions that cannot be safely interchanged\n");
1558 ORE->emit([&]() {
1559 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeInst",
1560 I.getDebugLoc(), I.getParent())
1561 << "Cannot interchange loops due to instruction that is "
1562 "potentially unsafe to interchange.";
1563 });
1564
1565 return false;
1566 }
1567
1568 if (!checkInductionsAndReductions(OuterLoop)) {
1569 LLVM_DEBUG(dbgs() << "Failed to find inner loop inductions or found "
1570 "unsupported reductions.\n");
1571 return false;
1572 }
1573
1574 if (!areInnerLoopLatchPHIsSupported(InnerLoop)) {
1575 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1576 ORE->emit([&]() {
1577 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1578 InnerLoop->getStartLoc(),
1579 InnerLoop->getHeader())
1580 << "Cannot interchange loops because unsupported PHI nodes found "
1581 "in inner loop latch.";
1582 });
1583 return false;
1584 }
1585
1586 // TODO: The loops could not be interchanged due to current limitations in the
1587 // transform module.
1588 if (currentLimitations()) {
1589 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1590 return false;
1591 }
1592
1593 // Check if the loops are tightly nested.
1594 if (!tightlyNested(OuterLoop, InnerLoop)) {
1595 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1596 ORE->emit([&]() {
1597 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1598 InnerLoop->getStartLoc(),
1599 InnerLoop->getHeader())
1600 << "Cannot interchange loops because they are not tightly "
1601 "nested.";
1602 });
1603 return false;
1604 }
1605
1606 // The LCSSA PHI for the reduction has passed checks before; its user
1607 // is a store instruction.
1608 PHINode *LcssaReduction = nullptr;
1609 assert(InnerReductions.size() <= 1 &&
1610 "So far we only support at most one reduction.");
1611 if (InnerReductions.size() == 1)
1612 LcssaReduction = InnerReductions[0].LcssaPhi;
1613
1614 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1615 LcssaReduction)) {
1616 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1617 ORE->emit([&]() {
1618 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1619 InnerLoop->getStartLoc(),
1620 InnerLoop->getHeader())
1621 << "Found unsupported PHI node in loop exit.";
1622 });
1623 return false;
1624 }
1625
1626 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1627 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1628 ORE->emit([&]() {
1629 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1630 OuterLoop->getStartLoc(),
1631 OuterLoop->getHeader())
1632 << "Found unsupported PHI node in loop exit.";
1633 });
1634 return false;
1635 }
1636
1637 if (any_of(OuterLoop->getLoopLatch()->phis(),
1638 [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1639 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1640 "value are supported.\n");
1641 ORE->emit([&]() {
1642 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1643 OuterLoop->getStartLoc(),
1644 OuterLoop->getHeader())
1645 << "Only outer loop latch PHI nodes with one incoming value are "
1646 "supported.";
1647 });
1648 return false;
1649 }
1650
1651 // Regarding def-use chains that begin at an LCSSA PHI in the inner loop exit
1652 // and end at any instruction in the outer loop latch, we currently support
1653 // only the case where the chain contains only PHI nodes. Since we already
1654 // call `tightlyNested()`, we know that if there is a def-use chain that we
1655 // don't support (i.e., a chain that contains a non-PHI user), then the
1656 // non-PHI user must be in the outer loop latch.
1657 if (InnerLoop->getExitBlock() != OuterLoop->getLoopLatch())
1658 for (PHINode &PHI : OuterLoop->getLoopLatch()->phis())
1659 if (any_of(PHI.users(), [](const User *U) { return !isa<PHINode>(U); })) {
1660 LLVM_DEBUG(dbgs() << "Outer loop latch PHI has a non-PHI user.\n");
1661 ORE->emit([&]() {
1662 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1663 OuterLoop->getStartLoc(),
1664 OuterLoop->getHeader())
1665 << "Cannot interchange loops because an outer loop latch PHI "
1666 "node has a non-PHI user.";
1667 });
1668 return false;
1669 }
1670
1671 return true;
1672}
1673
1674void CacheCostManager::computeIfUnitinialized() {
1675 if (CC.has_value())
1676 return;
1677
1678 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1679 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1680 // Obtain the loop vector returned from loop cache analysis beforehand,
1681 // and put each <Loop, index> pair into a map for constant time query
1682 // later. Indices in loop vector reprsent the optimal order of the
1683 // corresponding loop, e.g., given a loopnest with depth N, index 0
1684 // indicates the loop should be placed as the outermost loop and index N
1685 // indicates the loop should be placed as the innermost loop.
1686 //
1687 // For the old pass manager CacheCost would be null.
1688 if (*CC != nullptr)
1689 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1690 CostMap[Cost.first] = Idx;
1691}
1692
1693CacheCost *CacheCostManager::getCacheCost() {
1694 computeIfUnitinialized();
1695 return CC->get();
1696}
1697
1698const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1699 computeIfUnitinialized();
1700 return CostMap;
1701}
1702
1703/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1704/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1705/// return std::nullopt, which indicates we cannot determine the coefficient of
1706/// the addrec for \p L in \S.
1707/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1708/// do that.
1709static std::optional<const SCEV *>
1712 if (!AR) {
1713 if (SE.isLoopInvariant(S, L))
1714 return nullptr;
1715 return std::nullopt;
1716 }
1717
1718 if (!AR->isAffine()) {
1719 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1720 return std::nullopt;
1721 }
1722
1723 std::optional<const SCEV *> Coeff =
1724 getAddRecCoefficient(SE, AR->getStart(), L);
1725 if (!Coeff.has_value())
1726 return std::nullopt;
1727
1728 if (AR->getLoop() == L) {
1729 assert(!*Coeff && "Found more than one addrec for the same loop");
1730 Coeff = AR->getStepRecurrence(SE);
1731 }
1732 return Coeff;
1733}
1734
1735int LoopInterchangeProfitability::getInstrOrderCost() {
1736 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1737 for (BasicBlock *BB : InnerLoop->blocks()) {
1738 for (Instruction &Ins : *BB) {
1739 if (!isa<LoadInst, StoreInst>(&Ins))
1740 continue;
1741 const SCEV *Access = SE->getSCEV(getLoadStorePointerOperand(&Ins));
1742 const SCEV *BasePtr = SE->getPointerBase(Access);
1743 std::optional<const SCEV *> OuterCoeff =
1744 getAddRecCoefficient(*SE, Access, OuterLoop);
1745 std::optional<const SCEV *> InnerCoeff =
1746 getAddRecCoefficient(*SE, Access, InnerLoop);
1747
1748 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1749 !*InnerCoeff)
1750 continue;
1751
1752 // This heuristic assumes that a smaller step recurrence implies that the
1753 // induction variable corresponding to the loop is used in the inner
1754 // dimension of the array. Placing such a loop in the inner position would
1755 // be beneficial in terms of locality. If the array access is of the form
1756 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1757 // interchange, but we expect such cases to be rare.
1758 const SCEV *OuterStep = SE->getAbsExpr(*OuterCoeff, /*IsNSW=*/false);
1759 const SCEV *InnerStep = SE->getAbsExpr(*InnerCoeff, /*IsNSW=*/false);
1760 // If we find the inner induction after an outer induction e.g.
1761 //
1762 // for(int i=0;i<N;i++)
1763 // for(int j=0;j<N;j++)
1764 // A[i][j] = A[i-1][j-1]+k;
1765 //
1766 //
1767 // then it is a good order. If we find the outer induction after an inner
1768 // induction e.g.
1769 //
1770 // for(int i=0;i<N;i++)
1771 // for(int j=0;j<N;j++)
1772 // A[j][i] = A[j-1][i-1]+k;
1773 //
1774 // then it is a bad order.
1775 //
1776 // To avoid counting the same base pointers multiple times, we deduplicate
1777 // them by using a set of base pointers.
1778 if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, InnerStep, OuterStep))
1779 GoodBasePtrs.insert(BasePtr);
1780 else if (SE->isKnownPredicate(ICmpInst::ICMP_SLT, OuterStep, InnerStep))
1781 BadBasePtrs.insert(BasePtr);
1782 }
1783 }
1784
1785 int GoodOrder = GoodBasePtrs.size();
1786 int BadOrder = BadBasePtrs.size();
1787 return GoodOrder - BadOrder;
1788}
1789
1790std::optional<bool>
1791LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1792 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1793 // This is the new cost model returned from loop cache analysis.
1794 // A smaller index means the loop should be placed an outer loop, and vice
1795 // versa.
1796 auto InnerLoopIt = CostMap.find(InnerLoop);
1797 if (InnerLoopIt == CostMap.end())
1798 return std::nullopt;
1799 auto OuterLoopIt = CostMap.find(OuterLoop);
1800 if (OuterLoopIt == CostMap.end())
1801 return std::nullopt;
1802
1803 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1804 return std::nullopt;
1805 unsigned InnerIndex = InnerLoopIt->second;
1806 unsigned OuterIndex = OuterLoopIt->second;
1807 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1808 << ", OuterIndex = " << OuterIndex << "\n");
1809 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1810 "numbers to each loop");
1811 return std::optional<bool>(InnerIndex < OuterIndex);
1812}
1813
1814std::optional<bool>
1815LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1816 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1817 // good and bad order of induction variables in the instruction and allows
1818 // reordering if number of bad orders is more than good.
1819 int Cost = getInstrOrderCost();
1820 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1822 return std::optional<bool>(true);
1823
1824 return std::nullopt;
1825}
1826
1827/// Return true if we can vectorize the loop specified by \p LoopId.
1828static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1829 for (const auto &Dep : DepMatrix) {
1830 char Dir = Dep[LoopId];
1831 char DepType = Dep.back();
1832 assert((DepType == '<' || DepType == '*') &&
1833 "Unexpected element in dependency vector");
1834
1835 // There are no loop-carried dependencies.
1836 if (Dir == '=' || Dir == 'I')
1837 continue;
1838
1839 // DepType being '<' means that this direction vector represents a forward
1840 // dependency. In principle, a loop with '<' direction can be vectorized in
1841 // this case.
1842 if (Dir == '<' && DepType == '<')
1843 continue;
1844
1845 // We cannot prove that the loop is vectorizable.
1846 return false;
1847 }
1848 return true;
1849}
1850
1851std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1852 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1853 // If the outer loop cannot be vectorized, it is not profitable to move this
1854 // to inner position.
1855 if (!canVectorize(DepMatrix, OuterLoopId))
1856 return false;
1857
1858 // If the inner loop cannot be vectorized but the outer loop can be, then it
1859 // is profitable to interchange to enable inner loop parallelism.
1860 if (!canVectorize(DepMatrix, InnerLoopId))
1861 return true;
1862
1863 // If both the inner and the outer loop can be vectorized, it is necessary to
1864 // check the cost of each vectorized loop for profitability decision. At this
1865 // time we do not have a cost model to estimate them, so return nullopt.
1866 // TODO: Estimate the cost of vectorized loop when both the outer and the
1867 // inner loop can be vectorized.
1868 return std::nullopt;
1869}
1870
1871bool LoopInterchangeProfitability::isProfitable(
1872 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1873 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1874 // Do not consider loops with a backedge that isn't taken, e.g. an
1875 // unconditional branch true/false, as candidates for interchange.
1876 // TODO: when interchange is forced, we should probably also allow
1877 // interchange for these loops, and thus this logic should be moved just
1878 // below the cost-model ignore check below. But this check is done first
1879 // to avoid the issue in #163954.
1880 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1881 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1882 if (InnerBTC && InnerBTC->isZero()) {
1883 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1884 "single iteration loop\n");
1885 return false;
1886 }
1887 if (OuterBTC && OuterBTC->isZero()) {
1888 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1889 "single iteration loop\n");
1890 return false;
1891 }
1892
1893 // Return true if interchange is forced and the cost-model ignored.
1894 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1895 return true;
1897 "Duplicate rules and option 'ignore' are not allowed");
1898
1899 // isProfitable() is structured to avoid endless loop interchange. If the
1900 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1901 // decide the profitability then, profitability check will stop and return the
1902 // analysis result. If it failed to determine it (e.g., cache analysis failed
1903 // to analyze the loopnest due to delinearization issues) then go ahead the
1904 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1905 // Likewise, if it failed to analysis the profitability then only, the last
1906 // rule (isProfitableForVectorization by default) will decide.
1907 std::optional<bool> shouldInterchange;
1908 for (RuleTy RT : Profitabilities) {
1909 switch (RT) {
1910 case RuleTy::PerLoopCacheAnalysis: {
1911 CacheCost *CC = CCM.getCacheCost();
1912 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1913 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1914 break;
1915 }
1916 case RuleTy::PerInstrOrderCost:
1917 shouldInterchange = isProfitablePerInstrOrderCost();
1918 break;
1919 case RuleTy::ForVectorization:
1920 shouldInterchange =
1921 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1922 break;
1923 case RuleTy::Ignore:
1924 llvm_unreachable("Option 'ignore' is not supported with other options");
1925 break;
1926 }
1927
1928 // If this rule could determine the profitability, don't call subsequent
1929 // rules.
1930 if (shouldInterchange.has_value())
1931 break;
1932 }
1933
1934 if (!shouldInterchange.has_value()) {
1935 ORE->emit([&]() {
1936 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1937 InnerLoop->getStartLoc(),
1938 InnerLoop->getHeader())
1939 << "Insufficient information to calculate the cost of loop for "
1940 "interchange.";
1941 });
1942 return false;
1943 } else if (!shouldInterchange.value()) {
1944 ORE->emit([&]() {
1945 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1946 InnerLoop->getStartLoc(),
1947 InnerLoop->getHeader())
1948 << "Interchanging loops is not considered to improve cache "
1949 "locality nor vectorization.";
1950 });
1951 return false;
1952 }
1953 return true;
1954}
1955
1956void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1957 Loop *InnerLoop) {
1958 for (Loop *L : *OuterLoop)
1959 if (L == InnerLoop) {
1960 OuterLoop->removeChildLoop(L);
1961 return;
1962 }
1963 llvm_unreachable("Couldn't find loop");
1964}
1965
1966/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1967/// new inner and outer loop after interchanging: NewInner is the original
1968/// outer loop and NewOuter is the original inner loop.
1969///
1970/// Before interchanging, we have the following structure
1971/// Outer preheader
1972// Outer header
1973// Inner preheader
1974// Inner header
1975// Inner body
1976// Inner latch
1977// outer bbs
1978// Outer latch
1979//
1980// After interchanging:
1981// Inner preheader
1982// Inner header
1983// Outer preheader
1984// Outer header
1985// Inner body
1986// outer bbs
1987// Outer latch
1988// Inner latch
1989void LoopInterchangeTransform::restructureLoops(
1990 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1991 BasicBlock *OrigOuterPreHeader) {
1992 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1993 // The original inner loop preheader moves from the new inner loop to
1994 // the parent loop, if there is one.
1995 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1996 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1997
1998 // Switch the loop levels.
1999 if (OuterLoopParent) {
2000 // Remove the loop from its parent loop.
2001 removeChildLoop(OuterLoopParent, NewInner);
2002 removeChildLoop(NewInner, NewOuter);
2003 OuterLoopParent->addChildLoop(NewOuter);
2004 } else {
2005 removeChildLoop(NewInner, NewOuter);
2006 LI->changeTopLevelLoop(NewInner, NewOuter);
2007 }
2008 while (!NewOuter->isInnermost())
2009 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
2010 NewOuter->addChildLoop(NewInner);
2011
2012 // BBs from the original inner loop.
2013 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
2014
2015 // Add BBs from the original outer loop to the original inner loop (excluding
2016 // BBs already in inner loop)
2017 for (BasicBlock *BB : NewInner->blocks())
2018 if (LI->getLoopFor(BB) == NewInner)
2019 NewOuter->addBlockEntry(BB);
2020
2021 // Now remove inner loop header and latch from the new inner loop and move
2022 // other BBs (the loop body) to the new inner loop.
2023 BasicBlock *OuterHeader = NewOuter->getHeader();
2024 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
2025 for (BasicBlock *BB : OrigInnerBBs) {
2026 // Nothing will change for BBs in child loops.
2027 if (LI->getLoopFor(BB) != NewOuter)
2028 continue;
2029 // Remove the new outer loop header and latch from the new inner loop.
2030 if (BB == OuterHeader || BB == OuterLatch)
2031 NewInner->removeBlockFromLoop(BB);
2032 else
2033 LI->changeLoopFor(BB, NewInner);
2034 }
2035
2036 // The preheader of the original outer loop becomes part of the new
2037 // outer loop.
2038 NewOuter->addBlockEntry(OrigOuterPreHeader);
2039 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
2040
2041 // Tell SE that we move the loops around.
2042 SE->forgetLoop(NewOuter);
2043}
2044
2045/// User can write, or optimizers can generate the reduction for inner loop.
2046/// To make the interchange valid, apply Reduction2Mem by moving the
2047/// initializer and store instructions into the inner loop. So far we only
2048/// handle cases where the reduction variable is initialized to a constant.
2049/// For example, below code:
2050///
2051/// loop:
2052/// re = phi<0.0, next>
2053/// next = re op ...
2054/// endloop
2055/// reduc_sum = phi<next> // lcssa phi
2056/// MEM_REF[idx] = reduc_sum // LcssaStore
2057///
2058/// is transformed into:
2059///
2060/// loop:
2061/// tmp = MEM_REF[idx];
2062/// new_var = !first_iteration ? tmp : 0.0;
2063/// next = new_var op ...
2064/// MEM_REF[idx] = next; // after moving
2065/// endloop
2066///
2067/// In this way the initial const is used in the first iteration of loop.
2068void LoopInterchangeTransform::reduction2Memory() {
2070 LIL.getInnerReductions();
2071
2072 assert(InnerReductions.size() == 1 &&
2073 "So far we only support at most one reduction.");
2074
2075 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
2076 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2077 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
2078
2079 // Check if it's the first iteration.
2080 LLVMContext &Context = InnerLoopHeader->getContext();
2081 PHINode *FirstIter =
2082 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
2083 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
2084 InnerLoop->getLoopPreheader());
2085 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
2086 InnerLoop->getLoopLatch());
2087 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
2088
2089 // When the reduction is initialized from a constant value, we need to add
2090 // a stmt loading from the memory object to target basic block in inner
2091 // loop.
2092 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
2093
2094 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
2095 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
2096
2097 // Replace all uses of the reduction variable with a new variable.
2098 SR.Reduction->replaceAllUsesWith(NewVar);
2099
2100 // Move store instruction into inner loop, just after reduction next's
2101 // definition.
2102 SR.LcssaStore->setOperand(0, SR.Next);
2103 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
2104}
2105
2106bool LoopInterchangeTransform::transform(
2107 ArrayRef<Instruction *> DropNoWrapInsts,
2108 ArrayRef<Instruction *> DropNoInfInsts) {
2109 bool Transformed = false;
2110
2112 LIL.getInnerReductions();
2113 if (InnerReductions.size() == 1)
2114 reduction2Memory();
2115
2116 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2117 auto &InductionPHIs = LIL.getInnerLoopInductions();
2118 if (InductionPHIs.empty()) {
2119 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
2120 return false;
2121 }
2122
2123 SmallVector<Instruction *, 8> InnerIndexVarList;
2124 for (PHINode *CurInductionPHI : InductionPHIs) {
2125 Instruction *IncomingValue = dyn_cast<Instruction>(
2126 CurInductionPHI->getIncomingValueForBlock(InnerLoop->getLoopLatch()));
2127 assert(IncomingValue &&
2128 "Incoming value from loop latch isn't an instruction");
2129 if (is_contained(InductionPHIs, IncomingValue))
2130 continue;
2131 InnerIndexVarList.push_back(IncomingValue);
2132 }
2133
2134 // Create a new latch block for the inner loop. We split at the
2135 // current latch's terminator and then move the condition and all
2136 // operands that are not either loop-invariant or the induction PHI into the
2137 // new latch block.
2138 BasicBlock *NewLatch =
2139 SplitBlock(InnerLoop->getLoopLatch(),
2140 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2141
2142 SmallSetVector<Instruction *, 4> WorkList;
2143 unsigned i = 0;
2144 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2145 for (; i < WorkList.size(); i++) {
2146 // PHI nodes cannot be cloned and moved here; the legality check
2147 // (areInnerLoopLatchPHIsSupported) ensures none reach the worklist.
2148 assert(!isa<PHINode>(WorkList[i]) &&
2149 "MoveInstructions does not support PHI nodes");
2150 // Duplicate instruction and move it to the new latch. Update uses that
2151 // have been moved.
2152 Instruction *NewI = WorkList[i]->clone();
2153 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
2154 assert(!NewI->mayHaveSideEffects() &&
2155 "Moving instructions with side-effects may change behavior of "
2156 "the loop nest!");
2157 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
2158 Instruction *UserI = cast<Instruction>(U.getUser());
2159 if (!InnerLoop->contains(UserI->getParent()) ||
2160 UserI->getParent() == NewLatch ||
2161 llvm::is_contained(InductionPHIs, UserI))
2162 U.set(NewI);
2163 }
2164 // Add operands of moved instruction to the worklist, except if they are
2165 // outside the inner loop or are the induction PHI.
2166 for (Value *Op : WorkList[i]->operands()) {
2168 if (!OpI || this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
2169 llvm::is_contained(InductionPHIs, OpI))
2170 continue;
2171 WorkList.insert(OpI);
2172 }
2173 }
2174 };
2175
2176 // FIXME: Should we interchange when we have a constant condition?
2179 ->getCondition());
2180 if (CondI)
2181 WorkList.insert(CondI);
2182 MoveInstructions();
2183 for (Instruction *InnerIndexVar : InnerIndexVarList)
2184 WorkList.insert(cast<Instruction>(InnerIndexVar));
2185 MoveInstructions();
2186
2187 // Ensure the inner loop phi nodes have a separate basic block.
2188 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2189 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2190 InnerLoopHeader->getTerminator()) {
2191 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2192 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2193 }
2194
2195 // Instructions in the original inner loop preheader may depend on values
2196 // defined in the outer loop header. Move them there, because the original
2197 // inner loop preheader will become the entry into the interchanged loop nest.
2198 // Currently we move all instructions and rely on LICM to move invariant
2199 // instructions outside the loop nest.
2200 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2201 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2202
2203 if (InnerLoopPreHeader != OuterLoopHeader) {
2204 // Eliminate PHIs in the inner-loop preheader.
2205 for (PHINode &P : make_early_inc_range(InnerLoopPreHeader->phis())) {
2206 assert(all_equal(P.incoming_values()) &&
2207 "Expected equivalent incoming values in inner loop preheader");
2208 P.replaceAllUsesWith(P.getIncomingValue(0));
2209 P.eraseFromParent();
2210 }
2211 for (Instruction &I :
2212 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2213 std::prev(InnerLoopPreHeader->end()))))
2214 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2215 }
2216
2217 Transformed |= adjustLoopLinks();
2218 if (!Transformed) {
2219 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2220 return false;
2221 }
2222
2223 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2224 // calculations.
2225 for (Instruction *Reduction : DropNoWrapInsts) {
2226 Reduction->setHasNoSignedWrap(false);
2227 Reduction->setHasNoUnsignedWrap(false);
2228 }
2229 for (Instruction *I : DropNoInfInsts)
2230 I->setHasNoInfs(false);
2231
2232 return true;
2233}
2234
2235/// \brief Move all instructions except the terminator from FromBB right before
2236/// InsertBefore
2237static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2238 BasicBlock *ToBB = InsertBefore->getParent();
2239
2240 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2241 FromBB->getTerminator()->getIterator());
2242}
2243
2244/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2245static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2246 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2247 // from BB1 afterwards.
2248 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2249 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2250 for (Instruction *I : TempInstrs)
2251 I->removeFromParent();
2252
2253 // Move instructions from BB2 to BB1.
2254 moveBBContents(BB2, BB1->getTerminator());
2255
2256 // Move instructions from TempInstrs to BB2.
2257 for (Instruction *I : TempInstrs)
2258 I->insertBefore(BB2->getTerminator()->getIterator());
2259}
2260
2261// Update BI to jump to NewBB instead of OldBB. Records updates to the
2262// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2263// \p OldBB is exactly once in BI's successor list.
2264static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2265 BasicBlock *NewBB,
2266 std::vector<DominatorTree::UpdateType> &DTUpdates,
2267 bool MustUpdateOnce = true) {
2268 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2269 "BI must jump to OldBB exactly once.");
2270 bool Changed = false;
2271 for (Use &Op : Term->operands())
2272 if (Op == OldBB) {
2273 Op.set(NewBB);
2274 Changed = true;
2275 }
2276
2277 if (Changed) {
2278 DTUpdates.push_back(
2279 {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2280 DTUpdates.push_back(
2281 {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2282 }
2283 assert(Changed && "Expected a successor to be updated");
2284}
2285
2286// Move Lcssa PHIs to the right place.
2287static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2288 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2289 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2290 Loop *InnerLoop, LoopInfo *LI) {
2291
2292 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2293 // defined either in the header or latch. Those blocks will become header and
2294 // latch of the new outer loop, and the only possible users can PHI nodes
2295 // in the exit block of the loop nest or the outer loop header (reduction
2296 // PHIs, in that case, the incoming value must be defined in the inner loop
2297 // header). We can just substitute the user with the incoming value and remove
2298 // the PHI.
2299 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2300 assert(P.getNumIncomingValues() == 1 &&
2301 "Only loops with a single exit are supported!");
2302
2303 Value *IncomingValue = P.getIncomingValueForBlock(InnerLatch);
2304 auto *IncI = dyn_cast<Instruction>(IncomingValue);
2305 if (!IncI) {
2306 // If the incoming value is not an instruction, it must be loop invariant.
2307 // In that case, we can just replace the PHI with the incoming value and
2308 // remove the PHI.
2309 assert(InnerLoop->isLoopInvariant(IncomingValue) &&
2310 "Expected non-instruction incoming value to be loop invariant");
2311 P.replaceAllUsesWith(IncomingValue);
2312 P.eraseFromParent();
2313 continue;
2314 }
2315
2316 // In case of multi-level nested loops, follow LCSSA to find the incoming
2317 // value defined from the innermost loop.
2318 auto *IncIInnerMost = dyn_cast<Instruction>(followLCSSA(IncI));
2319 // Skip phis when:
2320 // - they are not an instruction, e.g. incoming values are constants.
2321 // - Incomming values from the inner loop body, excluding the header and
2322 // latch.
2323 if (!IncIInnerMost || (IncIInnerMost->getParent() != InnerLatch &&
2324 IncIInnerMost->getParent() != InnerHeader))
2325 continue;
2326
2327 assert(all_of(P.users(),
2328 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2329 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2330 IncI->getParent() == InnerHeader) ||
2331 cast<PHINode>(U)->getParent() == OuterExit;
2332 }) &&
2333 "Can only replace phis iff the uses are in the loop nest exit or "
2334 "the incoming value is defined in the inner header (it will "
2335 "dominate all loop blocks after interchanging)");
2336 P.replaceAllUsesWith(IncI);
2337 P.eraseFromParent();
2338 }
2339
2340 SmallVector<PHINode *, 8> LcssaInnerExit(
2341 llvm::make_pointer_range(InnerExit->phis()));
2342
2343 SmallVector<PHINode *, 8> LcssaInnerLatch(
2344 llvm::make_pointer_range(InnerLatch->phis()));
2345
2346 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2347 // If a PHI node has users outside of InnerExit, it has a use outside the
2348 // interchanged loop and we have to preserve it. We move these to
2349 // InnerLatch, which will become the new exit block for the innermost
2350 // loop after interchanging.
2351 for (PHINode *P : LcssaInnerExit)
2352 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2353
2354 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2355 // and we have to move them to the new inner latch.
2356 for (PHINode *P : LcssaInnerLatch)
2357 P->moveBefore(InnerExit->getFirstNonPHIIt());
2358
2359 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2360 // incoming values defined in the outer loop, we have to add a new PHI
2361 // in the inner loop latch, which became the exit block of the outer loop,
2362 // after interchanging.
2363 if (OuterExit) {
2364 for (PHINode &P : OuterExit->phis()) {
2365 if (P.getNumIncomingValues() != 1)
2366 continue;
2367 // Skip Phis with incoming values defined in the inner loop. Those should
2368 // already have been updated.
2369 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2370 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2371 continue;
2372
2373 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2374 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2375 NewPhi->setIncomingBlock(0, OuterLatch);
2376 // We might have incoming edges from other BBs, i.e., the original outer
2377 // header.
2378 for (auto *Pred : predecessors(InnerLatch)) {
2379 if (Pred == OuterLatch)
2380 continue;
2381 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2382 }
2383 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2384 P.setIncomingValue(0, NewPhi);
2385 }
2386 }
2387
2388 // Now adjust the incoming blocks for the LCSSA PHIs.
2389 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2390 // with the new latch.
2391 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2392}
2393
2394/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2395/// block: the outer loop latch block does not need to be exit block of the
2396/// inner loop. Consider a loop that was in LCSSA form, but then some
2397/// transformation like loop-unswitch comes along and creates an empty block,
2398/// where BB5 in this example is the outer loop latch block:
2399///
2400/// BB4:
2401/// br label %BB5
2402/// BB5:
2403/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2404/// br outer.header
2405///
2406/// Interchange then brings it in LCSSA form again resulting in this chain of
2407/// single-input phi nodes:
2408///
2409/// BB4:
2410/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2411/// br label %BB5
2412/// BB5:
2413/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2414///
2415/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2416/// use before the def if we don't check this. The solution is to simplify
2417/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2418///
2419static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2420 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2421 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2422
2423 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2424 if (OuterLoopLatch == InnerLoopExit)
2425 return;
2426
2427 // Collect and remove phis in non-exit blocks if they have 1 input.
2429 llvm::make_pointer_range(OuterLoopLatch->phis()));
2430 for (PHINode *Phi : Phis) {
2431 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2432 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2433 << "\n");
2434 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2435 Phi->eraseFromParent();
2436 }
2437}
2438
2439bool LoopInterchangeTransform::adjustLoopBranches() {
2440 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2441 std::vector<DominatorTree::UpdateType> DTUpdates;
2442
2443 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2444 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2445
2446 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2447 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2448 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2449
2450 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2451
2452 // Ensure that both preheaders do not contain PHI nodes and have single
2453 // predecessors. This allows us to move them easily. We use
2454 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2455 // preheaders do not satisfy those conditions.
2456 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2457 !OuterLoopPreHeader->getUniquePredecessor())
2458 OuterLoopPreHeader =
2459 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2460 if (InnerLoopPreHeader == OuterLoop->getHeader())
2461 InnerLoopPreHeader =
2462 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2463
2464 // Adjust the loop preheader
2465 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2466 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2467 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2468 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2469 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2470 BasicBlock *InnerLoopLatchPredecessor =
2471 InnerLoopLatch->getUniquePredecessor();
2472 BasicBlock *InnerLoopLatchSuccessor;
2473 BasicBlock *OuterLoopLatchSuccessor;
2474
2475 CondBrInst *OuterLoopLatchBI =
2476 dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
2477 CondBrInst *InnerLoopLatchBI =
2478 dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
2479 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2480 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2481
2482 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2483 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2484 !InnerLoopHeaderBI)
2485 return false;
2486
2487 Instruction *InnerLoopLatchPredecessorBI =
2488 InnerLoopLatchPredecessor->getTerminator();
2489 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2490
2491 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2492 return false;
2493 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2494 if (!InnerLoopHeaderSuccessor)
2495 return false;
2496
2497 // Adjust Loop Preheader and headers.
2498 // The branches in the outer loop predecessor and the outer loop header can
2499 // be unconditional branches or conditional branches with duplicates. Consider
2500 // this when updating the successors.
2501 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2502 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2503 // The outer loop header might or might not branch to the outer latch.
2504 // We are guaranteed to branch to the inner loop preheader.
2505 if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
2506 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2507 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2508 DTUpdates,
2509 /*MustUpdateOnce=*/false);
2510 }
2511 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2512 InnerLoopHeaderSuccessor, DTUpdates,
2513 /*MustUpdateOnce=*/false);
2514
2515 // Adjust reduction PHI's now that the incoming block has changed.
2516 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2517 OuterLoopHeader);
2518
2519 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2520 OuterLoopPreHeader, DTUpdates);
2521
2522 // -------------Adjust loop latches-----------
2523 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2524 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2525 else
2526 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2527
2528 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2529 InnerLoopLatchSuccessor, DTUpdates);
2530
2531 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2532 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2533 else
2534 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2535
2536 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2537 OuterLoopLatchSuccessor, DTUpdates);
2538 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2539 DTUpdates);
2540
2541 DT->applyUpdates(DTUpdates);
2542 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2543 OuterLoopPreHeader);
2544
2545 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2546 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2547 InnerLoop, LI);
2548 // For PHIs in the exit block of the outer loop, outer's latch has been
2549 // replaced by Inners'.
2550 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2551
2552 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2553 // Now update the reduction PHIs in the inner and outer loop headers.
2554 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2555 for (PHINode &PHI : InnerLoopHeader->phis())
2556 if (OuterInnerReductions.contains(&PHI))
2557 InnerLoopPHIs.push_back(&PHI);
2558
2559 for (PHINode &PHI : OuterLoopHeader->phis())
2560 if (OuterInnerReductions.contains(&PHI))
2561 OuterLoopPHIs.push_back(&PHI);
2562
2563 // Now move the remaining reduction PHIs from outer to inner loop header and
2564 // vice versa. The PHI nodes must be part of a reduction across the inner and
2565 // outer loop and all the remains to do is and updating the incoming blocks.
2566 for (PHINode *PHI : OuterLoopPHIs) {
2567 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2568 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2569 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2570 }
2571 for (PHINode *PHI : InnerLoopPHIs) {
2572 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2573 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2574 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2575 }
2576
2577 // Update the incoming blocks for moved PHI nodes.
2578 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2579 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2580 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2581 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2582
2583 // Values defined in the outer loop header could be used in the inner loop
2584 // latch. In that case, we need to create LCSSA phis for them, because after
2585 // interchanging they will be defined in the new inner loop and used in the
2586 // new outer loop.
2587 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2588 for (Instruction &I :
2589 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2590 MayNeedLCSSAPhis.push_back(&I);
2591 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2592
2593 return true;
2594}
2595
2596bool LoopInterchangeTransform::adjustLoopLinks() {
2597 // Adjust all branches in the inner and outer loop.
2598 bool Changed = adjustLoopBranches();
2599 if (Changed) {
2600 // We have interchanged the preheaders so we need to interchange the data in
2601 // the preheaders as well. This is because the content of the inner
2602 // preheader was previously executed inside the outer loop.
2603 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2604 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2605 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2606 }
2607 return Changed;
2608}
2609
2613 LPMUpdater &U) {
2614 Function &F = *LN.getParent();
2615 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2616
2618
2619 // Ensure minimum depth of the loop nest to do the interchange.
2620 if (!hasSupportedLoopDepth(LoopList, ORE))
2621 return PreservedAnalyses::all();
2622 // Ensure computable loop nest.
2623 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2624 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2625 return PreservedAnalyses::all();
2626 }
2627
2628 ORE.emit([&]() {
2629 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2632 << "Computed dependence info, invoking the transform.";
2633 });
2634
2635 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2636 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2637 return PreservedAnalyses::all();
2638 U.markLoopNestChanged(true);
2640}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
DXIL Resource Access
#define DEBUG_TYPE
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:362
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:253
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static void updateSuccessor(Instruction *Term, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static cl::opt< unsigned int > MaxMemInstrRatio("loop-interchange-max-mem-instr-ratio", cl::init(4), cl::Hidden, cl::desc("Maximum number of load/store instructions squared in relation to " "the total number of instructions. Higher value may lead to more " "interchanges at the cost of compile-time"))
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static bool areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
We currently only support LCSSA PHI nodes in the inner loop exit if their users are either of the fol...
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool areInnerLoopLatchPHIsSupported(Loop *InnerLoop)
The transform clones the inner latch's exit condition into the new latch (see MoveInstructions in Loo...
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts, SmallVectorImpl< Instruction * > &HasNoInfInsts)
static std::optional< const SCEV * > getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L)
If \S contains an affine addrec for L, return the step recurrence of it.
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
static bool processLoop(Loop &L, const AArch64Subtarget &ST, DataLayout DL)
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
static LLVM_ABI std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, ArrayRef< const SCEVPredicate * > NoWrapPreds={}, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
void changeLoopFor(const BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:659
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:67
StringRef getName() const
Definition LoopInfo.h:407
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getAbsExpr(const SCEV *Op, bool IsNSW)
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, SCEVUse LHS, SCEVUse RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:128
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:369
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:163
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:185
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto map_range(ContainerTy &&C, FuncTy F)
Return a range that applies F to the elements of C.
Definition STLExtras.h:365
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2165
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
LLVM_ABI PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...