LLVM 23.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
42#include "llvm/Support/Debug.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
60 "loop-interchange-threshold", cl::init(0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
63// Maximum number of load-stores that can be handled in the dependency matrix.
65 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
67 "Maximum number of load-store instructions that should be handled "
68 "in the dependency matrix. Higher value may lead to more interchanges "
69 "at the cost of compile-time"));
70
71namespace {
72
74
75/// A list of direction vectors. Each entry represents a direction vector
76/// corresponding to one or more dependencies existing in the loop nest. The
77/// length of all direction vectors is equal and is N + 1, where N is the depth
78/// of the loop nest. The first N elements correspond to the dependency
79/// direction of each N loops. The last one indicates whether this entry is
80/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
81/// is defined in LoopAccessAnalysis.
82// TODO: Check if we can use a sparse matrix here.
83using CharMatrix = std::vector<std::vector<char>>;
84
85/// Types of rules used in profitability check.
86enum class RuleTy {
87 PerLoopCacheAnalysis,
88 PerInstrOrderCost,
89 ForVectorization,
90 Ignore
91};
92
93} // end anonymous namespace
94
95// Minimum loop depth supported.
97 "loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden,
98 cl::desc("Minimum depth of loop nest considered for the transform"));
99
100// Maximum loop depth supported.
102 "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
103 cl::desc("Maximum depth of loop nest considered for the transform"));
104
105// We prefer cache cost to vectorization by default.
107 "loop-interchange-profitabilities", cl::ZeroOrMore,
109 cl::desc("List of profitability heuristics to be used. They are applied in "
110 "the given order"),
111 cl::list_init<RuleTy>({RuleTy::PerLoopCacheAnalysis,
112 RuleTy::PerInstrOrderCost,
113 RuleTy::ForVectorization}),
114 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
115 "Prioritize loop cache cost"),
116 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
117 "Prioritize the IVs order of each instruction"),
118 clEnumValN(RuleTy::ForVectorization, "vectorize",
119 "Prioritize vectorization"),
120 clEnumValN(RuleTy::Ignore, "ignore",
121 "Ignore profitability, force interchange (does not "
122 "work with other options)")));
123
124// Support for the inner-loop reduction pattern.
126 "loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden,
127 cl::desc("Support for the inner-loop reduction pattern."));
128
129#ifndef NDEBUG
132 for (RuleTy Rule : Rules) {
133 if (!Set.insert(Rule).second)
134 return false;
135 if (Rule == RuleTy::Ignore)
136 return false;
137 }
138 return true;
139}
140
141static void printDepMatrix(CharMatrix &DepMatrix) {
142 for (auto &Row : DepMatrix) {
143 // Drop the last element because it is a flag indicating whether this is
144 // forward dependency or not, which doesn't affect the legality check.
145 for (char D : drop_end(Row))
146 LLVM_DEBUG(dbgs() << D << " ");
147 LLVM_DEBUG(dbgs() << "\n");
148 }
149}
150
151/// Return true if \p Src appears before \p Dst in the same basic block.
152/// Precondition: \p Src and \Dst are distinct instructions within the same
153/// basic block.
154static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
155 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
156 "Expected Src and Dst to be different instructions in the same BB");
157
158 bool FoundSrc = false;
159 for (const Instruction &I : *(Src->getParent())) {
160 if (&I == Src) {
161 FoundSrc = true;
162 continue;
163 }
164 if (&I == Dst)
165 return FoundSrc;
166 }
167
168 llvm_unreachable("Dst not found");
169}
170#endif
171
172static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
173 Loop *L, DependenceInfo *DI,
174 ScalarEvolution *SE,
177
178 ValueVector MemInstr;
179
180 // For each block.
181 for (BasicBlock *BB : L->blocks()) {
182 // Scan the BB and collect legal loads and stores.
183 for (Instruction &I : *BB) {
184 if (!isa<Instruction>(I))
185 return false;
186 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
187 if (!Ld->isSimple())
188 return false;
189 MemInstr.push_back(&I);
190 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
191 if (!St->isSimple())
192 return false;
193 MemInstr.push_back(&I);
194 }
195 }
196 }
197
198 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
199 << " Loads and Stores to analyze\n");
200 if (MemInstr.size() > MaxMemInstrCount) {
201 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
202 << MaxMemInstrCount << " load/stores in a loop\n");
203 ORE->emit([&]() {
204 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
205 L->getStartLoc(), L->getHeader())
206 << "Number of loads/stores exceeded, the supported maximum "
207 "can be increased with option "
208 "-loop-interchange-maxmeminstr-count.";
209 });
210 return false;
211 }
212 ValueVector::iterator I, IE, J, JE;
213
214 // Manage direction vectors that are already seen. Map each direction vector
215 // to an index of DepMatrix at which it is stored.
217
218 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
219 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
220 std::vector<char> Dep;
223 // Ignore Input dependencies.
224 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
225 continue;
226 // Track Output, Flow, and Anti dependencies.
227 if (auto D = DI->depends(Src, Dst)) {
228 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
229 // If the direction vector is negative, normalize it to
230 // make it non-negative.
231 if (D->normalize(SE))
232 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
233 LLVM_DEBUG(StringRef DepType =
234 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
235 dbgs() << "Found " << DepType
236 << " dependency between Src and Dst\n"
237 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
238 unsigned Levels = D->getLevels();
239 char Direction;
240 for (unsigned II = 1; II <= Levels; ++II) {
241 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
242 // or `=`, for which we don't have an equivalent representation, so
243 // that the conservative approximation is necessary. The same goes for
244 // `DVEntry::GE`.
245 // TODO: Use of fine-grained expressions allows for more accurate
246 // analysis.
247 unsigned Dir = D->getDirection(II);
248 if (Dir == Dependence::DVEntry::LT)
249 Direction = '<';
250 else if (Dir == Dependence::DVEntry::GT)
251 Direction = '>';
252 else if (Dir == Dependence::DVEntry::EQ)
253 Direction = '=';
254 else
255 Direction = '*';
256 Dep.push_back(Direction);
257 }
258
259 // If the Dependence object doesn't have any information, fill the
260 // dependency vector with '*'.
261 if (D->isConfused()) {
262 assert(Dep.empty() && "Expected empty dependency vector");
263 Dep.assign(Level, '*');
264 }
265
266 while (Dep.size() != Level) {
267 Dep.push_back('I');
268 }
269
270 // If all the elements of any direction vector have only '*', legality
271 // can't be proven. Exit early to save compile time.
272 if (all_of(Dep, equal_to('*'))) {
273 ORE->emit([&]() {
274 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
275 L->getStartLoc(), L->getHeader())
276 << "All loops have dependencies in all directions.";
277 });
278 return false;
279 }
280
281 // Test whether the dependency is forward or not.
282 bool IsKnownForward = true;
283 if (Src->getParent() != Dst->getParent()) {
284 // In general, when Src and Dst are in different BBs, the execution
285 // order of them within a single iteration is not guaranteed. Treat
286 // conservatively as not-forward dependency in this case.
287 IsKnownForward = false;
288 } else {
289 // Src and Dst are in the same BB. If they are the different
290 // instructions, Src should appear before Dst in the BB as they are
291 // stored to MemInstr in that order.
292 assert((Src == Dst || inThisOrder(Src, Dst)) &&
293 "Unexpected instructions");
294
295 // If the Dependence object is reversed (due to normalization), it
296 // represents the dependency from Dst to Src, meaning it is a backward
297 // dependency. Otherwise it should be a forward dependency.
298 bool IsReversed = D->getSrc() != Src;
299 if (IsReversed)
300 IsKnownForward = false;
301 }
302
303 // Initialize the last element. Assume forward dependencies only; it
304 // will be updated later if there is any non-forward dependency.
305 Dep.push_back('<');
306
307 // The last element should express the "summary" among one or more
308 // direction vectors whose first N elements are the same (where N is
309 // the depth of the loop nest). Hence we exclude the last element from
310 // the Seen map.
311 auto [Ite, Inserted] = Seen.try_emplace(
312 StringRef(Dep.data(), Dep.size() - 1), DepMatrix.size());
313
314 // Make sure we only add unique entries to the dependency matrix.
315 if (Inserted)
316 DepMatrix.push_back(Dep);
317
318 // If we cannot prove that this dependency is forward, change the last
319 // element of the corresponding entry. Since a `[... *]` dependency
320 // includes a `[... <]` dependency, we do not need to keep both and
321 // change the existing entry instead.
322 if (!IsKnownForward)
323 DepMatrix[Ite->second].back() = '*';
324 }
325 }
326 }
327
328 return true;
329}
330
331// A loop is moved from index 'from' to an index 'to'. Update the Dependence
332// matrix by exchanging the two columns.
333static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
334 unsigned ToIndx) {
335 for (auto &Row : DepMatrix)
336 std::swap(Row[ToIndx], Row[FromIndx]);
337}
338
339// Check if a direction vector is lexicographically positive. Return true if it
340// is positive, nullopt if it is "zero", otherwise false.
341// [Theorem] A permutation of the loops in a perfect nest is legal if and only
342// if the direction matrix, after the same permutation is applied to its
343// columns, has no ">" direction as the leftmost non-"=" direction in any row.
344static std::optional<bool>
345isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
346 for (unsigned char Direction : DV.slice(Begin, End - Begin)) {
347 if (Direction == '<')
348 return true;
349 if (Direction == '>' || Direction == '*')
350 return false;
351 }
352 return std::nullopt;
353}
354
355// Checks if it is legal to interchange 2 loops.
356static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
357 unsigned InnerLoopId,
358 unsigned OuterLoopId) {
359 unsigned NumRows = DepMatrix.size();
360 std::vector<char> Cur;
361 // For each row check if it is valid to interchange.
362 for (unsigned Row = 0; Row < NumRows; ++Row) {
363 // Create temporary DepVector check its lexicographical order
364 // before and after swapping OuterLoop vs InnerLoop
365 Cur = DepMatrix[Row];
366
367 // If the surrounding loops already ensure that the direction vector is
368 // lexicographically positive, nothing within the loop will be able to break
369 // the dependence. In such a case we can skip the subsequent check.
370 if (isLexicographicallyPositive(Cur, 0, OuterLoopId) == true)
371 continue;
372
373 // Check if the direction vector is lexicographically positive (or zero)
374 // for both before/after exchanged. Ignore the last element because it
375 // doesn't affect the legality.
376 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
377 return false;
378 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
379 if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
380 return false;
381 }
382 return true;
383}
384
385static void populateWorklist(Loop &L, LoopVector &LoopList) {
386 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
387 << L.getHeader()->getParent()->getName() << " Loop: %"
388 << L.getHeader()->getName() << '\n');
389 assert(LoopList.empty() && "LoopList should initially be empty!");
390 Loop *CurrentLoop = &L;
391 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
392 while (!Vec->empty()) {
393 // The current loop has multiple subloops in it hence it is not tightly
394 // nested.
395 // Discard all loops above it added into Worklist.
396 if (Vec->size() != 1) {
397 LoopList = {};
398 return;
399 }
400
401 LoopList.push_back(CurrentLoop);
402 CurrentLoop = Vec->front();
403 Vec = &CurrentLoop->getSubLoops();
404 }
405 LoopList.push_back(CurrentLoop);
406}
407
410 unsigned LoopNestDepth = LoopList.size();
411 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
412 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
413 << ", the supported range is [" << MinLoopNestDepth
414 << ", " << MaxLoopNestDepth << "].\n");
415 Loop *OuterLoop = LoopList.front();
416 ORE.emit([&]() {
417 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
418 OuterLoop->getStartLoc(),
419 OuterLoop->getHeader())
420 << "Unsupported depth of loop nest, the supported range is ["
421 << std::to_string(MinLoopNestDepth) << ", "
422 << std::to_string(MaxLoopNestDepth) << "].\n";
423 });
424 return false;
425 }
426 return true;
427}
428
430 ArrayRef<Loop *> LoopList) {
431 for (Loop *L : LoopList) {
432 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
433 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
434 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
435 return false;
436 }
437 if (L->getNumBackEdges() != 1) {
438 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
439 return false;
440 }
441 if (!L->getExitingBlock()) {
442 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
443 return false;
444 }
445 }
446 return true;
447}
448
449namespace {
450
451/// LoopInterchangeLegality checks if it is legal to interchange the loop.
452class LoopInterchangeLegality {
453public:
454 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
455 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
456 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
457
458 /// Check if the loops can be interchanged.
459 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
460 CharMatrix &DepMatrix);
461
462 /// Discover induction PHIs in the header of \p L. Induction
463 /// PHIs are added to \p Inductions.
464 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
465
466 /// Check if the loop structure is understood. We do not handle triangular
467 /// loops for now.
468 bool isLoopStructureUnderstood();
469
470 bool currentLimitations();
471
472 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
473 return OuterInnerReductions;
474 }
475
476 const ArrayRef<PHINode *> getInnerLoopInductions() const {
477 return InnerLoopInductions;
478 }
479
480 ArrayRef<Instruction *> getHasNoWrapReductions() const {
481 return HasNoWrapReductions;
482 }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Discover induction and reduction PHIs in the header of \p L. Induction
511 /// PHIs are added to \p Inductions, reductions are added to
512 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
513 /// to be passed as \p InnerLoop.
514 bool findInductionAndReductions(Loop *L,
515 SmallVector<PHINode *, 8> &Inductions,
516 Loop *InnerLoop);
517
518 /// Detect and record the reduction of the inner loop. Add them to
519 /// InnerReductions.
520 ///
521 /// innerloop:
522 /// Re = phi<0.0, Next>
523 /// Next = Re op ...
524 /// OuterLoopLatch:
525 /// Lcssa = phi<Next> ; lcssa phi
526 /// store Lcssa, MemRef ; LcssaStore
527 ///
528 bool isInnerReduction(Loop *L, PHINode *Phi,
529 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
530
531 Loop *OuterLoop;
532 Loop *InnerLoop;
533
534 ScalarEvolution *SE;
535 DominatorTree *DT;
536
537 /// Interface to emit optimization remarks.
538 OptimizationRemarkEmitter *ORE;
539
540 /// Set of reduction PHIs taking part of a reduction across the inner and
541 /// outer loop.
542 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
543
544 /// Set of inner loop induction PHIs
545 SmallVector<PHINode *, 8> InnerLoopInductions;
546
547 /// Hold instructions that have nuw/nsw flags and involved in reductions,
548 /// like integer addition/multiplication. Those flags must be dropped when
549 /// interchanging the loops.
550 SmallVector<Instruction *, 4> HasNoWrapReductions;
551
552 /// Vector of reductions in the inner loop.
553 SmallVector<InnerReduction, 8> InnerReductions;
554};
555
556/// Manages information utilized by the profitability check for cache. The main
557/// purpose of this class is to delay the computation of CacheCost until it is
558/// actually needed.
559class CacheCostManager {
560 Loop *OutermostLoop;
561 LoopStandardAnalysisResults *AR;
562 DependenceInfo *DI;
563
564 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
565 /// that the result can be nullptr.
566 std::optional<std::unique_ptr<CacheCost>> CC;
567
568 /// Maps each loop to an index representing the optimal position within the
569 /// loop-nest, as determined by the cache cost analysis.
570 DenseMap<const Loop *, unsigned> CostMap;
571
572 void computeIfUnitinialized();
573
574public:
575 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
576 DependenceInfo *DI)
577 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
578 CacheCost *getCacheCost();
579 const DenseMap<const Loop *, unsigned> &getCostMap();
580};
581
582/// LoopInterchangeProfitability checks if it is profitable to interchange the
583/// loop.
584class LoopInterchangeProfitability {
585public:
586 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
587 OptimizationRemarkEmitter *ORE)
588 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
589
590 /// Check if the loop interchange is profitable.
591 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
592 unsigned InnerLoopId, unsigned OuterLoopId,
593 CharMatrix &DepMatrix, CacheCostManager &CCM);
594
595private:
596 int getInstrOrderCost();
597 std::optional<bool> isProfitablePerLoopCacheAnalysis(
598 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
599 std::optional<bool> isProfitablePerInstrOrderCost();
600 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
601 unsigned OuterLoopId,
602 CharMatrix &DepMatrix);
603 Loop *OuterLoop;
604 Loop *InnerLoop;
605
606 /// Scev analysis.
607 ScalarEvolution *SE;
608
609 /// Interface to emit optimization remarks.
610 OptimizationRemarkEmitter *ORE;
611};
612
613/// LoopInterchangeTransform interchanges the loop.
614class LoopInterchangeTransform {
615public:
616 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
617 LoopInfo *LI, DominatorTree *DT,
618 const LoopInterchangeLegality &LIL)
619 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
620
621 /// Interchange OuterLoop and InnerLoop.
622 bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
623 void reduction2Memory();
624 void restructureLoops(Loop *NewInner, Loop *NewOuter,
625 BasicBlock *OrigInnerPreHeader,
626 BasicBlock *OrigOuterPreHeader);
627 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
628
629private:
630 bool adjustLoopLinks();
631 bool adjustLoopBranches();
632
633 Loop *OuterLoop;
634 Loop *InnerLoop;
635
636 /// Scev analysis.
637 ScalarEvolution *SE;
638
639 LoopInfo *LI;
640 DominatorTree *DT;
641
642 const LoopInterchangeLegality &LIL;
643};
644
645struct LoopInterchange {
646 ScalarEvolution *SE = nullptr;
647 LoopInfo *LI = nullptr;
648 DependenceInfo *DI = nullptr;
649 DominatorTree *DT = nullptr;
650 LoopStandardAnalysisResults *AR = nullptr;
651
652 /// Interface to emit optimization remarks.
653 OptimizationRemarkEmitter *ORE;
654
655 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
656 DominatorTree *DT, LoopStandardAnalysisResults *AR,
657 OptimizationRemarkEmitter *ORE)
658 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
659
660 bool run(Loop *L) {
661 if (L->getParentLoop())
662 return false;
663 SmallVector<Loop *, 8> LoopList;
664 populateWorklist(*L, LoopList);
665 return processLoopList(LoopList);
666 }
667
668 bool run(LoopNest &LN) {
669 SmallVector<Loop *, 8> LoopList(LN.getLoops());
670 for (unsigned I = 1; I < LoopList.size(); ++I)
671 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
672 return false;
673 return processLoopList(LoopList);
674 }
675
676 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
677 // TODO: Add a better heuristic to select the loop to be interchanged based
678 // on the dependence matrix. Currently we select the innermost loop.
679 return LoopList.size() - 1;
680 }
681
682 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
683 bool Changed = false;
684
685 // Ensure proper loop nest depth.
686 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
687 "Unsupported depth of loop nest.");
688
689 unsigned LoopNestDepth = LoopList.size();
690
691 LLVM_DEBUG({
692 dbgs() << "Processing LoopList of size = " << LoopNestDepth
693 << " containing the following loops:\n";
694 for (auto *L : LoopList) {
695 dbgs() << " - ";
696 L->print(dbgs());
697 }
698 });
699
700 CharMatrix DependencyMatrix;
701 Loop *OuterMostLoop = *(LoopList.begin());
702 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
703 OuterMostLoop, DI, SE, ORE)) {
704 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
705 return false;
706 }
707
708 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
709 printDepMatrix(DependencyMatrix));
710
711 // Get the Outermost loop exit.
712 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
713 if (!LoopNestExit) {
714 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
715 << "' needs an unique exit block");
716 return false;
717 }
718
719 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
720 CacheCostManager CCM(LoopList[0], AR, DI);
721 // We try to achieve the globally optimal memory access for the loopnest,
722 // and do interchange based on a bubble-sort fasion. We start from
723 // the innermost loop, move it outwards to the best possible position
724 // and repeat this process.
725 for (unsigned j = SelecLoopId; j > 0; j--) {
726 bool ChangedPerIter = false;
727 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
728 bool Interchanged =
729 processLoop(LoopList, i, i - 1, DependencyMatrix, CCM);
730 ChangedPerIter |= Interchanged;
731 Changed |= Interchanged;
732 }
733 // Early abort if there was no interchange during an entire round of
734 // moving loops outwards.
735 if (!ChangedPerIter)
736 break;
737 }
738 return Changed;
739 }
740
741 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
742 unsigned OuterLoopId,
743 std::vector<std::vector<char>> &DependencyMatrix,
744 CacheCostManager &CCM) {
745 Loop *OuterLoop = LoopList[OuterLoopId];
746 Loop *InnerLoop = LoopList[InnerLoopId];
747 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
748 << " and OuterLoopId = " << OuterLoopId << "\n");
749 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
750 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
751 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
752 << OuterLoop->getName() << "' and '"
753 << InnerLoop->getName() << "'\n");
754 return false;
755 }
756 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
757 << InnerLoop->getName()
758 << "' are legal to interchange\n");
759 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
760 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
761 DependencyMatrix, CCM)) {
762 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
763 << "' and '" << InnerLoop->getName()
764 << "' not profitable.\n");
765 return false;
766 }
767
768 ORE->emit([&]() {
769 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
770 InnerLoop->getStartLoc(),
771 InnerLoop->getHeader())
772 << "Loop interchanged with enclosing loop.";
773 });
774
775 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
776 LIT.transform(LIL.getHasNoWrapReductions());
777 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
778 << OuterLoop->getName() << "' and inner loop '"
779 << InnerLoop->getName() << "'\n");
780 LoopsInterchanged++;
781
782 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
783
784 // Loops interchanged, update LoopList accordingly.
785 std::swap(LoopList[OuterLoopId], LoopList[InnerLoopId]);
786 // Update the DependencyMatrix
787 interChangeDependencies(DependencyMatrix, InnerLoopId, OuterLoopId);
788
789 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
790 printDepMatrix(DependencyMatrix));
791
792 return true;
793 }
794};
795
796} // end anonymous namespace
797
798bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
799 Instruction *Skip) {
800 return any_of(*BB, [Skip](const Instruction &I) {
801 if (&I == Skip)
802 return false;
803 return I.mayHaveSideEffects() || I.mayReadFromMemory();
804 });
805}
806
807bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
808 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
809 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
810 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
811
812 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
813 << "' and '" << InnerLoop->getName()
814 << "' are tightly nested\n");
815
816 // A perfectly nested loop will not have any branch in between the outer and
817 // inner block i.e. outer header will branch to either inner preheader and
818 // outerloop latch.
819 BranchInst *OuterLoopHeaderBI =
820 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
821 if (!OuterLoopHeaderBI)
822 return false;
823
824 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
825 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
826 Succ != OuterLoopLatch)
827 return false;
828
829 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
830
831 // The inner loop reduction pattern requires storing the LCSSA PHI in
832 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
833 // that store during checks.
834 Instruction *Skip = nullptr;
835 assert(InnerReductions.size() <= 1 &&
836 "So far we only support at most one reduction.");
837 if (InnerReductions.size() == 1)
838 Skip = InnerReductions[0].LcssaStore;
839
840 // We do not have any basic block in between now make sure the outer header
841 // and outer loop latch doesn't contain any unsafe instructions.
842 if (containsUnsafeInstructions(OuterLoopHeader, Skip) ||
843 containsUnsafeInstructions(OuterLoopLatch, Skip))
844 return false;
845
846 // Also make sure the inner loop preheader does not contain any unsafe
847 // instructions. Note that all instructions in the preheader will be moved to
848 // the outer loop header when interchanging.
849 if (InnerLoopPreHeader != OuterLoopHeader &&
850 containsUnsafeInstructions(InnerLoopPreHeader, Skip))
851 return false;
852
853 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
854 // Ensure the inner loop exit block flows to the outer loop latch possibly
855 // through empty blocks.
856 const BasicBlock &SuccInner =
857 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
858 if (&SuccInner != OuterLoopLatch) {
859 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
860 << " does not lead to the outer loop latch.\n";);
861 return false;
862 }
863 // The inner loop exit block does flow to the outer loop latch and not some
864 // other BBs, now make sure it contains safe instructions, since it will be
865 // moved into the (new) inner loop after interchange.
866 if (containsUnsafeInstructions(InnerLoopExit, Skip))
867 return false;
868
869 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
870 // We have a perfect loop nest.
871 return true;
872}
873
874bool LoopInterchangeLegality::isLoopStructureUnderstood() {
875 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
876 for (PHINode *InnerInduction : InnerLoopInductions) {
877 unsigned Num = InnerInduction->getNumOperands();
878 for (unsigned i = 0; i < Num; ++i) {
879 Value *Val = InnerInduction->getOperand(i);
880 if (isa<Constant>(Val))
881 continue;
883 if (!I)
884 return false;
885 // TODO: Handle triangular loops.
886 // e.g. for(int i=0;i<N;i++)
887 // for(int j=i;j<N;j++)
888 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
889 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
890 InnerLoopPreheader &&
891 !OuterLoop->isLoopInvariant(I)) {
892 return false;
893 }
894 }
895 }
896
897 // TODO: Handle triangular loops of another form.
898 // e.g. for(int i=0;i<N;i++)
899 // for(int j=0;j<i;j++)
900 // or,
901 // for(int i=0;i<N;i++)
902 // for(int j=0;j*i<N;j++)
903 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
904 BranchInst *InnerLoopLatchBI =
905 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
906 if (!InnerLoopLatchBI->isConditional())
907 return false;
908 if (CmpInst *InnerLoopCmp =
909 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
910 Value *Op0 = InnerLoopCmp->getOperand(0);
911 Value *Op1 = InnerLoopCmp->getOperand(1);
912
913 // LHS and RHS of the inner loop exit condition, e.g.,
914 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
915 Value *Left = nullptr;
916 Value *Right = nullptr;
917
918 // Check if V only involves inner loop induction variable.
919 // Return true if V is InnerInduction, or a cast from
920 // InnerInduction, or a binary operator that involves
921 // InnerInduction and a constant.
922 std::function<bool(Value *)> IsPathToInnerIndVar;
923 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
924 if (llvm::is_contained(InnerLoopInductions, V))
925 return true;
926 if (isa<Constant>(V))
927 return true;
929 if (!I)
930 return false;
931 if (isa<CastInst>(I))
932 return IsPathToInnerIndVar(I->getOperand(0));
934 return IsPathToInnerIndVar(I->getOperand(0)) &&
935 IsPathToInnerIndVar(I->getOperand(1));
936 return false;
937 };
938
939 // In case of multiple inner loop indvars, it is okay if LHS and RHS
940 // are both inner indvar related variables.
941 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
942 return true;
943
944 // Otherwise we check if the cmp instruction compares an inner indvar
945 // related variable (Left) with a outer loop invariant (Right).
946 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
947 Left = Op0;
948 Right = Op1;
949 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
950 Left = Op1;
951 Right = Op0;
952 }
953
954 if (Left == nullptr)
955 return false;
956
957 const SCEV *S = SE->getSCEV(Right);
958 if (!SE->isLoopInvariant(S, OuterLoop))
959 return false;
960 }
961
962 return true;
963}
964
965// If SV is a LCSSA PHI node with a single incoming value, return the incoming
966// value.
967static Value *followLCSSA(Value *SV) {
969 if (!PHI)
970 return SV;
971
972 if (PHI->getNumIncomingValues() != 1)
973 return SV;
974 return followLCSSA(PHI->getIncomingValue(0));
975}
976
978 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
981 // Detect floating point reduction only when it can be reordered.
982 if (RD.getExactFPMathInst() != nullptr)
983 return false;
984
986 switch (RK) {
987 case RecurKind::Or:
988 case RecurKind::And:
989 case RecurKind::Xor:
990 case RecurKind::SMin:
991 case RecurKind::SMax:
992 case RecurKind::UMin:
993 case RecurKind::UMax:
994 case RecurKind::FAdd:
995 case RecurKind::FMul:
996 case RecurKind::FMin:
997 case RecurKind::FMax:
1002 case RecurKind::FMulAdd:
1003 case RecurKind::AnyOf:
1004 return true;
1005
1006 // Change the order of integer addition/multiplication may change the
1007 // semantics. Consider the following case:
1008 //
1009 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1010 // int sum = 0;
1011 // for (int i = 0; i < 2; i++)
1012 // for (int j = 0; j < 2; j++)
1013 // sum += A[j][i];
1014 //
1015 // If the above loops are exchanged, the addition will cause an
1016 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1017 // addition/multiplication instructions when we actually exchanges the
1018 // loops.
1019 case RecurKind::Add:
1020 case RecurKind::Mul: {
1021 unsigned OpCode = RecurrenceDescriptor::getOpcode(RK);
1023
1024 // Bail out when we fail to collect reduction instructions chain.
1025 if (Ops.empty())
1026 return false;
1027
1028 for (Instruction *I : Ops) {
1029 assert(I->getOpcode() == OpCode &&
1030 "Expected the instruction to be the reduction operation");
1031 (void)OpCode;
1032
1033 // If the instruction has nuw/nsw flags, we must drop them when the
1034 // transformation is actually performed.
1035 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1036 HasNoWrapInsts.push_back(I);
1037 }
1038 return true;
1039 }
1040
1041 default:
1042 return false;
1043 }
1044 } else
1045 return false;
1046}
1047
1048// Check V's users to see if it is involved in a reduction in L.
1049static PHINode *
1051 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1052 // Reduction variables cannot be constants.
1053 if (isa<Constant>(V))
1054 return nullptr;
1055
1056 for (Value *User : V->users()) {
1058 if (PHI->getNumIncomingValues() == 1)
1059 continue;
1060
1061 if (checkReductionKind(L, PHI, HasNoWrapInsts))
1062 return PHI;
1063 else
1064 return nullptr;
1065 }
1066 }
1067
1068 return nullptr;
1069}
1070
1071bool LoopInterchangeLegality::isInnerReduction(
1072 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1073
1074 // Only support reduction2Mem when the loop nest to be interchanged is
1075 // the innermost two loops.
1076 if (!L->isInnermost()) {
1077 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1078 ORE->emit([&]() {
1079 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1080 L->getStartLoc(), L->getHeader())
1081 << "Only supported when the loop is the innermost.";
1082 });
1083 return false;
1084 }
1085
1086 if (Phi->getNumIncomingValues() != 2)
1087 return false;
1088
1089 Value *Init = Phi->getIncomingValueForBlock(L->getLoopPreheader());
1090 Value *Next = Phi->getIncomingValueForBlock(L->getLoopLatch());
1091
1092 // So far only supports constant initial value.
1093 if (!isa<Constant>(Init)) {
1094 LLVM_DEBUG(
1095 dbgs()
1096 << "Only supported for the reduction with a constant initial value.\n");
1097 ORE->emit([&]() {
1098 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1099 L->getStartLoc(), L->getHeader())
1100 << "Only supported for the reduction with a constant initial "
1101 "value.";
1102 });
1103 return false;
1104 }
1105
1106 // The reduction result must live in the inner loop.
1107 if (Instruction *I = dyn_cast<Instruction>(Next)) {
1108 BasicBlock *BB = I->getParent();
1109 if (!L->contains(BB))
1110 return false;
1111 }
1112
1113 // The reduction should have only one user.
1114 if (!Phi->hasOneUser())
1115 return false;
1116
1117 // Check the reduction kind.
1118 if (!checkReductionKind(L, Phi, HasNoWrapInsts))
1119 return false;
1120
1121 // Find lcssa_phi in OuterLoop's Latch
1122 BasicBlock *ExitBlock = L->getExitBlock();
1123 if (!ExitBlock)
1124 return false;
1125
1126 PHINode *Lcssa = NULL;
1127 for (auto *U : Next->users()) {
1128 if (auto *P = dyn_cast<PHINode>(U)) {
1129 if (P == Phi)
1130 continue;
1131
1132 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1133 P->getIncomingValueForBlock(L->getLoopLatch()) == Next)
1134 Lcssa = P;
1135 else
1136 return false;
1137 } else
1138 return false;
1139 }
1140 if (!Lcssa)
1141 return false;
1142
1143 if (!Lcssa->hasOneUser()) {
1144 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1145 "the outer loop.\n");
1146 ORE->emit([&]() {
1147 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1148 L->getStartLoc(), L->getHeader())
1149 << "Only supported when the reduction is used once in the outer "
1150 "loop.";
1151 });
1152 return false;
1153 }
1154
1155 StoreInst *LcssaStore =
1157 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1158 return false;
1159
1160 Value *MemRef = LcssaStore->getOperand(1);
1161 Type *ElemTy = LcssaStore->getOperand(0)->getType();
1162
1163 // LcssaStore stores the reduction result in BB.
1164 // When the reduction is initialized from a constant value, we need to load
1165 // from the memory object into the target basic block of the inner loop. This
1166 // means the memory reference was used prematurely. So we must ensure that the
1167 // memory reference does not dominate the target basic block.
1168 // TODO: Move the memory reference definition into the loop header.
1169 if (!DT->dominates(dyn_cast<Instruction>(MemRef), L->getHeader())) {
1170 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1171 "the inner loop.\n");
1172 ORE->emit([&]() {
1173 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1174 L->getStartLoc(), L->getHeader())
1175 << "Only supported when memory reference dominate the inner "
1176 "loop.";
1177 });
1178 return false;
1179 }
1180
1181 // Found a reduction in the inner loop.
1182 InnerReduction SR;
1183 SR.Reduction = Phi;
1184 SR.Init = Init;
1185 SR.Next = Next;
1186 SR.LcssaPhi = Lcssa;
1187 SR.LcssaStore = LcssaStore;
1188 SR.MemRef = MemRef;
1189 SR.ElemTy = ElemTy;
1190
1191 InnerReductions.push_back(SR);
1192 return true;
1193}
1194
1195bool LoopInterchangeLegality::findInductionAndReductions(
1196 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1197 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1198 return false;
1199 for (PHINode &PHI : L->getHeader()->phis()) {
1200 InductionDescriptor ID;
1202 Inductions.push_back(&PHI);
1203 else {
1204 // PHIs in inner loops need to be part of a reduction in the outer loop,
1205 // discovered when checking the PHIs of the outer loop earlier.
1206 if (!InnerLoop) {
1207 if (OuterInnerReductions.count(&PHI)) {
1208 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1209 } else if (EnableReduction2Memory &&
1210 isInnerReduction(L, &PHI, HasNoWrapReductions)) {
1211 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1212 << PHI << '\n');
1213 } else
1214 return false;
1215 } else {
1216 assert(PHI.getNumIncomingValues() == 2 &&
1217 "Phis in loop header should have exactly 2 incoming values");
1218 // Check if we have a PHI node in the outer loop that has a reduction
1219 // result from the inner loop as an incoming value.
1220 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
1221 PHINode *InnerRedPhi =
1222 findInnerReductionPhi(InnerLoop, V, HasNoWrapReductions);
1223 if (!InnerRedPhi ||
1224 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
1225 LLVM_DEBUG(
1226 dbgs()
1227 << "Failed to recognize PHI as an induction or reduction.\n");
1228 return false;
1229 }
1230 OuterInnerReductions.insert(&PHI);
1231 OuterInnerReductions.insert(InnerRedPhi);
1232 }
1233 }
1234 }
1235
1236 // For now we only support at most one reduction.
1237 if (InnerReductions.size() > 1) {
1238 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1239 ORE->emit([&]() {
1240 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1241 L->getStartLoc(), L->getHeader())
1242 << "Only supports at most one reduction.";
1243 });
1244 return false;
1245 }
1246
1247 return true;
1248}
1249
1250// This function indicates the current limitations in the transform as a result
1251// of which we do not proceed.
1252bool LoopInterchangeLegality::currentLimitations() {
1253 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1254
1255 // transform currently expects the loop latches to also be the exiting
1256 // blocks.
1257 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1258 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1259 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
1260 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
1261 LLVM_DEBUG(
1262 dbgs() << "Loops where the latch is not the exiting block are not"
1263 << " supported currently.\n");
1264 ORE->emit([&]() {
1265 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1266 OuterLoop->getStartLoc(),
1267 OuterLoop->getHeader())
1268 << "Loops where the latch is not the exiting block cannot be"
1269 " interchange currently.";
1270 });
1271 return true;
1272 }
1273
1274 SmallVector<PHINode *, 8> Inductions;
1275 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
1276 LLVM_DEBUG(
1277 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1278 << "are supported currently.\n");
1279 ORE->emit([&]() {
1280 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1281 OuterLoop->getStartLoc(),
1282 OuterLoop->getHeader())
1283 << "Only outer loops with induction or reduction PHI nodes can be"
1284 " interchanged currently.";
1285 });
1286 return true;
1287 }
1288
1289 Inductions.clear();
1290 // For multi-level loop nests, make sure that all phi nodes for inner loops
1291 // at all levels can be recognized as a induction or reduction phi. Bail out
1292 // if a phi node at a certain nesting level cannot be properly recognized.
1293 Loop *CurLevelLoop = OuterLoop;
1294 while (!CurLevelLoop->getSubLoops().empty()) {
1295 // We already made sure that the loop nest is tightly nested.
1296 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1297 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
1298 LLVM_DEBUG(
1299 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1300 << "are supported currently.\n");
1301 ORE->emit([&]() {
1302 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1303 CurLevelLoop->getStartLoc(),
1304 CurLevelLoop->getHeader())
1305 << "Only inner loops with induction or reduction PHI nodes can be"
1306 " interchange currently.";
1307 });
1308 return true;
1309 }
1310 }
1311
1312 // TODO: Triangular loops are not handled for now.
1313 if (!isLoopStructureUnderstood()) {
1314 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1315 ORE->emit([&]() {
1316 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1317 InnerLoop->getStartLoc(),
1318 InnerLoop->getHeader())
1319 << "Inner loop structure not understood currently.";
1320 });
1321 return true;
1322 }
1323
1324 return false;
1325}
1326
1327bool LoopInterchangeLegality::findInductions(
1328 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1329 for (PHINode &PHI : L->getHeader()->phis()) {
1330 InductionDescriptor ID;
1332 Inductions.push_back(&PHI);
1333 }
1334 return !Inductions.empty();
1335}
1336
1337// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1338// users are either reduction PHIs or PHIs outside the outer loop (which means
1339// the we are only interested in the final value after the loop).
1340static bool
1343 PHINode *LcssaReduction) {
1344 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1345 for (PHINode &PHI : InnerExit->phis()) {
1346 // The reduction LCSSA PHI will have only one incoming block, which comes
1347 // from the loop latch.
1348 if (PHI.getNumIncomingValues() > 1)
1349 return false;
1350 if (&PHI == LcssaReduction)
1351 return true;
1352 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
1353 PHINode *PN = dyn_cast<PHINode>(U);
1354 return !PN ||
1355 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
1356 })) {
1357 return false;
1358 }
1359 }
1360 return true;
1361}
1362
1363// We currently support LCSSA PHI nodes in the outer loop exit, if their
1364// incoming values do not come from the outer loop latch or if the
1365// outer loop latch has a single predecessor. In that case, the value will
1366// be available if both the inner and outer loop conditions are true, which
1367// will still be true after interchanging. If we have multiple predecessor,
1368// that may not be the case, e.g. because the outer loop latch may be executed
1369// if the inner loop is not executed.
1370static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1371 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1372 for (PHINode &PHI : LoopNestExit->phis()) {
1373 for (Value *Incoming : PHI.incoming_values()) {
1375 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1376 continue;
1377
1378 // The incoming value is defined in the outer loop latch. Currently we
1379 // only support that in case the outer loop latch has a single predecessor.
1380 // This guarantees that the outer loop latch is executed if and only if
1381 // the inner loop is executed (because tightlyNested() guarantees that the
1382 // outer loop header only branches to the inner loop or the outer loop
1383 // latch).
1384 // FIXME: We could weaken this logic and allow multiple predecessors,
1385 // if the values are produced outside the loop latch. We would need
1386 // additional logic to update the PHI nodes in the exit block as
1387 // well.
1388 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1389 return false;
1390 }
1391 }
1392 return true;
1393}
1394
1395// In case of multi-level nested loops, it may occur that lcssa phis exist in
1396// the latch of InnerLoop, i.e., when defs of the incoming values are further
1397// inside the loopnest. Sometimes those incoming values are not available
1398// after interchange, since the original inner latch will become the new outer
1399// latch which may have predecessor paths that do not include those incoming
1400// values.
1401// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1402// multi-level loop nests.
1403static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1404 if (InnerLoop->getSubLoops().empty())
1405 return true;
1406 // If the original outer latch has only one predecessor, then values defined
1407 // further inside the looploop, e.g., in the innermost loop, will be available
1408 // at the new outer latch after interchange.
1409 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1410 return true;
1411
1412 // The outer latch has more than one predecessors, i.e., the inner
1413 // exit and the inner header.
1414 // PHI nodes in the inner latch are lcssa phis where the incoming values
1415 // are defined further inside the loopnest. Check if those phis are used
1416 // in the original inner latch. If that is the case then bail out since
1417 // those incoming values may not be available at the new outer latch.
1418 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1419 for (PHINode &PHI : InnerLoopLatch->phis()) {
1420 for (auto *U : PHI.users()) {
1422 if (InnerLoopLatch == UI->getParent())
1423 return false;
1424 }
1425 }
1426 return true;
1427}
1428
1429bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1430 unsigned OuterLoopId,
1431 CharMatrix &DepMatrix) {
1432 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1433 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1434 << " and OuterLoopId = " << OuterLoopId
1435 << " due to dependence\n");
1436 ORE->emit([&]() {
1437 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1438 InnerLoop->getStartLoc(),
1439 InnerLoop->getHeader())
1440 << "Cannot interchange loops due to dependences.";
1441 });
1442 return false;
1443 }
1444 // Check if outer and inner loop contain legal instructions only.
1445 for (auto *BB : OuterLoop->blocks())
1446 for (Instruction &I : BB->instructionsWithoutDebug())
1447 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
1448 // readnone functions do not prevent interchanging.
1449 if (CI->onlyWritesMemory())
1450 continue;
1451 LLVM_DEBUG(
1452 dbgs() << "Loops with call instructions cannot be interchanged "
1453 << "safely.");
1454 ORE->emit([&]() {
1455 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1456 CI->getDebugLoc(),
1457 CI->getParent())
1458 << "Cannot interchange loops due to call instruction.";
1459 });
1460
1461 return false;
1462 }
1463
1464 if (!findInductions(InnerLoop, InnerLoopInductions)) {
1465 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1466 return false;
1467 }
1468
1469 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1470 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1471 ORE->emit([&]() {
1472 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1473 InnerLoop->getStartLoc(),
1474 InnerLoop->getHeader())
1475 << "Cannot interchange loops because unsupported PHI nodes found "
1476 "in inner loop latch.";
1477 });
1478 return false;
1479 }
1480
1481 // TODO: The loops could not be interchanged due to current limitations in the
1482 // transform module.
1483 if (currentLimitations()) {
1484 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1485 return false;
1486 }
1487
1488 // Check if the loops are tightly nested.
1489 if (!tightlyNested(OuterLoop, InnerLoop)) {
1490 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1491 ORE->emit([&]() {
1492 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1493 InnerLoop->getStartLoc(),
1494 InnerLoop->getHeader())
1495 << "Cannot interchange loops because they are not tightly "
1496 "nested.";
1497 });
1498 return false;
1499 }
1500
1501 // The LCSSA PHI for the reduction has passed checks before; its user
1502 // is a store instruction.
1503 PHINode *LcssaReduction = nullptr;
1504 assert(InnerReductions.size() <= 1 &&
1505 "So far we only support at most one reduction.");
1506 if (InnerReductions.size() == 1)
1507 LcssaReduction = InnerReductions[0].LcssaPhi;
1508
1509 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, OuterInnerReductions,
1510 LcssaReduction)) {
1511 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1512 ORE->emit([&]() {
1513 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1514 InnerLoop->getStartLoc(),
1515 InnerLoop->getHeader())
1516 << "Found unsupported PHI node in loop exit.";
1517 });
1518 return false;
1519 }
1520
1521 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1522 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1523 ORE->emit([&]() {
1524 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1525 OuterLoop->getStartLoc(),
1526 OuterLoop->getHeader())
1527 << "Found unsupported PHI node in loop exit.";
1528 });
1529 return false;
1530 }
1531
1532 return true;
1533}
1534
1535void CacheCostManager::computeIfUnitinialized() {
1536 if (CC.has_value())
1537 return;
1538
1539 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1540 CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
1541 // Obtain the loop vector returned from loop cache analysis beforehand,
1542 // and put each <Loop, index> pair into a map for constant time query
1543 // later. Indices in loop vector reprsent the optimal order of the
1544 // corresponding loop, e.g., given a loopnest with depth N, index 0
1545 // indicates the loop should be placed as the outermost loop and index N
1546 // indicates the loop should be placed as the innermost loop.
1547 //
1548 // For the old pass manager CacheCost would be null.
1549 if (*CC != nullptr)
1550 for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
1551 CostMap[Cost.first] = Idx;
1552}
1553
1554CacheCost *CacheCostManager::getCacheCost() {
1555 computeIfUnitinialized();
1556 return CC->get();
1557}
1558
1559const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1560 computeIfUnitinialized();
1561 return CostMap;
1562}
1563
1564int LoopInterchangeProfitability::getInstrOrderCost() {
1565 unsigned GoodOrder, BadOrder;
1566 BadOrder = GoodOrder = 0;
1567 for (BasicBlock *BB : InnerLoop->blocks()) {
1568 for (Instruction &Ins : *BB) {
1569 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1570 bool FoundInnerInduction = false;
1571 bool FoundOuterInduction = false;
1572 for (Value *Op : GEP->operands()) {
1573 // Skip operands that are not SCEV-able.
1574 if (!SE->isSCEVable(Op->getType()))
1575 continue;
1576
1577 const SCEV *OperandVal = SE->getSCEV(Op);
1578 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1579 if (!AR)
1580 continue;
1581
1582 // If we find the inner induction after an outer induction e.g.
1583 // for(int i=0;i<N;i++)
1584 // for(int j=0;j<N;j++)
1585 // A[i][j] = A[i-1][j-1]+k;
1586 // then it is a good order.
1587 if (AR->getLoop() == InnerLoop) {
1588 // We found an InnerLoop induction after OuterLoop induction. It is
1589 // a good order.
1590 FoundInnerInduction = true;
1591 if (FoundOuterInduction) {
1592 GoodOrder++;
1593 break;
1594 }
1595 }
1596 // If we find the outer induction after an inner induction e.g.
1597 // for(int i=0;i<N;i++)
1598 // for(int j=0;j<N;j++)
1599 // A[j][i] = A[j-1][i-1]+k;
1600 // then it is a bad order.
1601 if (AR->getLoop() == OuterLoop) {
1602 // We found an OuterLoop induction after InnerLoop induction. It is
1603 // a bad order.
1604 FoundOuterInduction = true;
1605 if (FoundInnerInduction) {
1606 BadOrder++;
1607 break;
1608 }
1609 }
1610 }
1611 }
1612 }
1613 }
1614 return GoodOrder - BadOrder;
1615}
1616
1617std::optional<bool>
1618LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1619 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1620 // This is the new cost model returned from loop cache analysis.
1621 // A smaller index means the loop should be placed an outer loop, and vice
1622 // versa.
1623 auto InnerLoopIt = CostMap.find(InnerLoop);
1624 if (InnerLoopIt == CostMap.end())
1625 return std::nullopt;
1626 auto OuterLoopIt = CostMap.find(OuterLoop);
1627 if (OuterLoopIt == CostMap.end())
1628 return std::nullopt;
1629
1630 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1631 return std::nullopt;
1632 unsigned InnerIndex = InnerLoopIt->second;
1633 unsigned OuterIndex = OuterLoopIt->second;
1634 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1635 << ", OuterIndex = " << OuterIndex << "\n");
1636 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1637 "numbers to each loop");
1638 return std::optional<bool>(InnerIndex < OuterIndex);
1639}
1640
1641std::optional<bool>
1642LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1643 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1644 // good and bad order of induction variables in the instruction and allows
1645 // reordering if number of bad orders is more than good.
1646 int Cost = getInstrOrderCost();
1647 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1649 return std::optional<bool>(true);
1650
1651 return std::nullopt;
1652}
1653
1654/// Return true if we can vectorize the loop specified by \p LoopId.
1655static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1656 for (const auto &Dep : DepMatrix) {
1657 char Dir = Dep[LoopId];
1658 char DepType = Dep.back();
1659 assert((DepType == '<' || DepType == '*') &&
1660 "Unexpected element in dependency vector");
1661
1662 // There are no loop-carried dependencies.
1663 if (Dir == '=' || Dir == 'I')
1664 continue;
1665
1666 // DepType being '<' means that this direction vector represents a forward
1667 // dependency. In principle, a loop with '<' direction can be vectorized in
1668 // this case.
1669 if (Dir == '<' && DepType == '<')
1670 continue;
1671
1672 // We cannot prove that the loop is vectorizable.
1673 return false;
1674 }
1675 return true;
1676}
1677
1678std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1679 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1680 // If the outer loop cannot be vectorized, it is not profitable to move this
1681 // to inner position.
1682 if (!canVectorize(DepMatrix, OuterLoopId))
1683 return false;
1684
1685 // If the inner loop cannot be vectorized but the outer loop can be, then it
1686 // is profitable to interchange to enable inner loop parallelism.
1687 if (!canVectorize(DepMatrix, InnerLoopId))
1688 return true;
1689
1690 // If both the inner and the outer loop can be vectorized, it is necessary to
1691 // check the cost of each vectorized loop for profitability decision. At this
1692 // time we do not have a cost model to estimate them, so return nullopt.
1693 // TODO: Estimate the cost of vectorized loop when both the outer and the
1694 // inner loop can be vectorized.
1695 return std::nullopt;
1696}
1697
1698bool LoopInterchangeProfitability::isProfitable(
1699 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1700 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1701 // Do not consider loops with a backedge that isn't taken, e.g. an
1702 // unconditional branch true/false, as candidates for interchange.
1703 // TODO: when interchange is forced, we should probably also allow
1704 // interchange for these loops, and thus this logic should be moved just
1705 // below the cost-model ignore check below. But this check is done first
1706 // to avoid the issue in #163954.
1707 const SCEV *InnerBTC = SE->getBackedgeTakenCount(InnerLoop);
1708 const SCEV *OuterBTC = SE->getBackedgeTakenCount(OuterLoop);
1709 if (InnerBTC && InnerBTC->isZero()) {
1710 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1711 "single iteration loop\n");
1712 return false;
1713 }
1714 if (OuterBTC && OuterBTC->isZero()) {
1715 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1716 "single iteration loop\n");
1717 return false;
1718 }
1719
1720 // Return true if interchange is forced and the cost-model ignored.
1721 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1722 return true;
1724 "Duplicate rules and option 'ignore' are not allowed");
1725
1726 // isProfitable() is structured to avoid endless loop interchange. If the
1727 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1728 // decide the profitability then, profitability check will stop and return the
1729 // analysis result. If it failed to determine it (e.g., cache analysis failed
1730 // to analyze the loopnest due to delinearization issues) then go ahead the
1731 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1732 // Likewise, if it failed to analysis the profitability then only, the last
1733 // rule (isProfitableForVectorization by default) will decide.
1734 std::optional<bool> shouldInterchange;
1735 for (RuleTy RT : Profitabilities) {
1736 switch (RT) {
1737 case RuleTy::PerLoopCacheAnalysis: {
1738 CacheCost *CC = CCM.getCacheCost();
1739 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1740 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1741 break;
1742 }
1743 case RuleTy::PerInstrOrderCost:
1744 shouldInterchange = isProfitablePerInstrOrderCost();
1745 break;
1746 case RuleTy::ForVectorization:
1747 shouldInterchange =
1748 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1749 break;
1750 case RuleTy::Ignore:
1751 llvm_unreachable("Option 'ignore' is not supported with other options");
1752 break;
1753 }
1754
1755 // If this rule could determine the profitability, don't call subsequent
1756 // rules.
1757 if (shouldInterchange.has_value())
1758 break;
1759 }
1760
1761 if (!shouldInterchange.has_value()) {
1762 ORE->emit([&]() {
1763 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1764 InnerLoop->getStartLoc(),
1765 InnerLoop->getHeader())
1766 << "Insufficient information to calculate the cost of loop for "
1767 "interchange.";
1768 });
1769 return false;
1770 } else if (!shouldInterchange.value()) {
1771 ORE->emit([&]() {
1772 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1773 InnerLoop->getStartLoc(),
1774 InnerLoop->getHeader())
1775 << "Interchanging loops is not considered to improve cache "
1776 "locality nor vectorization.";
1777 });
1778 return false;
1779 }
1780 return true;
1781}
1782
1783void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1784 Loop *InnerLoop) {
1785 for (Loop *L : *OuterLoop)
1786 if (L == InnerLoop) {
1787 OuterLoop->removeChildLoop(L);
1788 return;
1789 }
1790 llvm_unreachable("Couldn't find loop");
1791}
1792
1793/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1794/// new inner and outer loop after interchanging: NewInner is the original
1795/// outer loop and NewOuter is the original inner loop.
1796///
1797/// Before interchanging, we have the following structure
1798/// Outer preheader
1799// Outer header
1800// Inner preheader
1801// Inner header
1802// Inner body
1803// Inner latch
1804// outer bbs
1805// Outer latch
1806//
1807// After interchanging:
1808// Inner preheader
1809// Inner header
1810// Outer preheader
1811// Outer header
1812// Inner body
1813// outer bbs
1814// Outer latch
1815// Inner latch
1816void LoopInterchangeTransform::restructureLoops(
1817 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1818 BasicBlock *OrigOuterPreHeader) {
1819 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1820 // The original inner loop preheader moves from the new inner loop to
1821 // the parent loop, if there is one.
1822 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1823 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1824
1825 // Switch the loop levels.
1826 if (OuterLoopParent) {
1827 // Remove the loop from its parent loop.
1828 removeChildLoop(OuterLoopParent, NewInner);
1829 removeChildLoop(NewInner, NewOuter);
1830 OuterLoopParent->addChildLoop(NewOuter);
1831 } else {
1832 removeChildLoop(NewInner, NewOuter);
1833 LI->changeTopLevelLoop(NewInner, NewOuter);
1834 }
1835 while (!NewOuter->isInnermost())
1836 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1837 NewOuter->addChildLoop(NewInner);
1838
1839 // BBs from the original inner loop.
1840 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1841
1842 // Add BBs from the original outer loop to the original inner loop (excluding
1843 // BBs already in inner loop)
1844 for (BasicBlock *BB : NewInner->blocks())
1845 if (LI->getLoopFor(BB) == NewInner)
1846 NewOuter->addBlockEntry(BB);
1847
1848 // Now remove inner loop header and latch from the new inner loop and move
1849 // other BBs (the loop body) to the new inner loop.
1850 BasicBlock *OuterHeader = NewOuter->getHeader();
1851 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1852 for (BasicBlock *BB : OrigInnerBBs) {
1853 // Nothing will change for BBs in child loops.
1854 if (LI->getLoopFor(BB) != NewOuter)
1855 continue;
1856 // Remove the new outer loop header and latch from the new inner loop.
1857 if (BB == OuterHeader || BB == OuterLatch)
1858 NewInner->removeBlockFromLoop(BB);
1859 else
1860 LI->changeLoopFor(BB, NewInner);
1861 }
1862
1863 // The preheader of the original outer loop becomes part of the new
1864 // outer loop.
1865 NewOuter->addBlockEntry(OrigOuterPreHeader);
1866 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1867
1868 // Tell SE that we move the loops around.
1869 SE->forgetLoop(NewOuter);
1870}
1871
1872/// User can write, or optimizers can generate the reduction for inner loop.
1873/// To make the interchange valid, apply Reduction2Mem by moving the
1874/// initializer and store instructions into the inner loop. So far we only
1875/// handle cases where the reduction variable is initialized to a constant.
1876/// For example, below code:
1877///
1878/// loop:
1879/// re = phi<0.0, next>
1880/// next = re op ...
1881/// endloop
1882/// reduc_sum = phi<next> // lcssa phi
1883/// MEM_REF[idx] = reduc_sum // LcssaStore
1884///
1885/// is transformed into:
1886///
1887/// loop:
1888/// tmp = MEM_REF[idx];
1889/// new_var = !first_iteration ? tmp : 0.0;
1890/// next = new_var op ...
1891/// MEM_REF[idx] = next; // after moving
1892/// endloop
1893///
1894/// In this way the initial const is used in the first iteration of loop.
1895void LoopInterchangeTransform::reduction2Memory() {
1897 LIL.getInnerReductions();
1898
1899 assert(InnerReductions.size() == 1 &&
1900 "So far we only support at most one reduction.");
1901
1902 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
1903 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1904 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1905
1906 // Check if it's the first iteration.
1907 LLVMContext &Context = InnerLoopHeader->getContext();
1908 PHINode *FirstIter =
1909 Builder.CreatePHI(Type::getInt1Ty(Context), 2, "first.iter");
1910 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 1),
1911 InnerLoop->getLoopPreheader());
1912 FirstIter->addIncoming(ConstantInt::get(Type::getInt1Ty(Context), 0),
1913 InnerLoop->getLoopLatch());
1914 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1915
1916 // When the reduction is initialized from a constant value, we need to add
1917 // a stmt loading from the memory object to target basic block in inner
1918 // loop.
1919 Instruction *LoadMem = Builder.CreateLoad(SR.ElemTy, SR.MemRef);
1920
1921 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
1922 Value *NewVar = Builder.CreateSelect(FirstIter, SR.Init, LoadMem, "new.var");
1923
1924 // Replace all uses of the reduction variable with a new variable.
1925 SR.Reduction->replaceAllUsesWith(NewVar);
1926
1927 // Move store instruction into inner loop, just after reduction next's
1928 // definition.
1929 SR.LcssaStore->setOperand(0, SR.Next);
1930 SR.LcssaStore->moveAfter(dyn_cast<Instruction>(SR.Next));
1931}
1932
1933bool LoopInterchangeTransform::transform(
1934 ArrayRef<Instruction *> DropNoWrapInsts) {
1935 bool Transformed = false;
1936
1938 LIL.getInnerReductions();
1939 if (InnerReductions.size() == 1)
1940 reduction2Memory();
1941
1942 if (InnerLoop->getSubLoops().empty()) {
1943 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1944 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1945 auto &InductionPHIs = LIL.getInnerLoopInductions();
1946 if (InductionPHIs.empty()) {
1947 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1948 return false;
1949 }
1950
1951 SmallVector<Instruction *, 8> InnerIndexVarList;
1952 for (PHINode *CurInductionPHI : InductionPHIs) {
1953 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1954 InnerIndexVarList.push_back(
1955 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1956 else
1957 InnerIndexVarList.push_back(
1958 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1959 }
1960
1961 // Create a new latch block for the inner loop. We split at the
1962 // current latch's terminator and then move the condition and all
1963 // operands that are not either loop-invariant or the induction PHI into the
1964 // new latch block.
1965 BasicBlock *NewLatch =
1966 SplitBlock(InnerLoop->getLoopLatch(),
1967 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1968
1969 SmallSetVector<Instruction *, 4> WorkList;
1970 unsigned i = 0;
1971 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1972 for (; i < WorkList.size(); i++) {
1973 // Duplicate instruction and move it the new latch. Update uses that
1974 // have been moved.
1975 Instruction *NewI = WorkList[i]->clone();
1976 NewI->insertBefore(NewLatch->getFirstNonPHIIt());
1977 assert(!NewI->mayHaveSideEffects() &&
1978 "Moving instructions with side-effects may change behavior of "
1979 "the loop nest!");
1980 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1981 Instruction *UserI = cast<Instruction>(U.getUser());
1982 if (!InnerLoop->contains(UserI->getParent()) ||
1983 UserI->getParent() == NewLatch ||
1984 llvm::is_contained(InductionPHIs, UserI))
1985 U.set(NewI);
1986 }
1987 // Add operands of moved instruction to the worklist, except if they are
1988 // outside the inner loop or are the induction PHI.
1989 for (Value *Op : WorkList[i]->operands()) {
1991 if (!OpI ||
1992 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1993 llvm::is_contained(InductionPHIs, OpI))
1994 continue;
1995 WorkList.insert(OpI);
1996 }
1997 }
1998 };
1999
2000 // FIXME: Should we interchange when we have a constant condition?
2003 ->getCondition());
2004 if (CondI)
2005 WorkList.insert(CondI);
2006 MoveInstructions();
2007 for (Instruction *InnerIndexVar : InnerIndexVarList)
2008 WorkList.insert(cast<Instruction>(InnerIndexVar));
2009 MoveInstructions();
2010 }
2011
2012 // Ensure the inner loop phi nodes have a separate basic block.
2013 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2014 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2015 InnerLoopHeader->getTerminator()) {
2016 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2017 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2018 }
2019
2020 // Instructions in the original inner loop preheader may depend on values
2021 // defined in the outer loop header. Move them there, because the original
2022 // inner loop preheader will become the entry into the interchanged loop nest.
2023 // Currently we move all instructions and rely on LICM to move invariant
2024 // instructions outside the loop nest.
2025 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2026 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2027 if (InnerLoopPreHeader != OuterLoopHeader) {
2028 for (Instruction &I :
2029 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
2030 std::prev(InnerLoopPreHeader->end()))))
2031 I.moveBeforePreserving(OuterLoopHeader->getTerminator()->getIterator());
2032 }
2033
2034 Transformed |= adjustLoopLinks();
2035 if (!Transformed) {
2036 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2037 return false;
2038 }
2039
2040 // Finally, drop the nsw/nuw flags from the instructions for reduction
2041 // calculations.
2042 for (Instruction *Reduction : DropNoWrapInsts) {
2043 Reduction->setHasNoSignedWrap(false);
2044 Reduction->setHasNoUnsignedWrap(false);
2045 }
2046
2047 return true;
2048}
2049
2050/// \brief Move all instructions except the terminator from FromBB right before
2051/// InsertBefore
2052static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2053 BasicBlock *ToBB = InsertBefore->getParent();
2054
2055 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
2056 FromBB->getTerminator()->getIterator());
2057}
2058
2059/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2060static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2061 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2062 // from BB1 afterwards.
2063 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
2064 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
2065 for (Instruction *I : TempInstrs)
2066 I->removeFromParent();
2067
2068 // Move instructions from BB2 to BB1.
2069 moveBBContents(BB2, BB1->getTerminator());
2070
2071 // Move instructions from TempInstrs to BB2.
2072 for (Instruction *I : TempInstrs)
2073 I->insertBefore(BB2->getTerminator()->getIterator());
2074}
2075
2076// Update BI to jump to NewBB instead of OldBB. Records updates to the
2077// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2078// \p OldBB is exactly once in BI's successor list.
2079static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
2080 BasicBlock *NewBB,
2081 std::vector<DominatorTree::UpdateType> &DTUpdates,
2082 bool MustUpdateOnce = true) {
2083 assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) &&
2084 "BI must jump to OldBB exactly once.");
2085 bool Changed = false;
2086 for (Use &Op : BI->operands())
2087 if (Op == OldBB) {
2088 Op.set(NewBB);
2089 Changed = true;
2090 }
2091
2092 if (Changed) {
2093 DTUpdates.push_back(
2094 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
2095 DTUpdates.push_back(
2096 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
2097 }
2098 assert(Changed && "Expected a successor to be updated");
2099}
2100
2101// Move Lcssa PHIs to the right place.
2102static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2103 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2104 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2105 Loop *InnerLoop, LoopInfo *LI) {
2106
2107 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2108 // defined either in the header or latch. Those blocks will become header and
2109 // latch of the new outer loop, and the only possible users can PHI nodes
2110 // in the exit block of the loop nest or the outer loop header (reduction
2111 // PHIs, in that case, the incoming value must be defined in the inner loop
2112 // header). We can just substitute the user with the incoming value and remove
2113 // the PHI.
2114 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
2115 assert(P.getNumIncomingValues() == 1 &&
2116 "Only loops with a single exit are supported!");
2117
2118 // Incoming values are guaranteed be instructions currently.
2119 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
2120 // In case of multi-level nested loops, follow LCSSA to find the incoming
2121 // value defined from the innermost loop.
2122 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
2123 // Skip phis with incoming values from the inner loop body, excluding the
2124 // header and latch.
2125 if (IncIInnerMost->getParent() != InnerLatch &&
2126 IncIInnerMost->getParent() != InnerHeader)
2127 continue;
2128
2129 assert(all_of(P.users(),
2130 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2131 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2132 IncI->getParent() == InnerHeader) ||
2133 cast<PHINode>(U)->getParent() == OuterExit;
2134 }) &&
2135 "Can only replace phis iff the uses are in the loop nest exit or "
2136 "the incoming value is defined in the inner header (it will "
2137 "dominate all loop blocks after interchanging)");
2138 P.replaceAllUsesWith(IncI);
2139 P.eraseFromParent();
2140 }
2141
2142 SmallVector<PHINode *, 8> LcssaInnerExit(
2143 llvm::make_pointer_range(InnerExit->phis()));
2144
2145 SmallVector<PHINode *, 8> LcssaInnerLatch(
2146 llvm::make_pointer_range(InnerLatch->phis()));
2147
2148 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2149 // If a PHI node has users outside of InnerExit, it has a use outside the
2150 // interchanged loop and we have to preserve it. We move these to
2151 // InnerLatch, which will become the new exit block for the innermost
2152 // loop after interchanging.
2153 for (PHINode *P : LcssaInnerExit)
2154 P->moveBefore(InnerLatch->getFirstNonPHIIt());
2155
2156 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2157 // and we have to move them to the new inner latch.
2158 for (PHINode *P : LcssaInnerLatch)
2159 P->moveBefore(InnerExit->getFirstNonPHIIt());
2160
2161 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2162 // incoming values defined in the outer loop, we have to add a new PHI
2163 // in the inner loop latch, which became the exit block of the outer loop,
2164 // after interchanging.
2165 if (OuterExit) {
2166 for (PHINode &P : OuterExit->phis()) {
2167 if (P.getNumIncomingValues() != 1)
2168 continue;
2169 // Skip Phis with incoming values defined in the inner loop. Those should
2170 // already have been updated.
2171 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
2172 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
2173 continue;
2174
2175 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
2176 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
2177 NewPhi->setIncomingBlock(0, OuterLatch);
2178 // We might have incoming edges from other BBs, i.e., the original outer
2179 // header.
2180 for (auto *Pred : predecessors(InnerLatch)) {
2181 if (Pred == OuterLatch)
2182 continue;
2183 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
2184 }
2185 NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt());
2186 P.setIncomingValue(0, NewPhi);
2187 }
2188 }
2189
2190 // Now adjust the incoming blocks for the LCSSA PHIs.
2191 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2192 // with the new latch.
2193 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
2194}
2195
2196/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2197/// block: the outer loop latch block does not need to be exit block of the
2198/// inner loop. Consider a loop that was in LCSSA form, but then some
2199/// transformation like loop-unswitch comes along and creates an empty block,
2200/// where BB5 in this example is the outer loop latch block:
2201///
2202/// BB4:
2203/// br label %BB5
2204/// BB5:
2205/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2206/// br outer.header
2207///
2208/// Interchange then brings it in LCSSA form again resulting in this chain of
2209/// single-input phi nodes:
2210///
2211/// BB4:
2212/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2213/// br label %BB5
2214/// BB5:
2215/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2216///
2217/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2218/// use before the def if we don't check this. The solution is to simplify
2219/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2220///
2221static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2222 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2223 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2224
2225 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2226 if (OuterLoopLatch == InnerLoopExit)
2227 return;
2228
2229 // Collect and remove phis in non-exit blocks if they have 1 input.
2231 llvm::make_pointer_range(OuterLoopLatch->phis()));
2232 for (PHINode *Phi : Phis) {
2233 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2234 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2235 << "\n");
2236 Phi->replaceAllUsesWith(Phi->getIncomingValue(0));
2237 Phi->eraseFromParent();
2238 }
2239}
2240
2241bool LoopInterchangeTransform::adjustLoopBranches() {
2242 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2243 std::vector<DominatorTree::UpdateType> DTUpdates;
2244
2245 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2246 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2247
2248 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2249 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2250 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2251
2252 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2253
2254 // Ensure that both preheaders do not contain PHI nodes and have single
2255 // predecessors. This allows us to move them easily. We use
2256 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2257 // preheaders do not satisfy those conditions.
2258 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
2259 !OuterLoopPreHeader->getUniquePredecessor())
2260 OuterLoopPreHeader =
2261 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
2262 if (InnerLoopPreHeader == OuterLoop->getHeader())
2263 InnerLoopPreHeader =
2264 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
2265
2266 // Adjust the loop preheader
2267 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2268 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2269 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2270 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2271 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2272 BasicBlock *InnerLoopLatchPredecessor =
2273 InnerLoopLatch->getUniquePredecessor();
2274 BasicBlock *InnerLoopLatchSuccessor;
2275 BasicBlock *OuterLoopLatchSuccessor;
2276
2277 BranchInst *OuterLoopLatchBI =
2278 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
2279 BranchInst *InnerLoopLatchBI =
2280 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
2281 BranchInst *OuterLoopHeaderBI =
2282 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
2283 BranchInst *InnerLoopHeaderBI =
2284 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
2285
2286 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2287 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2288 !InnerLoopHeaderBI)
2289 return false;
2290
2291 BranchInst *InnerLoopLatchPredecessorBI =
2292 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
2293 BranchInst *OuterLoopPredecessorBI =
2294 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
2295
2296 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2297 return false;
2298 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2299 if (!InnerLoopHeaderSuccessor)
2300 return false;
2301
2302 // Adjust Loop Preheader and headers.
2303 // The branches in the outer loop predecessor and the outer loop header can
2304 // be unconditional branches or conditional branches with duplicates. Consider
2305 // this when updating the successors.
2306 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
2307 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2308 // The outer loop header might or might not branch to the outer latch.
2309 // We are guaranteed to branch to the inner loop preheader.
2310 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
2311 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2312 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
2313 DTUpdates,
2314 /*MustUpdateOnce=*/false);
2315 }
2316 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
2317 InnerLoopHeaderSuccessor, DTUpdates,
2318 /*MustUpdateOnce=*/false);
2319
2320 // Adjust reduction PHI's now that the incoming block has changed.
2321 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
2322 OuterLoopHeader);
2323
2324 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
2325 OuterLoopPreHeader, DTUpdates);
2326
2327 // -------------Adjust loop latches-----------
2328 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
2329 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
2330 else
2331 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
2332
2333 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
2334 InnerLoopLatchSuccessor, DTUpdates);
2335
2336 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
2337 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
2338 else
2339 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
2340
2341 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
2342 OuterLoopLatchSuccessor, DTUpdates);
2343 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
2344 DTUpdates);
2345
2346 DT->applyUpdates(DTUpdates);
2347 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
2348 OuterLoopPreHeader);
2349
2350 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
2351 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
2352 InnerLoop, LI);
2353 // For PHIs in the exit block of the outer loop, outer's latch has been
2354 // replaced by Inners'.
2355 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2356
2357 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2358 // Now update the reduction PHIs in the inner and outer loop headers.
2359 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2360 for (PHINode &PHI : InnerLoopHeader->phis())
2361 if (OuterInnerReductions.contains(&PHI))
2362 InnerLoopPHIs.push_back(&PHI);
2363
2364 for (PHINode &PHI : OuterLoopHeader->phis())
2365 if (OuterInnerReductions.contains(&PHI))
2366 OuterLoopPHIs.push_back(&PHI);
2367
2368 // Now move the remaining reduction PHIs from outer to inner loop header and
2369 // vice versa. The PHI nodes must be part of a reduction across the inner and
2370 // outer loop and all the remains to do is and updating the incoming blocks.
2371 for (PHINode *PHI : OuterLoopPHIs) {
2372 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2373 PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt());
2374 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2375 }
2376 for (PHINode *PHI : InnerLoopPHIs) {
2377 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2378 PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt());
2379 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2380 }
2381
2382 // Update the incoming blocks for moved PHI nodes.
2383 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
2384 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
2385 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
2386 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
2387
2388 // Values defined in the outer loop header could be used in the inner loop
2389 // latch. In that case, we need to create LCSSA phis for them, because after
2390 // interchanging they will be defined in the new inner loop and used in the
2391 // new outer loop.
2392 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2393 for (Instruction &I :
2394 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
2395 MayNeedLCSSAPhis.push_back(&I);
2396 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
2397
2398 return true;
2399}
2400
2401bool LoopInterchangeTransform::adjustLoopLinks() {
2402 // Adjust all branches in the inner and outer loop.
2403 bool Changed = adjustLoopBranches();
2404 if (Changed) {
2405 // We have interchanged the preheaders so we need to interchange the data in
2406 // the preheaders as well. This is because the content of the inner
2407 // preheader was previously executed inside the outer loop.
2408 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2409 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2410 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
2411 }
2412 return Changed;
2413}
2414
2418 LPMUpdater &U) {
2419 Function &F = *LN.getParent();
2420 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2421
2422 if (MaxMemInstrCount < 1) {
2423 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2424 return PreservedAnalyses::all();
2425 }
2427
2428 // Ensure minimum depth of the loop nest to do the interchange.
2429 if (!hasSupportedLoopDepth(LoopList, ORE))
2430 return PreservedAnalyses::all();
2431 // Ensure computable loop nest.
2432 if (!isComputableLoopNest(&AR.SE, LoopList)) {
2433 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2434 return PreservedAnalyses::all();
2435 }
2436
2437 ORE.emit([&]() {
2438 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2441 << "Computed dependence info, invoking the transform.";
2442 });
2443
2444 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2445 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2446 return PreservedAnalyses::all();
2447 U.markLoopNestChanged(true);
2449}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file defines the StringMap class.
Rewrite undef for PHI
ReachingDefInfo InstSet InstSet & Ignore
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define DEBUG_TYPE
Hexagon Common GEP
static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB)
Move the contents of SourceBB to before the last instruction of TargetBB.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the interface for the loop cache analysis.
SmallVector< Loop *, 4 > LoopVector
Definition LoopFuse.cpp:382
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:231
static cl::opt< unsigned int > MaxMemInstrCount("loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, cl::desc("Maximum number of load-store instructions that should be handled " "in the dependency matrix. Higher value may lead to more interchanges " "at the cost of compile-time"))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V, SmallVectorImpl< Instruction * > &HasNoWrapInsts)
static cl::opt< unsigned int > MinLoopNestDepth("loop-interchange-min-loop-nest-depth", cl::init(2), cl::Hidden, cl::desc("Minimum depth of loop nest considered for the transform"))
static cl::opt< bool > EnableReduction2Memory("loop-interchange-reduction-to-mem", cl::init(false), cl::Hidden, cl::desc("Support for the inner-loop reduction pattern."))
static bool isComputableLoopNest(ScalarEvolution *SE, ArrayRef< Loop * > LoopList)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop)
This deals with a corner case when a LCSSA phi node appears in a non-exit block: the outer loop latch...
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static void printDepMatrix(CharMatrix &DepMatrix)
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions, PHINode *LcssaReduction)
static cl::opt< unsigned int > MaxLoopNestDepth("loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform"))
static bool hasSupportedLoopDepth(ArrayRef< Loop * > LoopList, OptimizationRemarkEmitter &ORE)
static bool inThisOrder(const Instruction *Src, const Instruction *Dst)
Return true if Src appears before Dst in the same basic block.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId)
Return true if we can vectorize the loop specified by LoopId.
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool checkReductionKind(Loop *L, PHINode *PHI, SmallVectorImpl< Instruction * > &HasNoWrapInsts)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static std::optional< bool > isLexicographicallyPositive(ArrayRef< char > DV, unsigned Begin, unsigned End)
static cl::list< RuleTy > Profitabilities("loop-interchange-profitabilities", cl::ZeroOrMore, cl::MiscFlags::CommaSeparated, cl::Hidden, cl::desc("List of profitability heuristics to be used. They are applied in " "the given order"), cl::list_init< RuleTy >({RuleTy::PerLoopCacheAnalysis, RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}), cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", "Prioritize loop cache cost"), clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", "Prioritize vectorization"), clEnumValN(RuleTy::Ignore, "ignore", "Ignore profitability, force interchange (does not " "work with other options)")))
static bool noDuplicateRulesAndIgnore(ArrayRef< RuleTy > Rules)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
loop Loop Strength Reduction
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t IntrinsicInst * II
#define P(N)
This file contains some templates that are useful if you are working with the STL at all.
SmallVector< Value *, 8 > ValueVector
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool isProfitable(const StableFunctionMap::StableFunctionEntries &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:186
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:470
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:539
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:673
Conditional or Unconditional Branch instruction.
iterator_range< succ_op_iterator > successors()
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
CacheCostTy getLoopCost(const Loop &L) const
Return the estimated cost of loop L if the given loop is part of the loop nest associated with this o...
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
DependenceInfo - This class is the main dependence-analysis driver.
LLVM_ABI std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool UnderRuntimeAssumptions=false)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static LLVM_ABI bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
LLVM_ABI void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition LoopInfo.cpp:61
StringRef getName() const
Definition LoopInfo.h:389
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
bool isComplete() const
If the PHI node is complete which means all of its parent's predecessors have incoming value in this ...
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static LLVM_ABI bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
LLVM_ABI SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
RecurKind getRecurrenceKind() const
This class represents an analyzed expression in the program.
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
std::pair< iterator, bool > try_emplace(StringRef Key, ArgsTy &&...Args)
Emplace a new element for the specified key into the map if the key isn't already in the map.
Definition StringMap.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
list_initializer< Ty > list_init(ArrayRef< Ty > Vals)
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2544
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2163
auto map_range(ContainerTy &&C, FuncTy F)
Definition STLExtras.h:364
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2016
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
RecurKind
These are the kinds of recurrences that we support.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FMinimumNum
FP min with llvm.minimumnum semantics.
@ Or
Bitwise or logical OR of integers.
@ FMinimum
FP min with llvm.minimum semantics.
@ Mul
Product of integers.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMaximum
FP max with llvm.maximum semantics.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ FMaximumNum
FP max with llvm.maximumnum semantics.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:368
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...