LLVM 19.0.0git
LoopUnrollPass.cpp
Go to the documentation of this file.
1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller. It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
11// counts of loops easily.
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/StringRef.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/CFG.h"
36#include "llvm/IR/Constant.h"
37#include "llvm/IR/Constants.h"
39#include "llvm/IR/Dominators.h"
40#include "llvm/IR/Function.h"
41#include "llvm/IR/Instruction.h"
44#include "llvm/IR/Metadata.h"
45#include "llvm/IR/PassManager.h"
47#include "llvm/Pass.h"
50#include "llvm/Support/Debug.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <limits>
65#include <optional>
66#include <string>
67#include <tuple>
68#include <utility>
69
70using namespace llvm;
71
72#define DEBUG_TYPE "loop-unroll"
73
75 "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
76 cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
77 " the current top-most loop. This is sometimes preferred to reduce"
78 " compile time."));
79
81 UnrollThreshold("unroll-threshold", cl::Hidden,
82 cl::desc("The cost threshold for loop unrolling"));
83
86 "unroll-optsize-threshold", cl::init(0), cl::Hidden,
87 cl::desc("The cost threshold for loop unrolling when optimizing for "
88 "size"));
89
91 "unroll-partial-threshold", cl::Hidden,
92 cl::desc("The cost threshold for partial loop unrolling"));
93
95 "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
96 cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
97 "to the threshold when aggressively unrolling a loop due to the "
98 "dynamic cost savings. If completely unrolling a loop will reduce "
99 "the total runtime from X to Y, we boost the loop unroll "
100 "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
101 "X/Y). This limit avoids excessive code bloat."));
102
104 "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
105 cl::desc("Don't allow loop unrolling to simulate more than this number of"
106 "iterations when checking full unroll profitability"));
107
109 "unroll-count", cl::Hidden,
110 cl::desc("Use this unroll count for all loops including those with "
111 "unroll_count pragma values, for testing purposes"));
112
114 "unroll-max-count", cl::Hidden,
115 cl::desc("Set the max unroll count for partial and runtime unrolling, for"
116 "testing purposes"));
117
119 "unroll-full-max-count", cl::Hidden,
120 cl::desc(
121 "Set the max unroll count for full unrolling, for testing purposes"));
122
123static cl::opt<bool>
124 UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
125 cl::desc("Allows loops to be partially unrolled until "
126 "-unroll-threshold loop size is reached."));
127
129 "unroll-allow-remainder", cl::Hidden,
130 cl::desc("Allow generation of a loop remainder (extra iterations) "
131 "when unrolling a loop."));
132
133static cl::opt<bool>
134 UnrollRuntime("unroll-runtime", cl::Hidden,
135 cl::desc("Unroll loops with run-time trip counts"));
136
138 "unroll-max-upperbound", cl::init(8), cl::Hidden,
139 cl::desc(
140 "The max of trip count upper bound that is considered in unrolling"));
141
143 "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
144 cl::desc("Unrolled size limit for loops with an unroll(full) or "
145 "unroll_count pragma."));
146
148 "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
149 cl::desc("If the runtime tripcount for the loop is lower than the "
150 "threshold, the loop is considered as flat and will be less "
151 "aggressively unrolled."));
152
154 "unroll-remainder", cl::Hidden,
155 cl::desc("Allow the loop remainder to be unrolled."));
156
157// This option isn't ever intended to be enabled, it serves to allow
158// experiments to check the assumptions about when this kind of revisit is
159// necessary.
161 "unroll-revisit-child-loops", cl::Hidden,
162 cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
163 "This shouldn't typically be needed as child loops (or their "
164 "clones) were already visited."));
165
167 "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
168 cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
169 "optimizations"));
171 UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
173 cl::desc("Default threshold (max size of unrolled "
174 "loop), used in all but O3 optimizations"));
175
177 "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
178 cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
179
180/// A magic value for use with the Threshold parameter to indicate
181/// that the loop unroll should be performed regardless of how much
182/// code expansion would result.
183static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
184
185/// Gather the various unrolling parameters based on the defaults, compiler
186/// flags, TTI overrides and user specified parameters.
190 OptimizationRemarkEmitter &ORE, int OptLevel,
191 std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
192 std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
193 std::optional<bool> UserUpperBound,
194 std::optional<unsigned> UserFullUnrollMaxCount) {
196
197 // Set up the defaults
198 UP.Threshold =
202 UP.PartialThreshold = 150;
204 UP.Count = 0;
206 UP.MaxCount = std::numeric_limits<unsigned>::max();
208 UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
209 UP.BEInsns = 2;
210 UP.Partial = false;
211 UP.Runtime = false;
212 UP.AllowRemainder = true;
213 UP.UnrollRemainder = false;
214 UP.AllowExpensiveTripCount = false;
215 UP.Force = false;
216 UP.UpperBound = false;
217 UP.UnrollAndJam = false;
220
221 // Override with any target specific settings
222 TTI.getUnrollingPreferences(L, SE, UP, &ORE);
223
224 // Apply size attributes
225 bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
226 // Let unroll hints / pragmas take precedence over PGSO.
228 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
229 PGSOQueryType::IRPass));
230 if (OptForSize) {
234 }
235
236 // Apply any user values specified by cl::opt
237 if (UnrollThreshold.getNumOccurrences() > 0)
239 if (UnrollPartialThreshold.getNumOccurrences() > 0)
241 if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
243 if (UnrollMaxCount.getNumOccurrences() > 0)
245 if (UnrollMaxUpperBound.getNumOccurrences() > 0)
247 if (UnrollFullMaxCount.getNumOccurrences() > 0)
249 if (UnrollAllowPartial.getNumOccurrences() > 0)
251 if (UnrollAllowRemainder.getNumOccurrences() > 0)
253 if (UnrollRuntime.getNumOccurrences() > 0)
256 UP.UpperBound = false;
257 if (UnrollUnrollRemainder.getNumOccurrences() > 0)
259 if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
261
262 // Apply user values provided by argument
263 if (UserThreshold) {
264 UP.Threshold = *UserThreshold;
265 UP.PartialThreshold = *UserThreshold;
266 }
267 if (UserCount)
268 UP.Count = *UserCount;
269 if (UserAllowPartial)
270 UP.Partial = *UserAllowPartial;
271 if (UserRuntime)
272 UP.Runtime = *UserRuntime;
273 if (UserUpperBound)
274 UP.UpperBound = *UserUpperBound;
275 if (UserFullUnrollMaxCount)
276 UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
277
278 return UP;
279}
280
281namespace {
282
283/// A struct to densely store the state of an instruction after unrolling at
284/// each iteration.
285///
286/// This is designed to work like a tuple of <Instruction *, int> for the
287/// purposes of hashing and lookup, but to be able to associate two boolean
288/// states with each key.
289struct UnrolledInstState {
290 Instruction *I;
291 int Iteration : 30;
292 unsigned IsFree : 1;
293 unsigned IsCounted : 1;
294};
295
296/// Hashing and equality testing for a set of the instruction states.
297struct UnrolledInstStateKeyInfo {
298 using PtrInfo = DenseMapInfo<Instruction *>;
300
301 static inline UnrolledInstState getEmptyKey() {
302 return {PtrInfo::getEmptyKey(), 0, 0, 0};
303 }
304
305 static inline UnrolledInstState getTombstoneKey() {
306 return {PtrInfo::getTombstoneKey(), 0, 0, 0};
307 }
308
309 static inline unsigned getHashValue(const UnrolledInstState &S) {
310 return PairInfo::getHashValue({S.I, S.Iteration});
311 }
312
313 static inline bool isEqual(const UnrolledInstState &LHS,
314 const UnrolledInstState &RHS) {
315 return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
316 }
317};
318
319struct EstimatedUnrollCost {
320 /// The estimated cost after unrolling.
321 unsigned UnrolledCost;
322
323 /// The estimated dynamic cost of executing the instructions in the
324 /// rolled form.
325 unsigned RolledDynamicCost;
326};
327
328struct PragmaInfo {
329 PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
330 : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
331 PragmaEnableUnroll(PEU) {}
332 const bool UserUnrollCount;
333 const bool PragmaFullUnroll;
334 const unsigned PragmaCount;
335 const bool PragmaEnableUnroll;
336};
337
338} // end anonymous namespace
339
340/// Figure out if the loop is worth full unrolling.
341///
342/// Complete loop unrolling can make some loads constant, and we need to know
343/// if that would expose any further optimization opportunities. This routine
344/// estimates this optimization. It computes cost of unrolled loop
345/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
346/// dynamic cost we mean that we won't count costs of blocks that are known not
347/// to be executed (i.e. if we have a branch in the loop and we know that at the
348/// given iteration its condition would be resolved to true, we won't add up the
349/// cost of the 'false'-block).
350/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
351/// the analysis failed (no benefits expected from the unrolling, or the loop is
352/// too big to analyze), the returned value is std::nullopt.
353static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
354 const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
355 const SmallPtrSetImpl<const Value *> &EphValues,
356 const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
357 unsigned MaxIterationsCountToAnalyze) {
358 // We want to be able to scale offsets by the trip count and add more offsets
359 // to them without checking for overflows, and we already don't want to
360 // analyze *massive* trip counts, so we force the max to be reasonably small.
361 assert(MaxIterationsCountToAnalyze <
362 (unsigned)(std::numeric_limits<int>::max() / 2) &&
363 "The unroll iterations max is too large!");
364
365 // Only analyze inner loops. We can't properly estimate cost of nested loops
366 // and we won't visit inner loops again anyway.
367 if (!L->isInnermost())
368 return std::nullopt;
369
370 // Don't simulate loops with a big or unknown tripcount
371 if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
372 return std::nullopt;
373
376 DenseMap<Value *, Value *> SimplifiedValues;
377 SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;
378
379 // The estimated cost of the unrolled form of the loop. We try to estimate
380 // this by simplifying as much as we can while computing the estimate.
381 InstructionCost UnrolledCost = 0;
382
383 // We also track the estimated dynamic (that is, actually executed) cost in
384 // the rolled form. This helps identify cases when the savings from unrolling
385 // aren't just exposing dead control flows, but actual reduced dynamic
386 // instructions due to the simplifications which we expect to occur after
387 // unrolling.
388 InstructionCost RolledDynamicCost = 0;
389
390 // We track the simplification of each instruction in each iteration. We use
391 // this to recursively merge costs into the unrolled cost on-demand so that
392 // we don't count the cost of any dead code. This is essentially a map from
393 // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
395
396 // A small worklist used to accumulate cost of instructions from each
397 // observable and reached root in the loop.
399
400 // PHI-used worklist used between iterations while accumulating cost.
402
403 // Helper function to accumulate cost for instructions in the loop.
404 auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
405 assert(Iteration >= 0 && "Cannot have a negative iteration!");
406 assert(CostWorklist.empty() && "Must start with an empty cost list");
407 assert(PHIUsedList.empty() && "Must start with an empty phi used list");
408 CostWorklist.push_back(&RootI);
410 RootI.getFunction()->hasMinSize() ?
413 for (;; --Iteration) {
414 do {
415 Instruction *I = CostWorklist.pop_back_val();
416
417 // InstCostMap only uses I and Iteration as a key, the other two values
418 // don't matter here.
419 auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
420 if (CostIter == InstCostMap.end())
421 // If an input to a PHI node comes from a dead path through the loop
422 // we may have no cost data for it here. What that actually means is
423 // that it is free.
424 continue;
425 auto &Cost = *CostIter;
426 if (Cost.IsCounted)
427 // Already counted this instruction.
428 continue;
429
430 // Mark that we are counting the cost of this instruction now.
431 Cost.IsCounted = true;
432
433 // If this is a PHI node in the loop header, just add it to the PHI set.
434 if (auto *PhiI = dyn_cast<PHINode>(I))
435 if (PhiI->getParent() == L->getHeader()) {
436 assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
437 "inherently simplify during unrolling.");
438 if (Iteration == 0)
439 continue;
440
441 // Push the incoming value from the backedge into the PHI used list
442 // if it is an in-loop instruction. We'll use this to populate the
443 // cost worklist for the next iteration (as we count backwards).
444 if (auto *OpI = dyn_cast<Instruction>(
445 PhiI->getIncomingValueForBlock(L->getLoopLatch())))
446 if (L->contains(OpI))
447 PHIUsedList.push_back(OpI);
448 continue;
449 }
450
451 // First accumulate the cost of this instruction.
452 if (!Cost.IsFree) {
453 // Consider simplified operands in instruction cost.
455 transform(I->operands(), std::back_inserter(Operands),
456 [&](Value *Op) {
457 if (auto Res = SimplifiedValues.lookup(Op))
458 return Res;
459 return Op;
460 });
461 UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind);
462 LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
463 << Iteration << "): ");
464 LLVM_DEBUG(I->dump());
465 }
466
467 // We must count the cost of every operand which is not free,
468 // recursively. If we reach a loop PHI node, simply add it to the set
469 // to be considered on the next iteration (backwards!).
470 for (Value *Op : I->operands()) {
471 // Check whether this operand is free due to being a constant or
472 // outside the loop.
473 auto *OpI = dyn_cast<Instruction>(Op);
474 if (!OpI || !L->contains(OpI))
475 continue;
476
477 // Otherwise accumulate its cost.
478 CostWorklist.push_back(OpI);
479 }
480 } while (!CostWorklist.empty());
481
482 if (PHIUsedList.empty())
483 // We've exhausted the search.
484 break;
485
486 assert(Iteration > 0 &&
487 "Cannot track PHI-used values past the first iteration!");
488 CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
489 PHIUsedList.clear();
490 }
491 };
492
493 // Ensure that we don't violate the loop structure invariants relied on by
494 // this analysis.
495 assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
496 assert(L->isLCSSAForm(DT) &&
497 "Must have loops in LCSSA form to track live-out values.");
498
499 LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
500
502 L->getHeader()->getParent()->hasMinSize() ?
504 // Simulate execution of each iteration of the loop counting instructions,
505 // which would be simplified.
506 // Since the same load will take different values on different iterations,
507 // we literally have to go through all loop's iterations.
508 for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
509 LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
510
511 // Prepare for the iteration by collecting any simplified entry or backedge
512 // inputs.
513 for (Instruction &I : *L->getHeader()) {
514 auto *PHI = dyn_cast<PHINode>(&I);
515 if (!PHI)
516 break;
517
518 // The loop header PHI nodes must have exactly two input: one from the
519 // loop preheader and one from the loop latch.
520 assert(
521 PHI->getNumIncomingValues() == 2 &&
522 "Must have an incoming value only for the preheader and the latch.");
523
524 Value *V = PHI->getIncomingValueForBlock(
525 Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
526 if (Iteration != 0 && SimplifiedValues.count(V))
527 V = SimplifiedValues.lookup(V);
528 SimplifiedInputValues.push_back({PHI, V});
529 }
530
531 // Now clear and re-populate the map for the next iteration.
532 SimplifiedValues.clear();
533 while (!SimplifiedInputValues.empty())
534 SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
535
536 UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
537
538 BBWorklist.clear();
539 BBWorklist.insert(L->getHeader());
540 // Note that we *must not* cache the size, this loop grows the worklist.
541 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
542 BasicBlock *BB = BBWorklist[Idx];
543
544 // Visit all instructions in the given basic block and try to simplify
545 // it. We don't change the actual IR, just count optimization
546 // opportunities.
547 for (Instruction &I : *BB) {
548 // These won't get into the final code - don't even try calculating the
549 // cost for them.
550 if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
551 continue;
552
553 // Track this instruction's expected baseline cost when executing the
554 // rolled loop form.
555 RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);
556
557 // Visit the instruction to analyze its loop cost after unrolling,
558 // and if the visitor returns true, mark the instruction as free after
559 // unrolling and continue.
560 bool IsFree = Analyzer.visit(I);
561 bool Inserted = InstCostMap.insert({&I, (int)Iteration,
562 (unsigned)IsFree,
563 /*IsCounted*/ false}).second;
564 (void)Inserted;
565 assert(Inserted && "Cannot have a state for an unvisited instruction!");
566
567 if (IsFree)
568 continue;
569
570 // Can't properly model a cost of a call.
571 // FIXME: With a proper cost model we should be able to do it.
572 if (auto *CI = dyn_cast<CallInst>(&I)) {
573 const Function *Callee = CI->getCalledFunction();
574 if (!Callee || TTI.isLoweredToCall(Callee)) {
575 LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
576 return std::nullopt;
577 }
578 }
579
580 // If the instruction might have a side-effect recursively account for
581 // the cost of it and all the instructions leading up to it.
582 if (I.mayHaveSideEffects())
583 AddCostRecursively(I, Iteration);
584
585 // If unrolled body turns out to be too big, bail out.
586 if (UnrolledCost > MaxUnrolledLoopSize) {
587 LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
588 << " UnrolledCost: " << UnrolledCost
589 << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
590 << "\n");
591 return std::nullopt;
592 }
593 }
594
595 Instruction *TI = BB->getTerminator();
596
597 auto getSimplifiedConstant = [&](Value *V) -> Constant * {
598 if (SimplifiedValues.count(V))
599 V = SimplifiedValues.lookup(V);
600 return dyn_cast<Constant>(V);
601 };
602
603 // Add in the live successors by first checking whether we have terminator
604 // that may be simplified based on the values simplified by this call.
605 BasicBlock *KnownSucc = nullptr;
606 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
607 if (BI->isConditional()) {
608 if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
609 // Just take the first successor if condition is undef
610 if (isa<UndefValue>(SimpleCond))
611 KnownSucc = BI->getSuccessor(0);
612 else if (ConstantInt *SimpleCondVal =
613 dyn_cast<ConstantInt>(SimpleCond))
614 KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
615 }
616 }
617 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
618 if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
619 // Just take the first successor if condition is undef
620 if (isa<UndefValue>(SimpleCond))
621 KnownSucc = SI->getSuccessor(0);
622 else if (ConstantInt *SimpleCondVal =
623 dyn_cast<ConstantInt>(SimpleCond))
624 KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
625 }
626 }
627 if (KnownSucc) {
628 if (L->contains(KnownSucc))
629 BBWorklist.insert(KnownSucc);
630 else
631 ExitWorklist.insert({BB, KnownSucc});
632 continue;
633 }
634
635 // Add BB's successors to the worklist.
636 for (BasicBlock *Succ : successors(BB))
637 if (L->contains(Succ))
638 BBWorklist.insert(Succ);
639 else
640 ExitWorklist.insert({BB, Succ});
641 AddCostRecursively(*TI, Iteration);
642 }
643
644 // If we found no optimization opportunities on the first iteration, we
645 // won't find them on later ones too.
646 if (UnrolledCost == RolledDynamicCost) {
647 LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
648 << " UnrolledCost: " << UnrolledCost << "\n");
649 return std::nullopt;
650 }
651 }
652
653 while (!ExitWorklist.empty()) {
654 BasicBlock *ExitingBB, *ExitBB;
655 std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
656
657 for (Instruction &I : *ExitBB) {
658 auto *PN = dyn_cast<PHINode>(&I);
659 if (!PN)
660 break;
661
662 Value *Op = PN->getIncomingValueForBlock(ExitingBB);
663 if (auto *OpI = dyn_cast<Instruction>(Op))
664 if (L->contains(OpI))
665 AddCostRecursively(*OpI, TripCount - 1);
666 }
667 }
668
669 assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
670 "All instructions must have a valid cost, whether the "
671 "loop is rolled or unrolled.");
672
673 LLVM_DEBUG(dbgs() << "Analysis finished:\n"
674 << "UnrolledCost: " << UnrolledCost << ", "
675 << "RolledDynamicCost: " << RolledDynamicCost << "\n");
676 return {{unsigned(*UnrolledCost.getValue()),
677 unsigned(*RolledDynamicCost.getValue())}};
678}
679
681 const Loop *L, const TargetTransformInfo &TTI,
682 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
684 for (BasicBlock *BB : L->blocks())
685 Metrics.analyzeBasicBlock(BB, TTI, EphValues);
686 NumInlineCandidates = Metrics.NumInlineCandidates;
687 NotDuplicatable = Metrics.notDuplicatable;
688 Convergent = Metrics.convergent;
689 LoopSize = Metrics.NumInsts;
690
691 // Don't allow an estimate of size zero. This would allows unrolling of loops
692 // with huge iteration counts, which is a compile time problem even if it's
693 // not a problem for code quality. Also, the code using this size may assume
694 // that each loop has at least three instructions (likely a conditional
695 // branch, a comparison feeding that branch, and some kind of loop increment
696 // feeding that comparison instruction).
697 if (LoopSize.isValid() && LoopSize < BEInsns + 1)
698 // This is an open coded max() on InstructionCost
699 LoopSize = BEInsns + 1;
700}
701
704 unsigned CountOverwrite) const {
705 unsigned LS = *LoopSize.getValue();
706 assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
707 if (CountOverwrite)
708 return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;
709 else
710 return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;
711}
712
713// Returns the loop hint metadata node with the given name (for example,
714// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
715// returned.
717 if (MDNode *LoopID = L->getLoopID())
718 return GetUnrollMetadata(LoopID, Name);
719 return nullptr;
720}
721
722// Returns true if the loop has an unroll(full) pragma.
723static bool hasUnrollFullPragma(const Loop *L) {
724 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
725}
726
727// Returns true if the loop has an unroll(enable) pragma. This metadata is used
728// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
729static bool hasUnrollEnablePragma(const Loop *L) {
730 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
731}
732
733// Returns true if the loop has an runtime unroll(disable) pragma.
734static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
735 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
736}
737
738// If loop has an unroll_count pragma return the (necessarily
739// positive) value from the pragma. Otherwise return 0.
740static unsigned unrollCountPragmaValue(const Loop *L) {
741 MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
742 if (MD) {
743 assert(MD->getNumOperands() == 2 &&
744 "Unroll count hint metadata should have two operands.");
745 unsigned Count =
746 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
747 assert(Count >= 1 && "Unroll count must be positive.");
748 return Count;
749 }
750 return 0;
751}
752
753// Computes the boosting factor for complete unrolling.
754// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
755// be beneficial to fully unroll the loop even if unrolledcost is large. We
756// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
757// the unroll threshold.
758static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
759 unsigned MaxPercentThresholdBoost) {
760 if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
761 return 100;
762 else if (Cost.UnrolledCost != 0)
763 // The boosting factor is RolledDynamicCost / UnrolledCost
764 return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
765 MaxPercentThresholdBoost);
766 else
767 return MaxPercentThresholdBoost;
768}
769
770static std::optional<unsigned>
771shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
772 const unsigned TripMultiple, const unsigned TripCount,
773 unsigned MaxTripCount, const UnrollCostEstimator UCE,
775
776 // Using unroll pragma
777 // 1st priority is unroll count set by "unroll-count" option.
778
779 if (PInfo.UserUnrollCount) {
780 if (UP.AllowRemainder &&
781 UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
782 return (unsigned)UnrollCount;
783 }
784
785 // 2nd priority is unroll count set by pragma.
786 if (PInfo.PragmaCount > 0) {
787 if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
788 return PInfo.PragmaCount;
789 }
790
791 if (PInfo.PragmaFullUnroll && TripCount != 0) {
792 // Certain cases with UBSAN can cause trip count to be calculated as
793 // INT_MAX, Block full unrolling at a reasonable limit so that the compiler
794 // doesn't hang trying to unroll the loop. See PR77842
795 if (TripCount > PragmaUnrollFullMaxIterations) {
796 LLVM_DEBUG(dbgs() << "Won't unroll; trip count is too large\n");
797 return std::nullopt;
798 }
799
800 return TripCount;
801 }
802
803 if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
804 MaxTripCount <= UP.MaxUpperBound)
805 return MaxTripCount;
806
807 // if didn't return until here, should continue to other priorties
808 return std::nullopt;
809}
810
811static std::optional<unsigned> shouldFullUnroll(
814 const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
816 assert(FullUnrollTripCount && "should be non-zero!");
817
818 if (FullUnrollTripCount > UP.FullUnrollMaxCount)
819 return std::nullopt;
820
821 // When computing the unrolled size, note that BEInsns are not replicated
822 // like the rest of the loop body.
823 if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
824 return FullUnrollTripCount;
825
826 // The loop isn't that small, but we still can fully unroll it if that
827 // helps to remove a significant number of instructions.
828 // To check that, run additional analysis on the loop.
829 if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
830 L, FullUnrollTripCount, DT, SE, EphValues, TTI,
833 unsigned Boost =
835 if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
836 return FullUnrollTripCount;
837 }
838 return std::nullopt;
839}
840
841static std::optional<unsigned>
842shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
843 const UnrollCostEstimator UCE,
845
846 if (!TripCount)
847 return std::nullopt;
848
849 if (!UP.Partial) {
850 LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
851 << "-unroll-allow-partial not given\n");
852 return 0;
853 }
854 unsigned count = UP.Count;
855 if (count == 0)
856 count = TripCount;
857 if (UP.PartialThreshold != NoThreshold) {
858 // Reduce unroll count to be modulo of TripCount for partial unrolling.
860 count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
861 (LoopSize - UP.BEInsns);
862 if (count > UP.MaxCount)
863 count = UP.MaxCount;
864 while (count != 0 && TripCount % count != 0)
865 count--;
866 if (UP.AllowRemainder && count <= 1) {
867 // If there is no Count that is modulo of TripCount, set Count to
868 // largest power-of-two factor that satisfies the threshold limit.
869 // As we'll create fixup loop, do the type of unrolling only if
870 // remainder loop is allowed.
872 while (count != 0 &&
874 count >>= 1;
875 }
876 if (count < 2) {
877 count = 0;
878 }
879 } else {
880 count = TripCount;
881 }
882 if (count > UP.MaxCount)
883 count = UP.MaxCount;
884
885 LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");
886
887 return count;
888}
889// Returns true if unroll count was set explicitly.
890// Calculates unroll count and writes it to UP.Count.
891// Unless IgnoreUser is true, will also use metadata and command-line options
892// that are specific to to the LoopUnroll pass (which, for instance, are
893// irrelevant for the LoopUnrollAndJam pass).
894// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
895// many LoopUnroll-specific options. The shared functionality should be
896// refactored into it own function.
900 const SmallPtrSetImpl<const Value *> &EphValues,
901 OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
902 bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
904 TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
905
906 unsigned LoopSize = UCE.getRolledLoopSize();
907
908 const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
909 const bool PragmaFullUnroll = hasUnrollFullPragma(L);
910 const unsigned PragmaCount = unrollCountPragmaValue(L);
911 const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
912
913 const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
914 PragmaEnableUnroll || UserUnrollCount;
915
916 PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
917 PragmaEnableUnroll);
918 // Use an explicit peel count that has been specified for testing. In this
919 // case it's not permitted to also specify an explicit unroll count.
920 if (PP.PeelCount) {
921 if (UnrollCount.getNumOccurrences() > 0) {
922 report_fatal_error("Cannot specify both explicit peel count and "
923 "explicit unroll count", /*GenCrashDiag=*/false);
924 }
925 UP.Count = 1;
926 UP.Runtime = false;
927 return true;
928 }
929 // Check for explicit Count.
930 // 1st priority is unroll count set by "unroll-count" option.
931 // 2nd priority is unroll count set by pragma.
932 if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
933 MaxTripCount, UCE, UP)) {
934 UP.Count = *UnrollFactor;
935
936 if (UserUnrollCount || (PragmaCount > 0)) {
937 UP.AllowExpensiveTripCount = true;
938 UP.Force = true;
939 }
940 UP.Runtime |= (PragmaCount > 0);
941 return ExplicitUnroll;
942 } else {
943 if (ExplicitUnroll && TripCount != 0) {
944 // If the loop has an unrolling pragma, we want to be more aggressive with
945 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
946 // value which is larger than the default limits.
947 UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
949 std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
950 }
951 }
952
953 // 3rd priority is exact full unrolling. This will eliminate all copies
954 // of some exit test.
955 UP.Count = 0;
956 if (TripCount) {
957 UP.Count = TripCount;
958 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
959 TripCount, UCE, UP)) {
960 UP.Count = *UnrollFactor;
961 UseUpperBound = false;
962 return ExplicitUnroll;
963 }
964 }
965
966 // 4th priority is bounded unrolling.
967 // We can unroll by the upper bound amount if it's generally allowed or if
968 // we know that the loop is executed either the upper bound or zero times.
969 // (MaxOrZero unrolling keeps only the first loop test, so the number of
970 // loop tests remains the same compared to the non-unrolled version, whereas
971 // the generic upper bound unrolling keeps all but the last loop test so the
972 // number of loop tests goes up which may end up being worse on targets with
973 // constrained branch predictor resources so is controlled by an option.)
974 // In addition we only unroll small upper bounds.
975 // Note that the cost of bounded unrolling is always strictly greater than
976 // cost of exact full unrolling. As such, if we have an exact count and
977 // found it unprofitable, we'll never chose to bounded unroll.
978 if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
979 MaxTripCount <= UP.MaxUpperBound) {
980 UP.Count = MaxTripCount;
981 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
982 MaxTripCount, UCE, UP)) {
983 UP.Count = *UnrollFactor;
984 UseUpperBound = true;
985 return ExplicitUnroll;
986 }
987 }
988
989 // 5th priority is loop peeling.
990 computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
991 if (PP.PeelCount) {
992 UP.Runtime = false;
993 UP.Count = 1;
994 return ExplicitUnroll;
995 }
996
997 // Before starting partial unrolling, set up.partial to true,
998 // if user explicitly asked for unrolling
999 if (TripCount)
1000 UP.Partial |= ExplicitUnroll;
1001
1002 // 6th priority is partial unrolling.
1003 // Try partial unroll only when TripCount could be statically calculated.
1004 if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
1005 UP.Count = *UnrollFactor;
1006
1007 if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
1008 UP.Count != TripCount)
1009 ORE->emit([&]() {
1011 "FullUnrollAsDirectedTooLarge",
1012 L->getStartLoc(), L->getHeader())
1013 << "Unable to fully unroll loop as directed by unroll pragma "
1014 "because "
1015 "unrolled size is too large.";
1016 });
1017
1018 if (UP.PartialThreshold != NoThreshold) {
1019 if (UP.Count == 0) {
1020 if (PragmaEnableUnroll)
1021 ORE->emit([&]() {
1023 "UnrollAsDirectedTooLarge",
1024 L->getStartLoc(), L->getHeader())
1025 << "Unable to unroll loop as directed by unroll(enable) "
1026 "pragma "
1027 "because unrolled size is too large.";
1028 });
1029 }
1030 }
1031 return ExplicitUnroll;
1032 }
1033 assert(TripCount == 0 &&
1034 "All cases when TripCount is constant should be covered here.");
1035 if (PragmaFullUnroll)
1036 ORE->emit([&]() {
1038 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
1039 L->getStartLoc(), L->getHeader())
1040 << "Unable to fully unroll loop as directed by unroll(full) "
1041 "pragma "
1042 "because loop has a runtime trip count.";
1043 });
1044
1045 // 7th priority is runtime unrolling.
1046 // Don't unroll a runtime trip count loop when it is disabled.
1048 UP.Count = 0;
1049 return false;
1050 }
1051
1052 // Don't unroll a small upper bound loop unless user or TTI asked to do so.
1053 if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
1054 UP.Count = 0;
1055 return false;
1056 }
1057
1058 // Check if the runtime trip count is too small when profile is available.
1059 if (L->getHeader()->getParent()->hasProfileData()) {
1060 if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1061 if (*ProfileTripCount < FlatLoopTripCountThreshold)
1062 return false;
1063 else
1064 UP.AllowExpensiveTripCount = true;
1065 }
1066 }
1067 UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
1068 if (!UP.Runtime) {
1069 LLVM_DEBUG(
1070 dbgs() << " will not try to unroll loop with runtime trip count "
1071 << "-unroll-runtime not given\n");
1072 UP.Count = 0;
1073 return false;
1074 }
1075 if (UP.Count == 0)
1077
1078 // Reduce unroll count to be the largest power-of-two factor of
1079 // the original count which satisfies the threshold limit.
1080 while (UP.Count != 0 &&
1082 UP.Count >>= 1;
1083
1084#ifndef NDEBUG
1085 unsigned OrigCount = UP.Count;
1086#endif
1087
1088 if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1089 while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1090 UP.Count >>= 1;
1091 LLVM_DEBUG(
1092 dbgs() << "Remainder loop is restricted (that could architecture "
1093 "specific or because the loop contains a convergent "
1094 "instruction), so unroll count must divide the trip "
1095 "multiple, "
1096 << TripMultiple << ". Reducing unroll count from " << OrigCount
1097 << " to " << UP.Count << ".\n");
1098
1099 using namespace ore;
1100
1101 if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)
1102 ORE->emit([&]() {
1104 "DifferentUnrollCountFromDirected",
1105 L->getStartLoc(), L->getHeader())
1106 << "Unable to unroll loop the number of times directed by "
1107 "unroll_count pragma because remainder loop is restricted "
1108 "(that could architecture specific or because the loop "
1109 "contains a convergent instruction) and so must have an "
1110 "unroll "
1111 "count that divides the loop trip multiple of "
1112 << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
1113 << NV("UnrollCount", UP.Count) << " time(s).";
1114 });
1115 }
1116
1117 if (UP.Count > UP.MaxCount)
1118 UP.Count = UP.MaxCount;
1119
1120 if (MaxTripCount && UP.Count > MaxTripCount)
1121 UP.Count = MaxTripCount;
1122
1123 LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
1124 << "\n");
1125 if (UP.Count < 2)
1126 UP.Count = 0;
1127 return ExplicitUnroll;
1128}
1129
1130static LoopUnrollResult
1134 ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1135 bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,
1136 std::optional<unsigned> ProvidedCount,
1137 std::optional<unsigned> ProvidedThreshold,
1138 std::optional<bool> ProvidedAllowPartial,
1139 std::optional<bool> ProvidedRuntime,
1140 std::optional<bool> ProvidedUpperBound,
1141 std::optional<bool> ProvidedAllowPeeling,
1142 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1143 std::optional<unsigned> ProvidedFullUnrollMaxCount) {
1144
1145 LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1146 << L->getHeader()->getParent()->getName() << "] Loop %"
1147 << L->getHeader()->getName() << "\n");
1149 if (TM & TM_Disable)
1151
1152 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1153 // parent loop has an explicit unroll-and-jam pragma. This is to prevent
1154 // automatic unrolling from interfering with the user requested
1155 // transformation.
1156 Loop *ParentL = L->getParentLoop();
1157 if (ParentL != nullptr &&
1160 LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
1161 << " llvm.loop.unroll_and_jam.\n");
1163 }
1164
1165 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1166 // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1167 // unrolling from interfering with the user requested transformation.
1170 LLVM_DEBUG(
1171 dbgs()
1172 << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1174 }
1175
1176 if (!L->isLoopSimplifyForm()) {
1177 LLVM_DEBUG(
1178 dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
1180 }
1181
1182 // When automatic unrolling is disabled, do not unroll unless overridden for
1183 // this loop.
1184 if (OnlyWhenForced && !(TM & TM_Enable))
1186
1187 bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1189 L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
1190 ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1191 ProvidedFullUnrollMaxCount);
1193 L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
1194
1195 // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1196 // as threshold later on.
1197 if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1198 !OptForSize)
1200
1202 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1203
1204 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
1205 if (!UCE.canUnroll()) {
1206 LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
1207 << " which cannot be duplicated or have invalid cost.\n");
1209 }
1210
1211 unsigned LoopSize = UCE.getRolledLoopSize();
1212 LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
1213
1214 // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1215 // later), to (fully) unroll loops, if it does not increase code size.
1216 if (OptForSize)
1217 UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1218
1219 if (UCE.NumInlineCandidates != 0) {
1220 LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
1222 }
1223
1224 // Find the smallest exact trip count for any exit. This is an upper bound
1225 // on the loop trip count, but an exit at an earlier iteration is still
1226 // possible. An unroll by the smallest exact trip count guarantees that all
1227 // branches relating to at least one exit can be eliminated. This is unlike
1228 // the max trip count, which only guarantees that the backedge can be broken.
1229 unsigned TripCount = 0;
1230 unsigned TripMultiple = 1;
1231 SmallVector<BasicBlock *, 8> ExitingBlocks;
1232 L->getExitingBlocks(ExitingBlocks);
1233 for (BasicBlock *ExitingBlock : ExitingBlocks)
1234 if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1235 if (!TripCount || TC < TripCount)
1236 TripCount = TripMultiple = TC;
1237
1238 if (!TripCount) {
1239 // If no exact trip count is known, determine the trip multiple of either
1240 // the loop latch or the single exiting block.
1241 // TODO: Relax for multiple exits.
1242 BasicBlock *ExitingBlock = L->getLoopLatch();
1243 if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1244 ExitingBlock = L->getExitingBlock();
1245 if (ExitingBlock)
1246 TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1247 }
1248
1249 // If the loop contains a convergent operation, the prelude we'd add
1250 // to do the first few instructions before we hit the unrolled loop
1251 // is unsafe -- it adds a control-flow dependency to the convergent
1252 // operation. Therefore restrict remainder loop (try unrolling without).
1253 //
1254 // TODO: This is quite conservative. In practice, convergent_op()
1255 // is likely to be called unconditionally in the loop. In this
1256 // case, the program would be ill-formed (on most architectures)
1257 // unless n were the same on all threads in a thread group.
1258 // Assuming n is the same on all threads, any kind of unrolling is
1259 // safe. But currently llvm's notion of convergence isn't powerful
1260 // enough to express this.
1261 if (UCE.Convergent)
1262 UP.AllowRemainder = false;
1263
1264 // Try to find the trip count upper bound if we cannot find the exact trip
1265 // count.
1266 unsigned MaxTripCount = 0;
1267 bool MaxOrZero = false;
1268 if (!TripCount) {
1269 MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1270 MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1271 }
1272
1273 // computeUnrollCount() decides whether it is beneficial to use upper bound to
1274 // fully unroll the loop.
1275 bool UseUpperBound = false;
1276 bool IsCountSetExplicitly = computeUnrollCount(
1277 L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount,
1278 MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound);
1279 if (!UP.Count)
1281
1282 if (PP.PeelCount) {
1283 assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
1284 LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1285 << " with iteration count " << PP.PeelCount << "!\n");
1286 ORE.emit([&]() {
1287 return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
1288 L->getHeader())
1289 << " peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
1290 << " iterations";
1291 });
1292
1293 ValueToValueMapTy VMap;
1294 if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
1295 simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
1296 // If the loop was peeled, we already "used up" the profile information
1297 // we had, so we don't want to unroll or peel again.
1299 L->setLoopAlreadyUnrolled();
1301 }
1303 }
1304
1305 // Do not attempt partial/runtime unrolling in FullLoopUnrolling
1306 if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) {
1307 LLVM_DEBUG(
1308 dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1310 }
1311
1312 // At this point, UP.Runtime indicates that run-time unrolling is allowed.
1313 // However, we only want to actually perform it if we don't know the trip
1314 // count and the unroll count doesn't divide the known trip multiple.
1315 // TODO: This decision should probably be pushed up into
1316 // computeUnrollCount().
1317 UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;
1318
1319 // Save loop properties before it is transformed.
1320 MDNode *OrigLoopID = L->getLoopID();
1321
1322 // Unroll the loop.
1323 Loop *RemainderLoop = nullptr;
1324 LoopUnrollResult UnrollResult = UnrollLoop(
1325 L,
1327 UP.UnrollRemainder, ForgetAllSCEV},
1328 LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
1329 if (UnrollResult == LoopUnrollResult::Unmodified)
1331
1332 if (RemainderLoop) {
1333 std::optional<MDNode *> RemainderLoopID =
1336 if (RemainderLoopID)
1337 RemainderLoop->setLoopID(*RemainderLoopID);
1338 }
1339
1340 if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1341 std::optional<MDNode *> NewLoopID =
1344 if (NewLoopID) {
1345 L->setLoopID(*NewLoopID);
1346
1347 // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1348 // explicitly.
1349 return UnrollResult;
1350 }
1351 }
1352
1353 // If loop has an unroll count pragma or unrolled by explicitly set count
1354 // mark loop as unrolled to prevent unrolling beyond that requested.
1355 if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
1356 L->setLoopAlreadyUnrolled();
1357
1358 return UnrollResult;
1359}
1360
1361namespace {
1362
1363class LoopUnroll : public LoopPass {
1364public:
1365 static char ID; // Pass ID, replacement for typeid
1366
1367 int OptLevel;
1368
1369 /// If false, use a cost model to determine whether unrolling of a loop is
1370 /// profitable. If true, only loops that explicitly request unrolling via
1371 /// metadata are considered. All other loops are skipped.
1372 bool OnlyWhenForced;
1373
1374 /// If false, when SCEV is invalidated, only forget everything in the
1375 /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1376 /// Otherwise, forgetAllLoops and rebuild when needed next.
1377 bool ForgetAllSCEV;
1378
1379 std::optional<unsigned> ProvidedCount;
1380 std::optional<unsigned> ProvidedThreshold;
1381 std::optional<bool> ProvidedAllowPartial;
1382 std::optional<bool> ProvidedRuntime;
1383 std::optional<bool> ProvidedUpperBound;
1384 std::optional<bool> ProvidedAllowPeeling;
1385 std::optional<bool> ProvidedAllowProfileBasedPeeling;
1386 std::optional<unsigned> ProvidedFullUnrollMaxCount;
1387
1388 LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1389 bool ForgetAllSCEV = false,
1390 std::optional<unsigned> Threshold = std::nullopt,
1391 std::optional<unsigned> Count = std::nullopt,
1392 std::optional<bool> AllowPartial = std::nullopt,
1393 std::optional<bool> Runtime = std::nullopt,
1394 std::optional<bool> UpperBound = std::nullopt,
1395 std::optional<bool> AllowPeeling = std::nullopt,
1396 std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1397 std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1398 : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1399 ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1400 ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1401 ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1402 ProvidedAllowPeeling(AllowPeeling),
1403 ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1404 ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1406 }
1407
1408 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1409 if (skipLoop(L))
1410 return false;
1411
1412 Function &F = *L->getHeader()->getParent();
1413
1414 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1415 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1416 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1417 const TargetTransformInfo &TTI =
1418 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1419 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1420 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1421 // pass. Function analyses need to be preserved across loop transformations
1422 // but ORE cannot be preserved (see comment before the pass definition).
1424 bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1425
1427 L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1428 /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1429 ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1430 ProvidedUpperBound, ProvidedAllowPeeling,
1431 ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);
1432
1433 if (Result == LoopUnrollResult::FullyUnrolled)
1434 LPM.markLoopAsDeleted(*L);
1435
1436 return Result != LoopUnrollResult::Unmodified;
1437 }
1438
1439 /// This transformation requires natural loop information & requires that
1440 /// loop preheaders be inserted into the CFG...
1441 void getAnalysisUsage(AnalysisUsage &AU) const override {
1444 // FIXME: Loop passes are required to preserve domtree, and for now we just
1445 // recreate dom info if anything gets unrolled.
1447 }
1448};
1449
1450} // end anonymous namespace
1451
1452char LoopUnroll::ID = 0;
1453
1454INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1458INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1459
1460Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1461 bool ForgetAllSCEV, int Threshold, int Count,
1462 int AllowPartial, int Runtime, int UpperBound,
1463 int AllowPeeling) {
1464 // TODO: It would make more sense for this function to take the optionals
1465 // directly, but that's dangerous since it would silently break out of tree
1466 // callers.
1467 return new LoopUnroll(
1468 OptLevel, OnlyWhenForced, ForgetAllSCEV,
1469 Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1470 Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1471 AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1472 Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
1473 UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1474 AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1475}
1476
1479 LPMUpdater &Updater) {
1480 // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1481 // pass. Function analyses need to be preserved across loop transformations
1482 // but ORE cannot be preserved (see comment before the pass definition).
1483 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1484
1485 // Keep track of the previous loop structure so we can identify new loops
1486 // created by unrolling.
1487 Loop *ParentL = L.getParentLoop();
1488 SmallPtrSet<Loop *, 4> OldLoops;
1489 if (ParentL)
1490 OldLoops.insert(ParentL->begin(), ParentL->end());
1491 else
1492 OldLoops.insert(AR.LI.begin(), AR.LI.end());
1493
1494 std::string LoopName = std::string(L.getName());
1495
1496 bool Changed =
1497 tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
1498 /*BFI*/ nullptr, /*PSI*/ nullptr,
1499 /*PreserveLCSSA*/ true, OptLevel, /*OnlyFullUnroll*/ true,
1500 OnlyWhenForced, ForgetSCEV, /*Count*/ std::nullopt,
1501 /*Threshold*/ std::nullopt, /*AllowPartial*/ false,
1502 /*Runtime*/ false, /*UpperBound*/ false,
1503 /*AllowPeeling*/ true,
1504 /*AllowProfileBasedPeeling*/ false,
1505 /*FullUnrollMaxCount*/ std::nullopt) !=
1507 if (!Changed)
1508 return PreservedAnalyses::all();
1509
1510 // The parent must not be damaged by unrolling!
1511#ifndef NDEBUG
1512 if (ParentL)
1513 ParentL->verifyLoop();
1514#endif
1515
1516 // Unrolling can do several things to introduce new loops into a loop nest:
1517 // - Full unrolling clones child loops within the current loop but then
1518 // removes the current loop making all of the children appear to be new
1519 // sibling loops.
1520 //
1521 // When a new loop appears as a sibling loop after fully unrolling,
1522 // its nesting structure has fundamentally changed and we want to revisit
1523 // it to reflect that.
1524 //
1525 // When unrolling has removed the current loop, we need to tell the
1526 // infrastructure that it is gone.
1527 //
1528 // Finally, we support a debugging/testing mode where we revisit child loops
1529 // as well. These are not expected to require further optimizations as either
1530 // they or the loop they were cloned from have been directly visited already.
1531 // But the debugging mode allows us to check this assumption.
1532 bool IsCurrentLoopValid = false;
1533 SmallVector<Loop *, 4> SibLoops;
1534 if (ParentL)
1535 SibLoops.append(ParentL->begin(), ParentL->end());
1536 else
1537 SibLoops.append(AR.LI.begin(), AR.LI.end());
1538 erase_if(SibLoops, [&](Loop *SibLoop) {
1539 if (SibLoop == &L) {
1540 IsCurrentLoopValid = true;
1541 return true;
1542 }
1543
1544 // Otherwise erase the loop from the list if it was in the old loops.
1545 return OldLoops.contains(SibLoop);
1546 });
1547 Updater.addSiblingLoops(SibLoops);
1548
1549 if (!IsCurrentLoopValid) {
1550 Updater.markLoopAsDeleted(L, LoopName);
1551 } else {
1552 // We can only walk child loops if the current loop remained valid.
1554 // Walk *all* of the child loops.
1555 SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1556 Updater.addChildLoops(ChildLoops);
1557 }
1558 }
1559
1561}
1562
1565 auto &LI = AM.getResult<LoopAnalysis>(F);
1566 // There are no loops in the function. Return before computing other expensive
1567 // analyses.
1568 if (LI.empty())
1569 return PreservedAnalyses::all();
1570 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1571 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1572 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1573 auto &AC = AM.getResult<AssumptionAnalysis>(F);
1575
1576 LoopAnalysisManager *LAM = nullptr;
1577 if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1578 LAM = &LAMProxy->getManager();
1579
1580 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1581 ProfileSummaryInfo *PSI =
1582 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1583 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1584 &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1585
1586 bool Changed = false;
1587
1588 // The unroller requires loops to be in simplified form, and also needs LCSSA.
1589 // Since simplification may add new inner loops, it has to run before the
1590 // legality and profitability checks. This means running the loop unroller
1591 // will simplify all loops, regardless of whether anything end up being
1592 // unrolled.
1593 for (const auto &L : LI) {
1594 Changed |=
1595 simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1596 Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1597 }
1598
1599 // Add the loop nests in the reverse order of LoopInfo. See method
1600 // declaration.
1602 appendLoopsToWorklist(LI, Worklist);
1603
1604 while (!Worklist.empty()) {
1605 // Because the LoopInfo stores the loops in RPO, we walk the worklist
1606 // from back to front so that we work forward across the CFG, which
1607 // for unrolling is only needed to get optimization remarks emitted in
1608 // a forward order.
1609 Loop &L = *Worklist.pop_back_val();
1610#ifndef NDEBUG
1611 Loop *ParentL = L.getParentLoop();
1612#endif
1613
1614 // Check if the profile summary indicates that the profiled application
1615 // has a huge working set size, in which case we disable peeling to avoid
1616 // bloating it further.
1617 std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1618 if (PSI && PSI->hasHugeWorkingSetSize())
1619 LocalAllowPeeling = false;
1620 std::string LoopName = std::string(L.getName());
1621 // The API here is quite complex to call and we allow to select some
1622 // flavors of unrolling during construction time (by setting UnrollOpts).
1624 &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1625 /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*OnlyFullUnroll*/ false,
1626 UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV,
1627 /*Count*/ std::nullopt,
1628 /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
1629 UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1630 UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
1631 Changed |= Result != LoopUnrollResult::Unmodified;
1632
1633 // The parent must not be damaged by unrolling!
1634#ifndef NDEBUG
1635 if (Result != LoopUnrollResult::Unmodified && ParentL)
1636 ParentL->verifyLoop();
1637#endif
1638
1639 // Clear any cached analysis results for L if we removed it completely.
1640 if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1641 LAM->clear(L, LoopName);
1642 }
1643
1644 if (!Changed)
1645 return PreservedAnalyses::all();
1646
1648}
1649
1651 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1652 static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
1653 OS, MapClassName2PassName);
1654 OS << '<';
1655 if (UnrollOpts.AllowPartial != std::nullopt)
1656 OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1657 if (UnrollOpts.AllowPeeling != std::nullopt)
1658 OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1659 if (UnrollOpts.AllowRuntime != std::nullopt)
1660 OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1661 if (UnrollOpts.AllowUpperBound != std::nullopt)
1662 OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1663 if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1664 OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1665 << "profile-peeling;";
1666 if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1667 OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';
1668 OS << 'O' << UnrollOpts.OptLevel;
1669 OS << '>';
1670}
Rewrite undef for PHI
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
#define DEBUG_TYPE
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header provides classes for managing per-loop analyses.
loops
Definition: LoopInfo.cpp:1177
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
static MDNode * getUnrollMetadataForLoop(const Loop *L, StringRef Name)
static cl::opt< unsigned > UnrollMaxCount("unroll-max-count", cl::Hidden, cl::desc("Set the max unroll count for partial and runtime unrolling, for" "testing purposes"))
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
static cl::opt< unsigned > UnrollThresholdDefault("unroll-threshold-default", cl::init(150), cl::Hidden, cl::desc("Default threshold (max size of unrolled " "loop), used in all but O3 optimizations"))
static cl::opt< unsigned > FlatLoopTripCountThreshold("flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, cl::desc("If the runtime tripcount for the loop is lower than the " "threshold, the loop is considered as flat and will be less " "aggressively unrolled."))
static cl::opt< unsigned > UnrollMaxIterationsCountToAnalyze("unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of" "iterations when checking full unroll profitability"))
static LoopUnrollResult tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV, std::optional< unsigned > ProvidedCount, std::optional< unsigned > ProvidedThreshold, std::optional< bool > ProvidedAllowPartial, std::optional< bool > ProvidedRuntime, std::optional< bool > ProvidedUpperBound, std::optional< bool > ProvidedAllowPeeling, std::optional< bool > ProvidedAllowProfileBasedPeeling, std::optional< unsigned > ProvidedFullUnrollMaxCount)
static cl::opt< unsigned > UnrollOptSizeThreshold("unroll-optsize-threshold", cl::init(0), cl::Hidden, cl::desc("The cost threshold for loop unrolling when optimizing for " "size"))
static bool hasUnrollFullPragma(const Loop *L)
static cl::opt< bool > UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled."))
static unsigned unrollCountPragmaValue(const Loop *L)
static bool hasUnrollEnablePragma(const Loop *L)
static cl::opt< unsigned > UnrollFullMaxCount("unroll-full-max-count", cl::Hidden, cl::desc("Set the max unroll count for full unrolling, for testing purposes"))
static cl::opt< unsigned > UnrollMaxUpperBound("unroll-max-upperbound", cl::init(8), cl::Hidden, cl::desc("The max of trip count upper bound that is considered in unrolling"))
static std::optional< unsigned > shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static std::optional< EstimatedUnrollCost > analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize, unsigned MaxIterationsCountToAnalyze)
Figure out if the loop is worth full unrolling.
static cl::opt< unsigned > UnrollPartialThreshold("unroll-partial-threshold", cl::Hidden, cl::desc("The cost threshold for partial loop unrolling"))
static cl::opt< bool > UnrollAllowRemainder("unroll-allow-remainder", cl::Hidden, cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop."))
static std::optional< unsigned > shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< unsigned > PragmaUnrollFullMaxIterations("pragma-unroll-full-max-iterations", cl::init(1 '000 '000), cl::Hidden, cl::desc("Maximum allowed iterations to unroll under pragma unroll full."))
static const unsigned NoThreshold
A magic value for use with the Threshold parameter to indicate that the loop unroll should be perform...
static std::optional< unsigned > shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, unsigned MaxTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< bool > UnrollRevisitChildLoops("unroll-revisit-child-loops", cl::Hidden, cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " "This shouldn't typically be needed as child loops (or their " "clones) were already visited."))
static cl::opt< unsigned > UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling"))
static cl::opt< bool > UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts"))
static bool hasRuntimeUnrollDisablePragma(const Loop *L)
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost)
static cl::opt< unsigned > UnrollThresholdAggressive("unroll-threshold-aggressive", cl::init(300), cl::Hidden, cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " "optimizations"))
static cl::opt< unsigned > UnrollMaxPercentThresholdBoost("unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " "to the threshold when aggressively unrolling a loop due to the " "dynamic cost savings. If completely unrolling a loop will reduce " "the total runtime from X to Y, we boost the loop unroll " "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " "X/Y). This limit avoids excessive code bloat."))
static cl::opt< unsigned > PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 *1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma."))
static cl::opt< bool > UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
Machine Trace Metrics
This file contains the declarations for metadata subclasses.
if(VerifyEach)
LoopAnalysisManager LAM
const char LLVMTargetMachineRef TM
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
void clear(IRUnitT &IR, llvm::StringRef Name)
Clear any cached analysis results for a single unit of IR.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:519
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:220
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:677
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:658
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:84
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
void addChildLoops(ArrayRef< Loop * > NewChildLoops)
Loop passes should use this method to indicate they have added new child loops of the current loop.
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
void addSiblingLoops(ArrayRef< Loop * > NewSibLoops)
Loop passes should use this method to indicate they have added new sibling loops to the current loop.
void markLoopAsDeleted(Loop &L)
Definition: LoopPass.cpp:110
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
void verifyLoop() const
Verify loop structure.
iterator end() const
iterator begin() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
iterator end() const
iterator begin() const
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition: LoopInfo.cpp:525
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:783
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
bool empty() const
Determine if the PriorityWorklist is empty or not.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isBackedgeTakenCountMaxOrZero(const Loop *L)
Return true if the backedge taken count is either the value returned by getConstantMaxBackedgeTakenCo...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
value_type pop_back_val()
Definition: SetVector.h:285
A version of PriorityWorklist that selects small size optimized data structures for the vector and ma...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:94
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:122
uint64_t getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite=0) const
Returns loop size estimation for unrolled loop, given the unrolling configuration specified by UP.
UnrollCostEstimator(const Loop *L, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &EphValues, unsigned BEInsns)
bool canUnroll() const
Whether it is legal to unroll this loop.
Definition: UnrollLoop.h:135
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:137
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:179
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
void initializeLoopUnrollPass(PassRegistry &)
void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, AssumptionCache *AC=nullptr, unsigned Threshold=UINT_MAX)
Definition: LoopPeel.cpp:498
auto successors(const MachineBasicBlock *BB)
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:425
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
char & LCSSAID
Definition: LCSSA.cpp:507
Pass * createLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false, int Threshold=-1, int Count=-1, int AllowPartial=-1, int Runtime=-1, int UpperBound=-1, int AllowPeeling=-1)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1937
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:832
TransformationMode hasUnrollAndJamTransformation(const Loop *L)
Definition: LoopUtils.cpp:373
cl::opt< bool > ForgetSCEVInLoopUnroll
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
TransformationMode hasUnrollTransformation(const Loop *L)
Definition: LoopUtils.cpp:352
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:54
@ PartiallyUnrolled
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
@ Unmodified
The loop was not modified.
@ FullyUnrolled
The loop was fully unrolled into straight-line code.
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:141
const char *const LLVMLoopUnrollFollowupAll
Definition: UnrollLoop.h:42
TargetTransformInfo TTI
TransformationMode
The mode sets how eager a transformation should be applied.
Definition: LoopUtils.h:275
@ TM_ForcedByUser
The transformation was directed by the user, e.g.
Definition: LoopUtils.h:292
@ TM_Disable
The transformation should not be applied.
Definition: LoopUtils.h:284
@ TM_Enable
The transformation should be applied without considering a cost model.
Definition: LoopUtils.h:281
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1923
void appendLoopsToWorklist(RangeT &&, SmallPriorityWorklist< Loop *, 4 > &)
Utility that implements appending of loops onto a worklist given a range.
Definition: LoopUtils.cpp:1681
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1858
const char *const LLVMLoopUnrollFollowupRemainder
Definition: UnrollLoop.h:45
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const char *const LLVMLoopUnrollFollowupUnrolled
Definition: UnrollLoop.h:43
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2060
LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const llvm::TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop=nullptr)
Unroll the given loop by Count.
Definition: LoopUnroll.cpp:295
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI)
Perform some cleanup and simplifications on loops after unrolling.
Definition: LoopUnroll.cpp:215
MDNode * GetUnrollMetadata(MDNode *LoopID, StringRef Name)
Given an llvm.loop loop id metadata node, returns the loop hint metadata node with the given name (fo...
Definition: LoopUnroll.cpp:927
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &VMap)
VMap is the value-map that maps instructions from the original loop to instructions in the last peele...
Definition: LoopPeel.cpp:876
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition: CodeMetrics.h:31
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:50
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
bool OnlyWhenForced
If false, use a cost model to determine whether unrolling of a loop is profitable.
const bool ForgetSCEV
If true, forget all loops when unrolling.
std::optional< unsigned > FullUnrollMaxCount
std::optional< bool > AllowPartial
std::optional< bool > AllowRuntime
std::optional< bool > AllowProfileBasedPeeling
std::optional< bool > AllowPeeling
std::optional< bool > AllowUpperBound
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:91
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodied of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned MaxPercentThresholdBoost
If complete unrolling will reduce the cost of the loop, we will boost the Threshold by a certain perc...
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
bool AllowRemainder
Allow generation of a loop remainder (extra iterations after unroll).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
unsigned MaxUpperBound
Set the maximum upper bound of trip count.