72#define DEBUG_TYPE "loop-unroll"
76 cl::desc(
"Forget everything in SCEV when doing LoopUnroll, instead of just"
77 " the current top-most loop. This is sometimes preferred to reduce"
82 cl::desc(
"The cost threshold for loop unrolling"));
87 cl::desc(
"The cost threshold for loop unrolling when optimizing for "
92 cl::desc(
"The cost threshold for partial loop unrolling"));
96 cl::desc(
"The maximum 'boost' (represented as a percentage >= 100) applied "
97 "to the threshold when aggressively unrolling a loop due to the "
98 "dynamic cost savings. If completely unrolling a loop will reduce "
99 "the total runtime from X to Y, we boost the loop unroll "
100 "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
101 "X/Y). This limit avoids excessive code bloat."));
105 cl::desc(
"Don't allow loop unrolling to simulate more than this number of"
106 "iterations when checking full unroll profitability"));
110 cl::desc(
"Use this unroll count for all loops including those with "
111 "unroll_count pragma values, for testing purposes"));
115 cl::desc(
"Set the max unroll count for partial and runtime unrolling, for"
116 "testing purposes"));
121 "Set the max unroll count for full unrolling, for testing purposes"));
125 cl::desc(
"Allows loops to be partially unrolled until "
126 "-unroll-threshold loop size is reached."));
130 cl::desc(
"Allow generation of a loop remainder (extra iterations) "
131 "when unrolling a loop."));
135 cl::desc(
"Unroll loops with run-time trip counts"));
140 "The max of trip count upper bound that is considered in unrolling"));
144 cl::desc(
"Unrolled size limit for loops with an unroll(full) or "
145 "unroll_count pragma."));
149 cl::desc(
"If the runtime tripcount for the loop is lower than the "
150 "threshold, the loop is considered as flat and will be less "
151 "aggressively unrolled."));
155 cl::desc(
"Allow the loop remainder to be unrolled."));
162 cl::desc(
"Enqueue and re-visit child loops in the loop PM after unrolling. "
163 "This shouldn't typically be needed as child loops (or their "
164 "clones) were already visited."));
168 cl::desc(
"Threshold (max size of unrolled loop) to use in aggressive (O3) "
173 cl::desc(
"Default threshold (max size of unrolled "
174 "loop), used in all but O3 optimizations"));
179static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
187 std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
188 std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
189 std::optional<bool> UserUpperBound,
190 std::optional<unsigned> UserFullUnrollMaxCount) {
202 UP.
MaxCount = std::numeric_limits<unsigned>::max();
220 bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
224 PGSOQueryType::IRPass));
261 UP.
Count = *UserCount;
262 if (UserAllowPartial)
263 UP.
Partial = *UserAllowPartial;
268 if (UserFullUnrollMaxCount)
282struct UnrolledInstState {
286 unsigned IsCounted : 1;
290struct UnrolledInstStateKeyInfo {
294 static inline UnrolledInstState getEmptyKey() {
295 return {PtrInfo::getEmptyKey(), 0, 0, 0};
298 static inline UnrolledInstState getTombstoneKey() {
299 return {PtrInfo::getTombstoneKey(), 0, 0, 0};
302 static inline unsigned getHashValue(
const UnrolledInstState &S) {
303 return PairInfo::getHashValue({S.I, S.Iteration});
306 static inline bool isEqual(
const UnrolledInstState &LHS,
307 const UnrolledInstState &RHS) {
308 return PairInfo::isEqual({
LHS.I,
LHS.Iteration}, {
RHS.I,
RHS.Iteration});
312struct EstimatedUnrollCost {
314 unsigned UnrolledCost;
318 unsigned RolledDynamicCost;
322 PragmaInfo(
bool UUC,
bool PFU,
unsigned PC,
bool PEU)
323 : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
324 PragmaEnableUnroll(PEU) {}
325 const bool UserUnrollCount;
326 const bool PragmaFullUnroll;
327 const unsigned PragmaCount;
328 const bool PragmaEnableUnroll;
350 unsigned MaxIterationsCountToAnalyze) {
354 assert(MaxIterationsCountToAnalyze <
355 (
unsigned)(std::numeric_limits<int>::max() / 2) &&
356 "The unroll iterations max is too large!");
360 if (!L->isInnermost())
364 if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
397 auto AddCostRecursively = [&](
Instruction &RootI,
int Iteration) {
398 assert(Iteration >= 0 &&
"Cannot have a negative iteration!");
399 assert(CostWorklist.
empty() &&
"Must start with an empty cost list");
400 assert(PHIUsedList.
empty() &&
"Must start with an empty phi used list");
406 for (;; --Iteration) {
412 auto CostIter = InstCostMap.
find({
I, Iteration, 0, 0});
413 if (CostIter == InstCostMap.
end())
418 auto &
Cost = *CostIter;
424 Cost.IsCounted =
true;
427 if (
auto *PhiI = dyn_cast<PHINode>(
I))
428 if (PhiI->getParent() == L->getHeader()) {
429 assert(
Cost.IsFree &&
"Loop PHIs shouldn't be evaluated as they "
430 "inherently simplify during unrolling.");
437 if (
auto *OpI = dyn_cast<Instruction>(
438 PhiI->getIncomingValueForBlock(L->getLoopLatch())))
439 if (L->contains(OpI))
448 << Iteration <<
"): ");
458 auto *OpI = dyn_cast<Instruction>(
Op);
459 if (!OpI || !L->contains(OpI))
465 }
while (!CostWorklist.
empty());
467 if (PHIUsedList.
empty())
472 "Cannot track PHI-used values past the first iteration!");
480 assert(L->isLoopSimplifyForm() &&
"Must put loop into normal form first.");
481 assert(L->isLCSSAForm(DT) &&
482 "Must have loops in LCSSA form to track live-out values.");
484 LLVM_DEBUG(
dbgs() <<
"Starting LoopUnroll profitability analysis...\n");
487 L->getHeader()->getParent()->hasMinSize() ?
493 for (
unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
494 LLVM_DEBUG(
dbgs() <<
" Analyzing iteration " << Iteration <<
"\n");
499 auto *
PHI = dyn_cast<PHINode>(&
I);
506 PHI->getNumIncomingValues() == 2 &&
507 "Must have an incoming value only for the preheader and the latch.");
509 Value *V =
PHI->getIncomingValueForBlock(
510 Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
511 if (Iteration != 0 && SimplifiedValues.
count(V))
512 V = SimplifiedValues.
lookup(V);
517 SimplifiedValues.
clear();
518 while (!SimplifiedInputValues.
empty())
524 BBWorklist.
insert(L->getHeader());
535 if (isa<DbgInfoIntrinsic>(
I) || EphValues.
count(&
I))
545 bool IsFree = Analyzer.visit(
I);
546 bool Inserted = InstCostMap.
insert({&
I, (int)Iteration,
550 assert(Inserted &&
"Cannot have a state for an unvisited instruction!");
557 if (
auto *CI = dyn_cast<CallInst>(&
I)) {
558 const Function *Callee = CI->getCalledFunction();
567 if (
I.mayHaveSideEffects())
568 AddCostRecursively(
I, Iteration);
571 if (UnrolledCost > MaxUnrolledLoopSize) {
573 <<
" UnrolledCost: " << UnrolledCost
574 <<
", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
583 if (SimplifiedValues.
count(V))
584 V = SimplifiedValues.
lookup(V);
585 return dyn_cast<Constant>(V);
591 if (
BranchInst *BI = dyn_cast<BranchInst>(TI)) {
592 if (BI->isConditional()) {
593 if (
auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
595 if (isa<UndefValue>(SimpleCond))
596 KnownSucc = BI->getSuccessor(0);
598 dyn_cast<ConstantInt>(SimpleCond))
599 KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
602 }
else if (
SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
603 if (
auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
605 if (isa<UndefValue>(SimpleCond))
606 KnownSucc = SI->getSuccessor(0);
608 dyn_cast<ConstantInt>(SimpleCond))
609 KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
613 if (L->contains(KnownSucc))
614 BBWorklist.
insert(KnownSucc);
616 ExitWorklist.
insert({BB, KnownSucc});
622 if (L->contains(Succ))
625 ExitWorklist.
insert({BB, Succ});
626 AddCostRecursively(*TI, Iteration);
631 if (UnrolledCost == RolledDynamicCost) {
633 <<
" UnrolledCost: " << UnrolledCost <<
"\n");
638 while (!ExitWorklist.
empty()) {
640 std::tie(ExitingBB, ExitBB) = ExitWorklist.
pop_back_val();
643 auto *PN = dyn_cast<PHINode>(&
I);
647 Value *
Op = PN->getIncomingValueForBlock(ExitingBB);
648 if (
auto *OpI = dyn_cast<Instruction>(
Op))
649 if (L->contains(OpI))
650 AddCostRecursively(*OpI, TripCount - 1);
655 "All instructions must have a valid cost, whether the "
656 "loop is rolled or unrolled.");
659 <<
"UnrolledCost: " << UnrolledCost <<
", "
660 <<
"RolledDynamicCost: " << RolledDynamicCost <<
"\n");
667 const Loop *L,
unsigned &NumCalls,
bool &NotDuplicatable,
bool &Convergent,
672 Metrics.analyzeBasicBlock(BB,
TTI, EphValues);
673 NumCalls =
Metrics.NumInlineCandidates;
674 NotDuplicatable =
Metrics.notDuplicatable;
675 Convergent =
Metrics.convergent;
685 if (LoopSize.
isValid() && LoopSize < BEInsns + 1)
687 LoopSize = BEInsns + 1;
696 if (
MDNode *LoopID = L->getLoopID())
723 "Unroll count hint metadata should have two operands.");
725 mdconst::extract<ConstantInt>(MD->
getOperand(1))->getZExtValue();
726 assert(Count >= 1 &&
"Unroll count must be positive.");
738 unsigned MaxPercentThresholdBoost) {
739 if (
Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
741 else if (
Cost.UnrolledCost != 0)
743 return std::min(100 *
Cost.RolledDynamicCost /
Cost.UnrolledCost,
744 MaxPercentThresholdBoost);
746 return MaxPercentThresholdBoost;
754 const unsigned LoopSize;
763 const unsigned CountOverwrite = 0)
const {
765 "LoopSize should not be less than BEInsns!");
775static std::optional<unsigned>
777 const unsigned TripMultiple,
const unsigned TripCount,
784 if (PInfo.UserUnrollCount) {
791 if (PInfo.PragmaCount > 0) {
792 if ((UP.
AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
793 return PInfo.PragmaCount;
796 if (PInfo.PragmaFullUnroll && TripCount != 0)
808 assert(FullUnrollTripCount &&
"should be non-zero!");
816 return FullUnrollTripCount;
822 L, FullUnrollTripCount, DT, SE, EphValues,
TTI,
828 return FullUnrollTripCount;
833static std::optional<unsigned>
843 <<
"-unroll-allow-partial not given\n");
894 bool MaxOrZero,
unsigned TripMultiple,
unsigned LoopSize,
900 const bool UserUnrollCount =
UnrollCount.getNumOccurrences() > 0;
905 const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
906 PragmaEnableUnroll || UserUnrollCount;
908 PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
915 "explicit unroll count",
false);
926 UP.
Count = *UnrollFactor;
928 if (UserUnrollCount || (PragmaCount > 0)) {
932 UP.
Runtime |= (PragmaCount > 0);
933 return ExplicitUnroll;
935 if (ExplicitUnroll && TripCount != 0) {
949 UP.
Count = TripCount;
951 TripCount, UCE, UP)) {
952 UP.
Count = *UnrollFactor;
953 UseUpperBound =
false;
954 return ExplicitUnroll;
970 if (!TripCount && MaxTripCount && (UP.
UpperBound || MaxOrZero) &&
972 UP.
Count = MaxTripCount;
974 MaxTripCount, UCE, UP)) {
975 UP.
Count = *UnrollFactor;
976 UseUpperBound =
true;
977 return ExplicitUnroll;
986 return ExplicitUnroll;
997 UP.
Count = *UnrollFactor;
999 if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
1000 UP.
Count != TripCount)
1003 "FullUnrollAsDirectedTooLarge",
1004 L->getStartLoc(), L->getHeader())
1005 <<
"Unable to fully unroll loop as directed by unroll pragma "
1007 "unrolled size is too large.";
1011 if (UP.
Count == 0) {
1012 if (PragmaEnableUnroll)
1015 "UnrollAsDirectedTooLarge",
1016 L->getStartLoc(), L->getHeader())
1017 <<
"Unable to unroll loop as directed by unroll(enable) "
1019 "because unrolled size is too large.";
1023 return ExplicitUnroll;
1026 "All cases when TripCount is constant should be covered here.");
1027 if (PragmaFullUnroll)
1030 DEBUG_TYPE,
"CantFullUnrollAsDirectedRuntimeTripCount",
1031 L->getStartLoc(), L->getHeader())
1032 <<
"Unable to fully unroll loop as directed by unroll(full) "
1034 "because loop has a runtime trip count.";
1051 if (L->getHeader()->getParent()->hasProfileData()) {
1059 UP.
Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
1062 dbgs() <<
" will not try to unroll loop with runtime trip count "
1063 <<
"-unroll-runtime not given\n");
1072 while (UP.
Count != 0 &&
1077 unsigned OrigCount = UP.
Count;
1081 while (UP.
Count != 0 && TripMultiple % UP.
Count != 0)
1084 dbgs() <<
"Remainder loop is restricted (that could architecture "
1085 "specific or because the loop contains a convergent "
1086 "instruction), so unroll count must divide the trip "
1088 << TripMultiple <<
". Reducing unroll count from " << OrigCount
1089 <<
" to " << UP.
Count <<
".\n");
1091 using namespace ore;
1096 "DifferentUnrollCountFromDirected",
1097 L->getStartLoc(), L->getHeader())
1098 <<
"Unable to unroll loop the number of times directed by "
1099 "unroll_count pragma because remainder loop is restricted "
1100 "(that could architecture specific or because the loop "
1101 "contains a convergent instruction) and so must have an "
1103 "count that divides the loop trip multiple of "
1104 << NV(
"TripMultiple", TripMultiple) <<
". Unrolling instead "
1105 << NV(
"UnrollCount", UP.
Count) <<
" time(s).";
1112 if (MaxTripCount && UP.
Count > MaxTripCount)
1113 UP.
Count = MaxTripCount;
1119 return ExplicitUnroll;
1127 bool OnlyFullUnroll,
bool OnlyWhenForced,
bool ForgetAllSCEV,
1128 std::optional<unsigned> ProvidedCount,
1129 std::optional<unsigned> ProvidedThreshold,
1130 std::optional<bool> ProvidedAllowPartial,
1131 std::optional<bool> ProvidedRuntime,
1132 std::optional<bool> ProvidedUpperBound,
1133 std::optional<bool> ProvidedAllowPeeling,
1134 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1135 std::optional<unsigned> ProvidedFullUnrollMaxCount) {
1138 << L->getHeader()->getParent()->getName() <<
"] Loop %"
1139 << L->getHeader()->getName() <<
"\n");
1142 return LoopUnrollResult::Unmodified;
1148 Loop *ParentL = L->getParentLoop();
1149 if (ParentL !=
nullptr &&
1153 <<
" llvm.loop.unroll_and_jam.\n");
1154 return LoopUnrollResult::Unmodified;
1164 <<
" Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1165 return LoopUnrollResult::Unmodified;
1168 if (!L->isLoopSimplifyForm()) {
1170 dbgs() <<
" Not unrolling loop which is not in loop-simplify form.\n");
1171 return LoopUnrollResult::Unmodified;
1177 return LoopUnrollResult::Unmodified;
1179 bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1180 unsigned NumInlineCandidates;
1181 bool NotDuplicatable;
1184 L, SE,
TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
1185 ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1186 ProvidedFullUnrollMaxCount);
1188 L, SE,
TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
true);
1194 return LoopUnrollResult::Unmodified;
1205 LLVM_DEBUG(
dbgs() <<
" Not unrolling loop which contains instructions"
1206 <<
" with invalid cost.\n");
1207 return LoopUnrollResult::Unmodified;
1209 unsigned LoopSize = *LoopSizeIC.
getValue();
1211 if (NotDuplicatable) {
1212 LLVM_DEBUG(
dbgs() <<
" Not unrolling loop which contains non-duplicatable"
1213 <<
" instructions.\n");
1214 return LoopUnrollResult::Unmodified;
1222 if (NumInlineCandidates != 0) {
1223 LLVM_DEBUG(
dbgs() <<
" Not unrolling loop with inlinable calls.\n");
1224 return LoopUnrollResult::Unmodified;
1232 unsigned TripCount = 0;
1233 unsigned TripMultiple = 1;
1235 L->getExitingBlocks(ExitingBlocks);
1236 for (
BasicBlock *ExitingBlock : ExitingBlocks)
1238 if (!TripCount || TC < TripCount)
1239 TripCount = TripMultiple = TC;
1245 BasicBlock *ExitingBlock = L->getLoopLatch();
1246 if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1247 ExitingBlock = L->getExitingBlock();
1269 unsigned MaxTripCount = 0;
1270 bool MaxOrZero =
false;
1278 bool UseUpperBound =
false;
1280 L,
TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
1281 TripMultiple, LoopSize, UP, PP, UseUpperBound);
1283 return LoopUnrollResult::Unmodified;
1286 assert(UP.
Count == 1 &&
"Cannot perform peel and unroll in the same step");
1287 LLVM_DEBUG(
dbgs() <<
"PEELING loop %" << L->getHeader()->getName()
1288 <<
" with iteration count " << PP.
PeelCount <<
"!\n");
1302 L->setLoopAlreadyUnrolled();
1303 return LoopUnrollResult::PartiallyUnrolled;
1305 return LoopUnrollResult::Unmodified;
1309 if (OnlyFullUnroll && !(UP.
Count >= MaxTripCount)) {
1311 dbgs() <<
"Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1312 return LoopUnrollResult::Unmodified;
1320 UP.
Runtime &= TripCount == 0 && TripMultiple % UP.
Count != 0;
1323 MDNode *OrigLoopID = L->getLoopID();
1326 Loop *RemainderLoop =
nullptr;
1331 LI, &SE, &DT, &AC, &
TTI, &ORE, PreserveLCSSA, &RemainderLoop);
1332 if (UnrollResult == LoopUnrollResult::Unmodified)
1333 return LoopUnrollResult::Unmodified;
1335 if (RemainderLoop) {
1336 std::optional<MDNode *> RemainderLoopID =
1339 if (RemainderLoopID)
1340 RemainderLoop->
setLoopID(*RemainderLoopID);
1343 if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1344 std::optional<MDNode *> NewLoopID =
1348 L->setLoopID(*NewLoopID);
1352 return UnrollResult;
1358 if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
1359 L->setLoopAlreadyUnrolled();
1361 return UnrollResult;
1366class LoopUnroll :
public LoopPass {
1375 bool OnlyWhenForced;
1382 std::optional<unsigned> ProvidedCount;
1383 std::optional<unsigned> ProvidedThreshold;
1384 std::optional<bool> ProvidedAllowPartial;
1385 std::optional<bool> ProvidedRuntime;
1386 std::optional<bool> ProvidedUpperBound;
1387 std::optional<bool> ProvidedAllowPeeling;
1388 std::optional<bool> ProvidedAllowProfileBasedPeeling;
1389 std::optional<unsigned> ProvidedFullUnrollMaxCount;
1391 LoopUnroll(
int OptLevel = 2,
bool OnlyWhenForced =
false,
1392 bool ForgetAllSCEV =
false,
1393 std::optional<unsigned> Threshold = std::nullopt,
1394 std::optional<unsigned> Count = std::nullopt,
1395 std::optional<bool> AllowPartial = std::nullopt,
1396 std::optional<bool>
Runtime = std::nullopt,
1397 std::optional<bool> UpperBound = std::nullopt,
1398 std::optional<bool> AllowPeeling = std::nullopt,
1399 std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1400 std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1401 :
LoopPass(
ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1402 ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(
std::
move(Count)),
1403 ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1404 ProvidedRuntime(
Runtime), ProvidedUpperBound(UpperBound),
1405 ProvidedAllowPeeling(AllowPeeling),
1406 ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1407 ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1417 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1418 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1419 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1421 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
1422 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
1430 L, DT, LI, SE,
TTI, AC, ORE,
nullptr,
nullptr, PreserveLCSSA, OptLevel,
1431 false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1432 ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1433 ProvidedUpperBound, ProvidedAllowPeeling,
1434 ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);
1436 if (Result == LoopUnrollResult::FullyUnrolled)
1455char LoopUnroll::ID = 0;
1464 bool ForgetAllSCEV,
int Threshold,
int Count,
1465 int AllowPartial,
int Runtime,
int UpperBound,
1470 return new LoopUnroll(
1471 OptLevel, OnlyWhenForced, ForgetAllSCEV,
1472 Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1473 Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1474 AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1476 UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1477 AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1490 Loop *ParentL = L.getParentLoop();
1497 std::string LoopName = std::string(L.getName());
1502 true, OptLevel,
true,
1503 OnlyWhenForced, ForgetSCEV, std::nullopt,
1504 std::nullopt,
false,
1535 bool IsCurrentLoopValid =
false;
1542 if (SibLoop == &L) {
1543 IsCurrentLoopValid =
true;
1552 if (!IsCurrentLoopValid) {
1581 LAM = &LAMProxy->getManager();
1586 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1589 bool Changed =
false;
1596 for (
const auto &L : LI) {
1607 while (!Worklist.
empty()) {
1614 Loop *ParentL = L.getParentLoop();
1620 std::optional<bool> LocalAllowPeeling = UnrollOpts.
AllowPeeling;
1621 if (PSI && PSI->hasHugeWorkingSetSize())
1622 LocalAllowPeeling =
false;
1623 std::string LoopName = std::string(L.getName());
1627 &L, DT, &LI, SE,
TTI, AC, ORE, BFI, PSI,
1656 OS, MapClassName2PassName);
1668 <<
"profile-peeling;";
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header provides classes for managing per-loop analyses.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
static cl::opt< unsigned > UnrollMaxCount("unroll-max-count", cl::Hidden, cl::desc("Set the max unroll count for partial and runtime unrolling, for" "testing purposes"))
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
static cl::opt< unsigned > UnrollThresholdDefault("unroll-threshold-default", cl::init(150), cl::Hidden, cl::desc("Default threshold (max size of unrolled " "loop), used in all but O3 optimizations"))
static cl::opt< unsigned > FlatLoopTripCountThreshold("flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, cl::desc("If the runtime tripcount for the loop is lower than the " "threshold, the loop is considered as flat and will be less " "aggressively unrolled."))
static cl::opt< unsigned > UnrollMaxIterationsCountToAnalyze("unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of" "iterations when checking full unroll profitability"))
static LoopUnrollResult tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV, std::optional< unsigned > ProvidedCount, std::optional< unsigned > ProvidedThreshold, std::optional< bool > ProvidedAllowPartial, std::optional< bool > ProvidedRuntime, std::optional< bool > ProvidedUpperBound, std::optional< bool > ProvidedAllowPeeling, std::optional< bool > ProvidedAllowProfileBasedPeeling, std::optional< unsigned > ProvidedFullUnrollMaxCount)
static MDNode * getUnrollMetadataForLoop(const Loop *L, StringRef Name)
static cl::opt< unsigned > UnrollOptSizeThreshold("unroll-optsize-threshold", cl::init(0), cl::Hidden, cl::desc("The cost threshold for loop unrolling when optimizing for " "size"))
static bool hasUnrollFullPragma(const Loop *L)
static cl::opt< bool > UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled."))
static unsigned unrollCountPragmaValue(const Loop *L)
static bool hasUnrollEnablePragma(const Loop *L)
static std::optional< unsigned > shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< unsigned > UnrollFullMaxCount("unroll-full-max-count", cl::Hidden, cl::desc("Set the max unroll count for full unrolling, for testing purposes"))
static cl::opt< unsigned > UnrollMaxUpperBound("unroll-max-upperbound", cl::init(8), cl::Hidden, cl::desc("The max of trip count upper bound that is considered in unrolling"))
static std::optional< unsigned > shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static std::optional< EstimatedUnrollCost > analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize, unsigned MaxIterationsCountToAnalyze)
Figure out if the loop is worth full unrolling.
static cl::opt< unsigned > UnrollPartialThreshold("unroll-partial-threshold", cl::Hidden, cl::desc("The cost threshold for partial loop unrolling"))
static cl::opt< bool > UnrollAllowRemainder("unroll-allow-remainder", cl::Hidden, cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop."))
static std::optional< unsigned > shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static const unsigned NoThreshold
A magic value for use with the Threshold parameter to indicate that the loop unroll should be perform...
static cl::opt< bool > UnrollRevisitChildLoops("unroll-revisit-child-loops", cl::Hidden, cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " "This shouldn't typically be needed as child loops (or their " "clones) were already visited."))
static cl::opt< unsigned > UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling"))
static cl::opt< bool > UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts"))
static bool hasRuntimeUnrollDisablePragma(const Loop *L)
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost)
static cl::opt< unsigned > UnrollThresholdAggressive("unroll-threshold-aggressive", cl::init(300), cl::Hidden, cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " "optimizations"))
static cl::opt< unsigned > UnrollMaxPercentThresholdBoost("unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " "to the threshold when aggressively unrolling a loop due to the " "dynamic cost savings. If completely unrolling a loop will reduce " "the total runtime from X to Y, we boost the loop unroll " "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " "X/Y). This limit avoids excessive code bloat."))
static cl::opt< unsigned > PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 *1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma."))
static cl::opt< bool > UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached."))
const char LLVMTargetMachineRef TM
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
UnrollCostEstimator(Loop &L, unsigned LoopSize)
uint64_t getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP, const unsigned CountOverwrite=0) const
A container for analyses that lazily runs them and caches their results.
void clear(IRUnitT &IR, llvm::StringRef Name)
Clear any cached analysis results for a single unit of IR.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
This is the shared class of boolean and integer constants.
This is an important base class in LLVM.
This class represents an Operation in the Expression.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Function * getFunction() const
Return the function this instruction belongs to.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
void addChildLoops(ArrayRef< Loop * > NewChildLoops)
Loop passes should use this method to indicate they have added new child loops of the current loop.
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
void addSiblingLoops(ArrayRef< Loop * > NewSibLoops)
Loop passes should use this method to indicate they have added new sibling loops to the current loop.
void markLoopAsDeleted(Loop &L)
Analysis pass that exposes the LoopInfo for a function.
void verifyLoop() const
Verify loop structure.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
bool skipLoop(const Loop *L) const
Optional passes call this function to check whether the pass should be skipped.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Represents a single loop in the control flow graph.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
const MDOperand & getOperand(unsigned I) const
unsigned getNumOperands() const
Return number of MDNode operands.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
bool mustPreserveAnalysisID(char &AID) const
mustPreserveAnalysisID - This method serves the same function as getAnalysisIfAvailable,...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
bool empty() const
Determine if the PriorityWorklist is empty or not.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isBackedgeTakenCountMaxOrZero(const Loop *L)
Return true if the backedge taken count is either the value returned by getConstantMaxBackedgeTakenCo...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
size_type size() const
Determine the number of elements in the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
A version of PriorityWorklist that selects small size optimized data structures for the vector and ma...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
void initializeLoopUnrollPass(PassRegistry &)
void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, AssumptionCache *AC=nullptr, unsigned Threshold=UINT_MAX)
auto successors(const MachineBasicBlock *BB)
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
Pass * createLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false, int Threshold=-1, int Count=-1, int AllowPartial=-1, int Runtime=-1, int UpperBound=-1, int AllowPeeling=-1)
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
TransformationMode hasUnrollAndJamTransformation(const Loop *L)
cl::opt< bool > ForgetSCEVInLoopUnroll
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
TransformationMode hasUnrollTransformation(const Loop *L)
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
@ Unmodified
The loop was not modified.
@ FullyUnrolled
The loop was fully unrolled into straight-line code.
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
const char *const LLVMLoopUnrollFollowupAll
TransformationMode
The mode sets how eager a transformation should be applied.
@ TM_ForcedByUser
The transformation was directed by the user, e.g.
@ TM_Disable
The transformation should not be applied.
@ TM_Enable
The transformation should be applied without considering a cost model.
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
void appendLoopsToWorklist(RangeT &&, SmallPriorityWorklist< Loop *, 4 > &)
Utility that implements appending of loops onto a worklist given a range.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
const char *const LLVMLoopUnrollFollowupRemainder
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const char *const LLVMLoopUnrollFollowupUnrolled
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
InstructionCost ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &EphValues, unsigned BEInsns)
ApproximateLoopSize - Approximate the size of the loop.
LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const llvm::TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop=nullptr)
Unroll the given loop by Count.
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI)
Perform some cleanup and simplifications on loops after unrolling.
MDNode * GetUnrollMetadata(MDNode *LoopID, StringRef Name)
Given an llvm.loop loop id metadata node, returns the loop hint metadata node with the given name (fo...
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &VMap)
VMap is the value-map that maps instructions from the original loop to instructions in the last peele...
Implement std::hash so that hash_code can be used in STL containers.
Utility to calculate the size and a few similar metrics for a set of basic blocks.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
TargetTransformInfo & TTI
bool OnlyWhenForced
If false, use a cost model to determine whether unrolling of a loop is profitable.
const bool ForgetSCEV
If true, forget all loops when unrolling.
std::optional< unsigned > FullUnrollMaxCount
std::optional< bool > AllowPartial
std::optional< bool > AllowRuntime
std::optional< bool > AllowProfileBasedPeeling
std::optional< bool > AllowPeeling
std::optional< bool > AllowUpperBound
A CRTP mix-in to automatically provide informational APIs needed for passes.