LLVM 23.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
/// Arbitrary cap on the number of IV users that gives LSR an early chance to
/// bail out. The cap is far beyond what LSR can conceivably solve, so it should
/// not affect generated code; it only catches the worst cases before LSR burns
/// too much compile time and stack space.
static constexpr unsigned MaxIVUsers = 200;
139
/// Upper bound on the size of an expression that SCEV-based salvaging will try
/// to translate into a DIExpression. The bound is chosen so that debug info is
/// not excessively increased and salvaging stays cheap for the compiler.
static constexpr unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
/// This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// This represents the set of LSRUse indices which reference
  /// a particular register. Bit N is set when the use with index N
  /// references the register.
  SmallBitVector UsedByIndices;

  /// Write a short summary of this record to \p OS.
  void print(raw_ostream &OS) const;
  /// Print the summary to errs(), followed by a newline.
  void dump() const;
};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing a
474/// value that satisfies a use. It may include broken-out immediates and scaled
474/// registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty, the
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing the recurrent expr related to the current loop in
493 /// the formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch.
543static void DoInitialMatch(const SCEV *S, Loop *L,
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
601 DoInitialMatch(S, L, Good, Bad, SE);
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
614 canonicalize(*L);
615}
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
633 if (Scale != 1)
634 return true;
635
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
643 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
644 // loop, we want to swap the reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
656void Formula::canonicalize(const Loop &L) {
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
736void Formula::deleteBaseReg(const SCEV *&S) {
737 if (&S != &BaseRegs.back())
738 std::swap(S, BaseRegs.back());
739 BaseRegs.pop_back();
740}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
761void Formula::print(raw_ostream &OS) const {
762 ListSeparator Plus(" + ");
763 if (BaseGV) {
764 OS << Plus;
765 BaseGV->printAsOperand(OS, /*PrintType=*/false);
766 }
767 if (BaseOffset.isNonZero())
768 OS << Plus << BaseOffset;
769
770 for (const SCEV *BaseReg : BaseRegs)
771 OS << Plus << "reg(" << *BaseReg << ')';
772
773 if (HasBaseReg && BaseRegs.empty())
774 OS << Plus << "**error: HasBaseReg**";
775 else if (!HasBaseReg && !BaseRegs.empty())
776 OS << Plus << "**error: !HasBaseReg**";
777
778 if (Scale != 0) {
779 OS << Plus << Scale << "*reg(";
780 if (ScaledReg)
781 OS << *ScaledReg;
782 else
783 OS << "<unknown>";
784 OS << ')';
785 }
786 if (UnfoldedOffset.isNonZero())
787 OS << Plus << "imm(" << UnfoldedOffset << ')';
788}
789
790LLVM_DUMP_METHOD void Formula::dump() const {
791 print(errs()); errs() << '\n';
792}
793#endif
794
795/// Return true if the given addrec can be sign-extended without changing its
796/// value.
798 Type *WideTy =
800 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
801}
802
803/// Return true if the given add can be sign-extended without changing its
804/// value.
805static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
808 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
809}
810
811/// Return true if the given mul can be sign-extended without changing its
812/// value.
813static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
814 Type *WideTy =
816 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
817 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
818}
819
820/// Return an expression for LHS /s RHS, if it can be determined and if the
821/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
822/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
823/// the multiplication may overflow, which is useful when the result will be
824/// used in a context where the most significant bits are ignored.
825static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
826 ScalarEvolution &SE,
827 bool IgnoreSignificantBits = false) {
828 // Handle the trivial case, which works for any SCEV type.
829 if (LHS == RHS)
830 return SE.getConstant(LHS->getType(), 1);
831
832 // Handle a few RHS special cases.
834 if (RC) {
835 const APInt &RA = RC->getAPInt();
836 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
837 // some folding.
838 if (RA.isAllOnes()) {
839 if (LHS->getType()->isPointerTy())
840 return nullptr;
841 return SE.getMulExpr(LHS, RC);
842 }
843 // Handle x /s 1 as x.
844 if (RA == 1)
845 return LHS;
846 }
847
848 // Check for a division of a constant by a constant.
850 if (!RC)
851 return nullptr;
852 const APInt &LA = C->getAPInt();
853 const APInt &RA = RC->getAPInt();
854 if (LA.srem(RA) != 0)
855 return nullptr;
856 return SE.getConstant(LA.sdiv(RA));
857 }
858
859 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
861 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
862 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
863 IgnoreSignificantBits);
864 if (!Step) return nullptr;
865 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
866 IgnoreSignificantBits);
867 if (!Start) return nullptr;
868 // FlagNW is independent of the start value, step direction, and is
869 // preserved with smaller magnitude steps.
870 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
871 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
872 }
873 return nullptr;
874 }
875
876 // Distribute the sdiv over add operands, if the add doesn't overflow.
878 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
880 for (const SCEV *S : Add->operands()) {
881 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
882 if (!Op) return nullptr;
883 Ops.push_back(Op);
884 }
885 return SE.getAddExpr(Ops);
886 }
887 return nullptr;
888 }
889
890 // Check for a multiply operand that we can pull RHS out of.
892 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
893 // Handle special case C1*X*Y /s C2*X*Y.
894 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
895 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
896 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
897 const SCEVConstant *RC =
898 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
899 if (LC && RC) {
901 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
902 if (LOps == ROps)
903 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
904 }
905 }
906 }
907
909 bool Found = false;
910 for (const SCEV *S : Mul->operands()) {
911 if (!Found)
912 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
913 IgnoreSignificantBits)) {
914 S = Q;
915 Found = true;
916 }
917 Ops.push_back(S);
918 }
919 return Found ? SE.getMulExpr(Ops) : nullptr;
920 }
921 return nullptr;
922 }
923
924 // Otherwise we don't know.
925 return nullptr;
926}
927
928/// Extracts an immediate operand from \p Ops and replaces the operand with
929/// zero. If \p PreferScalable is true and \p Ops contains both a scalable and
930/// non-scalable offsets, the scalable offset will be extracted.
932 ScalarEvolution &SE,
933 bool PreferScalable) {
934 const APInt *C;
935 SCEVUse *Op = nullptr;
936 Immediate Result = Immediate::getZero();
937
938 // Ops are sorted by their SCEVType (the order of SCEVTypes enum). So, for an
939 // AddExpr the possible order of operands is:
940 // Constant < VScale < Truncate < ZeroExtend < SignExtend < MulExpr < ...
941
942 // This means fixed-size immediates will always appear on the LHS:
943 SCEVUse &S = Ops.front();
944 if (match(S, m_scev_APInt(C)) && !C->isZero() &&
945 C->getSignificantBits() <= 64) {
946 Op = &S;
947 Result = Immediate::getFixed(C->getSExtValue());
948 }
949
950 // But scalable immediates, which are MulExpr(Vscale, Constant), can appear
951 // later in the operand list:
952 if (EnableVScaleImmediates && (Result.isZero() || PreferScalable)) {
953 for (SCEVUse &S : Ops) {
954 // We know anything past scMulExpr will not be a vscale immediate.
955 if (S->getSCEVType() > scMulExpr)
956 break;
958 Op = &S;
959 Result = Immediate::getScalable(C->getSExtValue());
960 break;
961 }
962 }
963 }
964
965 if (Result.isNonZero()) {
966 SCEVUse &S = *Op;
967 S = SE.getConstant(S->getType(), 0);
968 }
969
970 return Result;
971}
972
973 /// If S involves the addition of a constant integer value, return that integer
974 /// value, and mutate S to point to a new SCEV with that value excluded.
///
/// \param PreferScalable forwarded to ExtractImmediateOperand to choose between
/// a fixed and a scalable offset when both are present.
/// \returns the extracted immediate, or a zero immediate if \p S was left
/// unchanged.
975 static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE,
976 bool PreferScalable = false) {
977 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// Work on a copy of the operands so the add is rebuilt only on success.
978 SmallVector<SCEVUse, 8> NewOps(Add->operands());
979 Immediate Result = ExtractImmediateOperand(NewOps, SE, PreferScalable);
// No direct immediate operand: look inside the first operand instead.
980 if (Result.isZero())
981 Result = ExtractImmediate(NewOps.front(), SE, PreferScalable);
982 if (Result.isNonZero())
983 S = SE.getAddExpr(NewOps);
984 return Result;
985 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// For a recurrence only the start value (first operand) is searched.
986 SmallVector<SCEVUse, 8> NewOps(AR->operands());
987 Immediate Result = ExtractImmediate(NewOps.front(), SE, PreferScalable);
988 if (Result.isNonZero())
989 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
990 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
// NOTE(review): listing line 991 is absent from this extract. It presumably
// supplies the no-wrap flags argument and closes the getAddRecExpr call;
// confirm against upstream.
992 return Result;
993 }
// Any other expression is treated as a single-operand list.
994 return ExtractImmediateOperand({S}, SE, PreferScalable);
995 }
996
997 /// If S involves the addition of a GlobalValue address, return that symbol, and
998 /// mutate S to point to a new SCEV with that value excluded.
///
/// \returns the extracted GlobalValue, or nullptr if none was found (in which
/// case \p S is left unchanged).
// NOTE(review): listing line 999 (the function signature) is absent from this
// extract. The recursive calls below show the function is named ExtractSymbol
// and is called as (operand, SE); restore the line from upstream.
1000 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
1001 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
// The symbol itself is replaced by a zero constant of the global's type.
1002 S = SE.getConstant(GV->getType(), 0);
1003 return GV;
1004 }
1005 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// For an add, only the last operand is searched for a symbol.
1006 SmallVector<SCEVUse, 8> NewOps(Add->operands());
1007 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
1008 if (Result)
1009 S = SE.getAddExpr(NewOps);
1010 return Result;
1011 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// For a recurrence, only the start value (first operand) is searched.
1012 SmallVector<SCEVUse, 8> NewOps(AR->operands());
1013 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
1014 if (Result)
1015 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
1016 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
// NOTE(review): listing line 1017 is absent from this extract. It presumably
// supplies the no-wrap flags argument and closes the getAddRecExpr call;
// confirm against upstream.
1018 return Result;
1019 }
1020 return nullptr;
1021 }
1022
1023 /// Returns true if the specified instruction is using the specified value as an
1024 /// address.
///
/// \param Inst the user instruction to inspect.
/// \param OperandVal the value whose role in \p Inst is being queried.
// NOTE(review): listing line 1025 (the first line of the signature, carrying
// the function name and the TTI parameter used below) is absent from this
// extract; restore it from upstream.
1026 Instruction *Inst, Value *OperandVal) {
// Every load is reported as an address use; OperandVal is not compared.
1027 bool isAddress = isa<LoadInst>(Inst);
1028 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// For a store only the pointer operand counts, not the stored value.
1029 if (SI->getPointerOperand() == OperandVal)
1030 isAddress = true;
1031 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1032 // Addressing modes can also be folded into prefetches and a variety
1033 // of intrinsics.
1034 switch (II->getIntrinsicID()) {
// These intrinsics are checked on argument 0 only.
1035 case Intrinsic::memset:
1036 case Intrinsic::prefetch:
1037 case Intrinsic::masked_load:
1038 if (II->getArgOperand(0) == OperandVal)
1039 isAddress = true;
1040 break;
// masked_store is checked on argument 1 only.
1041 case Intrinsic::masked_store:
1042 if (II->getArgOperand(1) == OperandVal)
1043 isAddress = true;
1044 break;
// memmove/memcpy are checked on both argument 0 and argument 1.
1045 case Intrinsic::memmove:
1046 case Intrinsic::memcpy:
1047 if (II->getArgOperand(0) == OperandVal ||
1048 II->getArgOperand(1) == OperandVal)
1049 isAddress = true;
1050 break;
// Anything else: ask the target whether it treats the intrinsic as a memory
// access and, if so, which value is its pointer.
1051 default: {
1052 MemIntrinsicInfo IntrInfo;
1053 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1054 if (IntrInfo.PtrVal == OperandVal)
1055 isAddress = true;
1056 }
1057 }
1058 }
1059 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1060 if (RMW->getPointerOperand() == OperandVal)
1061 isAddress = true;
1062 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1063 if (CmpX->getPointerOperand() == OperandVal)
1064 isAddress = true;
1065 }
1066 return isAddress;
1067 }
1068
1069/// Return the type of the memory being accessed.
1070static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1071 Instruction *Inst, Value *OperandVal) {
1072 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1073
1074 // First get the type of memory being accessed.
1075 if (Type *Ty = Inst->getAccessType())
1076 AccessTy.MemTy = Ty;
1077
1078 // Then get the pointer address space.
1079 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1080 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1081 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1082 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1083 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1084 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1085 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1086 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1087 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1088 switch (II->getIntrinsicID()) {
1089 case Intrinsic::prefetch:
1090 case Intrinsic::memset:
1091 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1092 AccessTy.MemTy = OperandVal->getType();
1093 break;
1094 case Intrinsic::memmove:
1095 case Intrinsic::memcpy:
1096 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1097 AccessTy.MemTy = OperandVal->getType();
1098 break;
1099 case Intrinsic::masked_load:
1100 AccessTy.AddrSpace =
1101 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1102 break;
1103 case Intrinsic::masked_store:
1104 AccessTy.AddrSpace =
1105 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1106 break;
1107 default: {
1108 MemIntrinsicInfo IntrInfo;
1109 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1110 AccessTy.AddrSpace
1111 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1112 }
1113
1114 break;
1115 }
1116 }
1117 }
1118
1119 return AccessTy;
1120}
1121
1122/// Return true if this AddRec is already a phi in its loop.
1123static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1124 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1125 if (SE.isSCEVable(PN.getType()) &&
1126 (SE.getEffectiveSCEVType(PN.getType()) ==
1127 SE.getEffectiveSCEVType(AR->getType())) &&
1128 SE.getSCEV(&PN) == AR)
1129 return true;
1130 }
1131 return false;
1132}
1133
1134 /// Check if expanding this expression is likely to incur significant cost. This
1135 /// is tricky because SCEV doesn't track which expressions are actually computed
1136 /// by the current IR.
1137 ///
1138 /// We currently allow expansion of IV increments that involve adds,
1139 /// multiplication by constants, and AddRecs from existing phis.
1140 ///
1141 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1142 /// obvious multiple of the UDivExpr.
///
/// \returns true if \p S is considered expensive to expand, false otherwise.
1143 static bool isHighCostExpansion(const SCEV *S,
// NOTE(review): listing line 1144 is absent from this extract. It presumably
// declares the `Processed` set parameter used throughout the body; confirm
// against upstream.
1145 ScalarEvolution &SE) {
1146 // Zero/One operand expressions
1147 switch (S->getSCEVType()) {
1148 case scUnknown:
1149 case scConstant:
1150 case scVScale:
1151 return false;
// Casts cost whatever their operand costs.
1152 case scTruncate:
1153 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1154 Processed, SE);
1155 case scZeroExtend:
1156 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1157 Processed, SE);
1158 case scSignExtend:
1159 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1160 Processed, SE);
1161 default:
1162 break;
1163 }
1164
// An expression already in Processed is reported as cheap on this visit.
1165 if (!Processed.insert(S).second)
1166 return false;
1167
// An add is expensive as soon as any one operand is.
1168 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1169 for (const SCEV *S : Add->operands()) {
1170 if (isHighCostExpansion(S, Processed, SE))
1171 return true;
1172 }
1173 return false;
1174 }
1175
1176 const SCEV *Op0, *Op1;
1177 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1178 // Multiplication by a constant is ok
1179 if (isa<SCEVConstant>(Op0))
1180 return isHighCostExpansion(Op1, Processed, SE);
1181
1182 // If we have the value of one operand, check if an existing
1183 // multiplication already generates this expression.
1184 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1185 Value *UVal = U->getValue();
1186 for (User *UR : UVal->users()) {
1187 // If U is a constant, it may be used by a ConstantExpr.
// NOTE(review): listing line 1188 is absent from this extract. It presumably
// defines `UI` from the user `UR`; confirm against upstream.
// Note that the first SCEVable Mul user decides the result: the function
// returns whether that user's SCEV equals S, without checking later users.
1189 if (UI && UI->getOpcode() == Instruction::Mul &&
1190 SE.isSCEVable(UI->getType())) {
1191 return SE.getSCEV(UI) == S;
1192 }
1193 }
1194 }
1195 }
1196
1197 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1198 if (isExistingPhi(AR, SE))
1199 return false;
1200 }
1201
1202 // For now, consider any other type of expression (div/mul/min/max) high cost.
1203 return true;
1204 }
1205
1206 namespace {
1207
// Forward declaration so the prototypes below can take LSRUse by reference
// before the class is defined.
1208 class LSRUse;
1209
1210 } // end anonymous namespace
1211
1212 /// Check if the addressing mode defined by \p F is completely
1213 /// folded in \p LU at isel time.
1214 /// This includes address-mode folding and special icmp tricks.
1215 /// This function returns true if \p LU can accommodate what \p F
1216 /// defines and up to 1 base + 1 scaled + offset.
1217 /// In other words, if \p F has several base registers, this function may
1218 /// still return true. Therefore, users still need to account for
1219 /// additional base registers and/or unfolded offsets to derive an
1220 /// accurate cost model.
1221 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1222 const LSRUse &LU, const Formula &F);
1223
1224 /// Get the cost of the scaling factor used in \p F for \p LU. \p L is the
/// loop passed through by the cost model (see Cost::RateFormula).
1225 static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1226 const LSRUse &LU, const Formula &F,
1227 const Loop &L);
1228
1229namespace {
1230
1231 /// This class is used to measure and compare candidate formulae.
///
/// Metrics accumulate in an LSRCost record as formulae are rated. Once Lose()
/// is called every metric is saturated and the cost is treated as unusable.
1232 class Cost {
// The loop being rated and the analyses consulted while rating; all three are
// set by the constructor.
1233 const Loop *L = nullptr;
1234 ScalarEvolution *SE = nullptr;
1235 const TargetTransformInfo *TTI = nullptr;
// Accumulated metrics; every field is zeroed by the constructor.
1236 TargetTransformInfo::LSRCost C;
// NOTE(review): listing line 1237 is absent from this extract. It presumably
// declares the `AMK` member that the constructor initializes and
// RateRegister reads; confirm against upstream.
1238
1239 public:
1240 Cost() = delete;
1241 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
// NOTE(review): listing line 1242 is absent from this extract. It presumably
// declares the `AMK` constructor parameter and opens the initializer list;
// confirm against upstream.
1243 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1244 C.Insns = 0;
1245 C.NumRegs = 0;
1246 C.AddRecCost = 0;
1247 C.NumIVMuls = 0;
1248 C.NumBaseAdds = 0;
1249 C.ImmCost = 0;
1250 C.SetupCost = 0;
1251 C.ScaleCost = 0;
1252 }
1253
/// Return true if this cost is preferable to \p Other.
1254 bool isLess(const Cost &Other) const;
1255
/// Saturate every metric, marking this cost as a loser.
1256 void Lose();
1257
1258 #ifndef NDEBUG
1259 // Once any of the metrics loses, they must all remain losers.
// Valid means: either the bitwise OR of all metrics is not all-ones, or every
// metric is all-ones.
1260 bool isValid() {
1261 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1262 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1263 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1264 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1265 }
1266 #endif
1267
/// Return true once Lose() has been applied; NumRegs serves as the sentinel.
1268 bool isLoser() {
1269 assert(isValid() && "invalid cost");
1270 return C.NumRegs == ~0u;
1271 }
1272
/// Add the cost of formula \p F, used by \p LU, to this cost. \p Regs collects
/// the registers rated so far. A formula that uses any register in
/// \p VisitedRegs, or in the optional \p LoserRegs, makes the cost a loser.
1273 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1274 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1275 bool HardwareLoopProfitable,
1276 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1277
1278 void print(raw_ostream &OS) const;
1279 void dump() const;
1280
1281 private:
// Helpers behind RateFormula: RatePrimaryRegister rates a register the first
// time it is seen, and RateRegister does the per-register accounting.
1282 void RateRegister(const Formula &F, const SCEV *Reg,
1283 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1284 bool HardwareLoopProfitable);
1285 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1286 SmallPtrSetImpl<const SCEV *> &Regs,
1287 const LSRUse &LU, bool HardwareLoopProfitable,
1288 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1289 };
1290
1291/// An operand value in an instruction which is to be replaced with some
1292/// equivalent, possibly strength-reduced, replacement.
1293struct LSRFixup {
1294 /// The instruction which will be updated.
1295 Instruction *UserInst = nullptr;
1296
1297 /// The operand of the instruction which will be replaced. The operand may be
1298 /// used more than once; every instance will be replaced.
1299 Value *OperandValToReplace = nullptr;
1300
1301 /// If this user is to use the post-incremented value of an induction
1302 /// variable, this set is non-empty and holds the loops associated with the
1303 /// induction variable.
1304 PostIncLoopSet PostIncLoops;
1305
1306 /// A constant offset to be added to the LSRUse expression. This allows
1307 /// multiple fixups to share the same LSRUse with different offsets, for
1308 /// example in an unrolled loop.
1309 Immediate Offset = Immediate::getZero();
1310
1311 LSRFixup() = default;
1312
1313 bool isUseFullyOutsideLoop(const Loop *L) const;
1314
1315 void print(raw_ostream &OS) const;
1316 void dump() const;
1317};
1318
1319 /// This class holds the state that LSR keeps for each use in IVUsers, as well
1320 /// as uses invented by LSR itself. It includes information about what kinds of
1321 /// things can be folded into the user, information about the user itself, and
1322 /// information about how the use may be satisfied. TODO: Represent multiple
1323 /// users of the same expression in common?
1324 class LSRUse {
// Sorted register lists of the formulae inserted so far; consulted by
// InsertFormula and HasFormulaWithSameRegs to reject duplicates.
1325 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1326
1327 public:
1328 /// An enum for a kind of use, indicating what types of scaled and immediate
1329 /// operands it might support.
1330 enum KindType {
1331 Basic, ///< A normal use, with no folding.
1332 Special, ///< A special case of basic, allowing -1 scales.
1333 Address, ///< An address use; folding according to TargetLowering
1334 ICmpZero ///< An equality icmp with both operands folded into one.
1335 // TODO: Add a generic icmp too?
1336 };
1337
/// A SCEV pointer packed together with the kind of use.
1338 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1339
1340 KindType Kind;
1341 MemAccessTy AccessTy;
1342
1343 /// The list of operands which are to be replaced.
// NOTE(review): listing line 1344 is absent from this extract. It presumably
// declares the `Fixups` container used by getNewFixup/pushFixup below;
// confirm against upstream.
1345
1346 /// Keep track of the min and max offsets of the fixups.
// The bounds start inverted (min at the fixed maximum, max at the fixed
// minimum), presumably so the first pushFixup call can replace both.
1347 Immediate MinOffset = Immediate::getFixedMax();
1348 Immediate MaxOffset = Immediate::getFixedMin();
1349
1350 /// This records whether all of the fixups using this LSRUse are outside of
1351 /// the loop, in which case some special-case heuristics may be used.
1352 bool AllFixupsOutsideLoop = true;
1353
1354 /// This records whether all of the fixups using this LSRUse are unconditional
1355 /// within the loop, meaning they will be executed on every path to the loop
1356 /// latch. This includes fixups before early exits.
1357 bool AllFixupsUnconditional = true;
1358
1359 /// RigidFormula is set to true to guarantee that this use will be associated
1360 /// with a single formula--the one that initially matched. Some SCEV
1361 /// expressions cannot be expanded. This allows LSR to consider the registers
1362 /// used by those expressions without the need to expand them later after
1363 /// changing the formula.
1364 bool RigidFormula = false;
1365
1366 /// A list of ways to build a value that can satisfy this user. After the
1367 /// list is populated, one of these is selected heuristically and used to
1368 /// formulate a replacement for OperandValToReplace in UserInst.
1369 SmallVector<Formula, 12> Formulae;
1370
1371 /// The set of register candidates used by all formulae in this LSRUse.
1372 SmallPtrSet<const SCEV *, 4> Regs;
1373
1374 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1375
/// Append a default-constructed fixup and return a reference to it. Unlike
/// pushFixup, this does not touch MinOffset/MaxOffset.
1376 LSRFixup &getNewFixup() {
1377 Fixups.push_back(LSRFixup());
1378 return Fixups.back();
1379 }
1380
/// Append a copy of \p f and widen MinOffset/MaxOffset to cover its offset.
1381 void pushFixup(LSRFixup &f) {
1382 Fixups.push_back(f);
1383 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1384 MaxOffset = f.Offset;
1385 if (Immediate::isKnownLT(f.Offset, MinOffset))
1386 MinOffset = f.Offset;
1387 }
1388
1389 bool HasFormulaWithSameRegs(const Formula &F) const;
1390 float getNotSelectedProbability(const SCEV *Reg) const;
1391 bool InsertFormula(const Formula &F, const Loop &L);
1392 void DeleteFormula(Formula &F);
1393 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1394
1395 void print(raw_ostream &OS) const;
1396 void dump() const;
1397 };
1398
1399} // end anonymous namespace
1400
// Component-wise overload: tests whether an addressing mode given as separate
// pieces (global, offset, base register flag, scale) folds completely for a
// use of kind \p Kind. Declared here because Cost::RateFormula calls it before
// the definition appears.
1401 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1402 LSRUse::KindType Kind, MemAccessTy AccessTy,
1403 GlobalValue *BaseGV, Immediate BaseOffset,
1404 bool HasBaseReg, int64_t Scale,
1405 Instruction *Fixup = nullptr);
1406
/// Estimate a setup cost for \p Reg by walking its expression tree.
///
/// \param Depth remaining recursion budget; once it reaches zero, anything that
/// is not a SCEVUnknown or a constant contributes nothing.
/// \returns the summed cost of the leaves reached within the depth budget.
1407 static unsigned getSetupCost(const SCEV *Reg, unsigned Depth,
1408 const TargetTransformInfo &TTI) {
// An opaque value always costs one.
1409 if (isa<SCEVUnknown>(Reg))
1410 return 1;
1411 if (const auto *C = dyn_cast<SCEVConstant>(Reg)) {
// Constants cost zero or one depending on the target's immediate cost query.
1412 if (TTI.getIntImmCost(C->getAPInt(), C->getType(),
// NOTE(review): listing lines 1413-1414 are absent from this extract. They
// presumably complete the getIntImmCost call and the comparison that
// decides when a constant is free; confirm against upstream.
1415 return 0;
1416 return 1;
1417 }
// Depth budget exhausted: stop descending.
1418 if (Depth == 0)
1419 return 0;
// Only the start value of a recurrence contributes.
1420 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1421 return getSetupCost(S->getStart(), Depth - 1, TTI);
1422 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1423 return getSetupCost(S->getOperand(), Depth - 1, TTI);
// N-ary expressions cost the sum of their operands.
1424 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1425 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1426 [&](unsigned i, const SCEV *Reg) {
1427 return i + getSetupCost(Reg, Depth - 1, TTI);
1428 });
1429 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1430 return getSetupCost(S->getLHS(), Depth - 1, TTI) +
1431 getSetupCost(S->getRHS(), Depth - 1, TTI);
1432 return 0;
1433 }
1434
1435 /// Tally up interesting quantities from the given register.
///
/// Updates NumRegs, AddRecCost, SetupCost and NumIVMuls for \p Reg as used by
/// formula \p F in use \p LU. May call Lose() and return early.
1436 void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1437 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1438 bool HardwareLoopProfitable) {
1439 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1440 // If this is an addrec for another loop, it should be an invariant
1441 // with respect to L since L is the innermost loop (at least
1442 // for now LSR only handles innermost loops).
1443 if (AR->getLoop() != L) {
1444 // If the AddRec exists, consider its register free and leave it alone.
1445 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1446 return;
1447
1448 // It is bad to allow LSR for current loop to add induction variables
1449 // for its sibling loops.
1450 if (!AR->getLoop()->contains(L)) {
1451 Lose();
1452 return;
1453 }
1454
1455 // Otherwise, it will be an invariant with respect to Loop L.
1456 ++C.NumRegs;
1457 return;
1458 }
1459
// Each addrec of this loop costs one unless one of the cases below makes it
// free.
1460 unsigned LoopCost = 1;
1461 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1462 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1463 const SCEV *Start;
1464 const APInt *Step;
1465 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1466 // If the step size matches the base offset, we could use pre-indexed
1467 // addressing.
1468 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1469 F.BaseOffset.isFixed() &&
1470 *Step == F.BaseOffset.getFixedValue();
// Post-indexing needs a loop-invariant start value that is not a constant.
1471 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1472 !isa<SCEVConstant>(Start) &&
1473 SE->isLoopInvariant(Start, L);
1474 // We can only pre or post index when the load/store is unconditional.
1475 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1476 LoopCost = 0;
1477 }
1478 }
1479
1480 // If the loop counts down to zero and we'll be using a hardware loop then
1481 // the addrec will be combined into the hardware loop instruction.
1482 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1483 HardwareLoopProfitable)
1484 LoopCost = 0;
1485 C.AddRecCost += LoopCost;
1486
1487 // Add the step value register, if it needs one.
1488 // TODO: The non-affine case isn't precisely modeled here.
1489 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1490 if (!Regs.count(AR->getOperand(1))) {
1491 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1492 if (isLoser())
1493 return;
1494 }
1495 }
1496 }
1497 ++C.NumRegs;
1498
1499 // Rough heuristic; favor registers which don't require extra setup
1500 // instructions in the preheader.
1501 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit, *TTI);
1502 // Ensure we don't, even with the recursion limit, produce invalid costs.
// Capping at 1 << 16 keeps SetupCost away from the ~0u sentinel that Lose()
// writes.
1503 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1504
1505 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
// NOTE(review): listing line 1506 is absent from this extract. It presumably
// holds the second operand of the `&&` and the terminating semicolon; confirm
// against upstream.
1507 }
1508
1509/// Record this register in the set. If we haven't seen it before, rate
1510/// it. Optional LoserRegs provides a way to declare any formula that refers to
1511/// one of those regs an instant loser.
1512void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1513 SmallPtrSetImpl<const SCEV *> &Regs,
1514 const LSRUse &LU, bool HardwareLoopProfitable,
1515 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1516 if (LoserRegs && LoserRegs->count(Reg)) {
1517 Lose();
1518 return;
1519 }
1520 if (Regs.insert(Reg).second) {
1521 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1522 if (LoserRegs && isLoser())
1523 LoserRegs->insert(Reg);
1524 }
1525}
1526
1527void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1528 const DenseSet<const SCEV *> &VisitedRegs,
1529 const LSRUse &LU, bool HardwareLoopProfitable,
1530 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1531 if (isLoser())
1532 return;
1533 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1534 // Tally up the registers.
1535 unsigned PrevAddRecCost = C.AddRecCost;
1536 unsigned PrevNumRegs = C.NumRegs;
1537 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1538 if (const SCEV *ScaledReg = F.ScaledReg) {
1539 if (VisitedRegs.count(ScaledReg)) {
1540 Lose();
1541 return;
1542 }
1543 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1544 LoserRegs);
1545 if (isLoser())
1546 return;
1547 }
1548 for (const SCEV *BaseReg : F.BaseRegs) {
1549 if (VisitedRegs.count(BaseReg)) {
1550 Lose();
1551 return;
1552 }
1553 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1554 LoserRegs);
1555 if (isLoser())
1556 return;
1557 }
1558
1559 // Determine how many (unfolded) adds we'll need inside the loop.
1560 size_t NumBaseParts = F.getNumRegs();
1561 if (NumBaseParts > 1)
1562 // Do not count the base and a possible second register if the target
1563 // allows to fold 2 registers.
1564 C.NumBaseAdds +=
1565 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1566 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1567
1568 // Accumulate non-free scaling amounts.
1569 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1570
1571 // Tally up the non-zero immediates.
1572 for (const LSRFixup &Fixup : LU.Fixups) {
1573 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1574 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1575 if (F.BaseGV)
1576 C.ImmCost += 64; // Handle symbolic values conservatively.
1577 // TODO: This should probably be the pointer size.
1578 else if (Offset.isNonZero())
1579 C.ImmCost +=
1580 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1581
1582 // Check with target if this offset with this instruction is
1583 // specifically not supported.
1584 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1585 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1586 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1587 C.NumBaseAdds++;
1588 } else {
1589 // Incompatible immediate type, increase cost to avoid using
1590 C.ImmCost += 2048;
1591 }
1592 }
1593
1594 // If we don't count instruction cost exit here.
1595 if (!InsnsCost) {
1596 assert(isValid() && "invalid cost");
1597 return;
1598 }
1599
1600 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1601 // additional instruction (at least fill).
1602 // TODO: Need distinguish register class?
1603 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1604 TTI->getRegisterClassForType(false, F.getType())) - 1;
1605 if (C.NumRegs > TTIRegNum) {
1606 // Cost already exceeded TTIRegNum, then only newly added register can add
1607 // new instructions.
1608 if (PrevNumRegs > TTIRegNum)
1609 C.Insns += (C.NumRegs - PrevNumRegs);
1610 else
1611 C.Insns += (C.NumRegs - TTIRegNum);
1612 }
1613
1614 // If ICmpZero formula ends with not 0, it could not be replaced by
1615 // just add or sub. We'll need to compare final result of AddRec.
1616 // That means we'll need an additional instruction. But if the target can
1617 // macro-fuse a compare with a branch, don't count this extra instruction.
1618 // For -10 + {0, +, 1}:
1619 // i = i + 1;
1620 // cmp i, 10
1621 //
1622 // For {-10, +, 1}:
1623 // i = i + 1;
1624 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1625 !TTI->canMacroFuseCmp())
1626 C.Insns++;
1627 // Each new AddRec adds 1 instruction to calculation.
1628 C.Insns += (C.AddRecCost - PrevAddRecCost);
1629
1630 // BaseAdds adds instructions for unfolded registers.
1631 if (LU.Kind != LSRUse::ICmpZero)
1632 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1633 assert(isValid() && "invalid cost");
1634}
1635
1636/// Set this cost to a losing value.
1637void Cost::Lose() {
1638 C.Insns = std::numeric_limits<unsigned>::max();
1639 C.NumRegs = std::numeric_limits<unsigned>::max();
1640 C.AddRecCost = std::numeric_limits<unsigned>::max();
1641 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1642 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1643 C.ImmCost = std::numeric_limits<unsigned>::max();
1644 C.SetupCost = std::numeric_limits<unsigned>::max();
1645 C.ScaleCost = std::numeric_limits<unsigned>::max();
1646}
1647
1648/// Choose the lower cost.
1649bool Cost::isLess(const Cost &Other) const {
1650 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1651 C.Insns != Other.C.Insns)
1652 return C.Insns < Other.C.Insns;
1653 return TTI->isLSRCostLess(C, Other.C);
1654}
1655
1656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1657void Cost::print(raw_ostream &OS) const {
1658 if (InsnsCost)
1659 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1660 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1661 if (C.AddRecCost != 0)
1662 OS << ", with addrec cost " << C.AddRecCost;
1663 if (C.NumIVMuls != 0)
1664 OS << ", plus " << C.NumIVMuls << " IV mul"
1665 << (C.NumIVMuls == 1 ? "" : "s");
1666 if (C.NumBaseAdds != 0)
1667 OS << ", plus " << C.NumBaseAdds << " base add"
1668 << (C.NumBaseAdds == 1 ? "" : "s");
1669 if (C.ScaleCost != 0)
1670 OS << ", plus " << C.ScaleCost << " scale cost";
1671 if (C.ImmCost != 0)
1672 OS << ", plus " << C.ImmCost << " imm cost";
1673 if (C.SetupCost != 0)
1674 OS << ", plus " << C.SetupCost << " setup cost";
1675}
1676
1677LLVM_DUMP_METHOD void Cost::dump() const {
1678 print(errs()); errs() << '\n';
1679}
1680#endif
1681
1682/// Test whether this fixup always uses its value outside of the given loop.
1683bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1684 // PHI nodes use their value in their incoming blocks.
1685 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1686 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1687 if (PN->getIncomingValue(i) == OperandValToReplace &&
1688 L->contains(PN->getIncomingBlock(i)))
1689 return false;
1690 return true;
1691 }
1692
1693 return !L->contains(UserInst);
1694}
1695
1696#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1697void LSRFixup::print(raw_ostream &OS) const {
1698 OS << "UserInst=";
1699 // Store is common and interesting enough to be worth special-casing.
1700 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1701 OS << "store ";
1702 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1703 } else if (UserInst->getType()->isVoidTy())
1704 OS << UserInst->getOpcodeName();
1705 else
1706 UserInst->printAsOperand(OS, /*PrintType=*/false);
1707
1708 OS << ", OperandValToReplace=";
1709 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1710
1711 for (const Loop *PIL : PostIncLoops) {
1712 OS << ", PostIncLoop=";
1713 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1714 }
1715
1716 if (Offset.isNonZero())
1717 OS << ", Offset=" << Offset;
1718}
1719
1720LLVM_DUMP_METHOD void LSRFixup::dump() const {
1721 print(errs()); errs() << '\n';
1722}
1723#endif
1724
1725 /// Test whether this use has a formula which has the same registers as the given
1726 /// formula.
1727 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
// NOTE(review): listing line 1728 is absent from this extract. It presumably
// declares `Key` and seeds it with F's base registers; confirm against
// upstream.
1729 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1730 // Unstable sort by host order ok, because this is only used for uniquifying.
1731 llvm::sort(Key);
// The sorted register list is the lookup key into the set of inserted formulae.
1732 return Uniquifier.count(Key);
1733 }
1734
1735/// The function returns a probability of selecting formula without Reg.
1736float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1737 unsigned FNum = 0;
1738 for (const Formula &F : Formulae)
1739 if (F.referencesReg(Reg))
1740 FNum++;
1741 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1742}
1743
1744 /// If the given formula has not yet been inserted, add it to the list, and
1745 /// return true. Return false otherwise. The formula must be in canonical form.
///
/// Also returns false, without inserting, when RigidFormula is set and a
/// formula is already present.
1746 bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1747 assert(F.isCanonical(L) && "Invalid canonical representation");
1748
// A rigid use keeps only the first formula it was given.
1749 if (!Formulae.empty() && RigidFormula)
1750 return false;
1751
// NOTE(review): listing line 1752 is absent from this extract. It presumably
// declares `Key` and seeds it with F's base registers; confirm against
// upstream.
1753 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1754 // Unstable sort by host order ok, because this is only used for uniquifying.
1755 llvm::sort(Key);
1756
// Reject a formula whose register set has been seen before.
1757 if (!Uniquifier.insert(Key).second)
1758 return false;
1759
1760 // Using a register to hold the value of 0 is not profitable.
1761 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1762 "Zero allocated in a scaled register!");
1763 #ifndef NDEBUG
1764 for (const SCEV *BaseReg : F.BaseRegs)
1765 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1766 #endif
1767
1768 // Add the formula to the list.
1769 Formulae.push_back(F);
1770
1771 // Record registers now being used by this use.
1772 Regs.insert_range(F.BaseRegs);
1773 if (F.ScaledReg)
1774 Regs.insert(F.ScaledReg);
1775
1776 return true;
1777 }
1778
1779/// Remove the given formula from this use's list.
1780void LSRUse::DeleteFormula(Formula &F) {
1781 if (&F != &Formulae.back())
1782 std::swap(F, Formulae.back());
1783 Formulae.pop_back();
1784}
1785
1786/// Recompute the Regs field, and update RegUses.
1787void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1788 // Now that we've filtered out some formulae, recompute the Regs set.
1789 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1790 Regs.clear();
1791 for (const Formula &F : Formulae) {
1792 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1793 Regs.insert_range(F.BaseRegs);
1794 }
1795
1796 // Update the RegTracker.
1797 for (const SCEV *S : OldRegs)
1798 if (!Regs.count(S))
1799 RegUses.dropRegister(S, LUIdx);
1800}
1801
1802#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1803void LSRUse::print(raw_ostream &OS) const {
1804 OS << "LSR Use: Kind=";
1805 switch (Kind) {
1806 case Basic: OS << "Basic"; break;
1807 case Special: OS << "Special"; break;
1808 case ICmpZero: OS << "ICmpZero"; break;
1809 case Address:
1810 OS << "Address of ";
1811 if (AccessTy.MemTy->isPointerTy())
1812 OS << "pointer"; // the full pointer type could be really verbose
1813 else {
1814 OS << *AccessTy.MemTy;
1815 }
1816
1817 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1818 }
1819
1820 OS << ", Offsets={";
1821 bool NeedComma = false;
1822 for (const LSRFixup &Fixup : Fixups) {
1823 if (NeedComma) OS << ',';
1824 OS << Fixup.Offset;
1825 NeedComma = true;
1826 }
1827 OS << '}';
1828
1829 if (AllFixupsOutsideLoop)
1830 OS << ", all-fixups-outside-loop";
1831
1832 if (AllFixupsUnconditional)
1833 OS << ", all-fixups-unconditional";
1834}
1835
1836LLVM_DUMP_METHOD void LSRUse::dump() const {
1837 print(errs()); errs() << '\n';
1838}
1839#endif
1840
// Component-wise test of whether an addressing mode folds completely for a use
// of kind \p Kind. Address uses are decided by the target; the other kinds are
// decided by the fixed rules below.
// NOTE(review): listing line 1841 (the first line of this signature, carrying
// the function name and the TTI parameter) is absent from this extract; the
// matching prototype appears earlier in the file. Restore it from upstream.
1842 LSRUse::KindType Kind, MemAccessTy AccessTy,
1843 GlobalValue *BaseGV, Immediate BaseOffset,
1844 bool HasBaseReg, int64_t Scale,
1845 Instruction *Fixup /* = nullptr */) {
1846 switch (Kind) {
1847 case LSRUse::Address: {
// Split the offset into fixed and scalable parts; exactly one is passed as
// non-zero to the target hook.
1848 int64_t FixedOffset =
1849 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1850 int64_t ScalableOffset =
1851 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1852 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1853 HasBaseReg, Scale, AccessTy.AddrSpace,
1854 Fixup, ScalableOffset);
1855 }
1856 case LSRUse::ICmpZero:
1857 // There's not even a target hook for querying whether it would be legal to
1858 // fold a GV into an ICmp.
1859 if (BaseGV)
1860 return false;
1861
1862 // ICmp only has two operands; don't allow more than two non-trivial parts.
1863 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1864 return false;
1865
1866 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1867 // putting the scaled register in the other operand of the icmp.
1868 if (Scale != 0 && Scale != -1)
1869 return false;
1870
1871 // If we have low-level target information, ask the target if it can fold an
1872 // integer immediate on an icmp.
1873 if (BaseOffset.isNonZero()) {
1874 // We don't have an interface to query whether the target supports
1875 // icmpzero against scalable quantities yet.
1876 if (BaseOffset.isScalable())
1877 return false;
1878
1879 // We have one of:
1880 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1881 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1882 // Offs is the ICmp immediate.
1883 if (Scale == 0)
1884 // The cast does the right thing with
1885 // std::numeric_limits<int64_t>::min().
// Negating in uint64_t avoids signed overflow for the minimum value.
1886 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1887 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1888 }
1889
1890 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1891 return true;
1892
1893 case LSRUse::Basic:
1894 // Only handle single-register values.
1895 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1896
1897 case LSRUse::Special:
1898 // Special case Basic to handle -1 scales.
1899 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1900 }
1901
// Every enumerator returns above; reaching here means Kind held a bad value.
1902 llvm_unreachable("Invalid LSRUse Kind!");
1903 }
1904
1906 Immediate MinOffset, Immediate MaxOffset,
1907 LSRUse::KindType Kind, MemAccessTy AccessTy,
1908 GlobalValue *BaseGV, Immediate BaseOffset,
1909 bool HasBaseReg, int64_t Scale) {
 // NOTE(review): the first line of this signature (listing line 1905) is
 // absent from this listing; TTI is presumably declared there -- confirm.
 //
 // Range variant: returns true only if BaseOffset can be added to both
 // MinOffset and MaxOffset without overflow and both resulting offsets pass
 // the single-offset isAMCompletelyFolded() check.

 // A non-zero base offset must agree with both range ends on whether it is
 // scalable; otherwise the sums below would mix the two kinds.
1910 if (BaseOffset.isNonZero() &&
1911 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1912 BaseOffset.isScalable() != MaxOffset.isScalable()))
1913 return false;
1914 // Check for overflow.
 // The sums are formed in uint64_t and cast back. Without wrap-around the sum
 // exceeds Base exactly when the addend is positive; any mismatch between the
 // two sides of the != is treated as overflow.
1915 int64_t Base = BaseOffset.getKnownMinValue();
1916 int64_t Min = MinOffset.getKnownMinValue();
1917 int64_t Max = MaxOffset.getKnownMinValue();
1918 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1919 return false;
 // From here on MinOffset/MaxOffset (by-value parameters) hold the absolute
 // offsets Base + Min and Base + Max, keeping their original scalability.
1920 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1921 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1922 return false;
1923 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1924
 // Both extremes of the range must be foldable.
1925 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1926 HasBaseReg, Scale) &&
1927 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1928 HasBaseReg, Scale);
1929}
1930
1932 Immediate MinOffset, Immediate MaxOffset,
1933 LSRUse::KindType Kind, MemAccessTy AccessTy,
1934 const Formula &F, const Loop &L) {
 // NOTE(review): the first line of this signature (listing line 1931) is
 // absent from this listing; TTI is presumably declared there -- confirm.
 //
 // Formula variant: asserts the precondition described below, then forwards
 // F's BaseGV, BaseOffset, HasBaseReg and Scale to the component-wise range
 // check. L is only used by the assertion.
1935 // For the purpose of isAMCompletelyFolded either having a canonical formula
1936 // or a scale not equal to zero is correct.
1937 // Problems may arise from non canonical formulae having a scale == 0.
1938 // Strictly speaking it would be best to just rely on canonical formulae.
1939 // However, when we generate the scaled formulae, we first check that the
1940 // scaling factor is profitable before computing the actual ScaledReg for
1941 // compile time sake.
1942 assert((F.isCanonical(L) || F.Scale != 0));
1943 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1944 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1945}
1946
1947/// Test whether we know how to expand the current formula.
///
/// Returns true if the given components are completely folded over the offset
/// range [MinOffset, MaxOffset] (see isAMCompletelyFolded), or, when Scale is
/// exactly 1, if they are completely folded once the scaled register is
/// treated as a base register instead (HasBaseReg = true, Scale = 0).
1948static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1949 Immediate MaxOffset, LSRUse::KindType Kind,
1950 MemAccessTy AccessTy, GlobalValue *BaseGV,
1951 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1952 // We know how to expand completely foldable formulae.
1953 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1954 BaseOffset, HasBaseReg, Scale) ||
1955 // Or formulae that use a base register produced by a sum of base
1956 // registers.
 // The fallback query deliberately ignores the caller's HasBaseReg and
 // Scale and asks about "base register, no scale" instead.
1957 (Scale == 1 &&
1958 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1959 BaseGV, BaseOffset, true, 0));
1960}
1961
/// Convenience overload: unpacks BaseGV, BaseOffset, HasBaseReg and Scale from
/// \p F and forwards to the component-wise isLegalUse.
1962static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1963 Immediate MaxOffset, LSRUse::KindType Kind,
1964 MemAccessTy AccessTy, const Formula &F) {
1965 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1966 F.BaseOffset, F.HasBaseReg, F.Scale);
1967}
1968
1970 Immediate Offset) {
 // NOTE(review): the first line of this signature (listing line 1969) is
 // absent from this listing; TTI is presumably declared there -- confirm.
 //
 // Asks the target whether Offset is a legal add immediate, using the
 // scalable-immediate hook for scalable offsets and the plain hook otherwise.
1971 if (Offset.isScalable())
1972 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1973
1974 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1975}
1976
1978 const LSRUse &LU, const Formula &F) {
 // NOTE(review): the first line of this signature (listing line 1977) is
 // absent from this listing; TTI is presumably declared there -- confirm.
 //
 // Use-level variant: checks formula F against use LU, either fixup by fixup
 // or over LU's [MinOffset, MaxOffset] range.
1979 // Target may want to look at the user instructions.
 // In that mode each fixup is queried separately with its own combined offset
 // (F.BaseOffset + Fixup.Offset) and its user instruction; a single failing
 // fixup rejects the formula.
1980 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1981 for (const LSRFixup &Fixup : LU.Fixups)
1982 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1983 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1984 F.Scale, Fixup.UserInst))
1985 return false;
1986 return true;
1987 }
1988
 // Otherwise only the extremes of the use's offset range are checked.
1989 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1990 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1991 F.Scale);
1992}
1993
1995 const LSRUse &LU, const Formula &F,
1996 const Loop &L) {
 // NOTE(review): the first line of this signature (listing line 1994, with
 // the return type, function name and leading parameter) is absent from this
 // listing; TTI is presumably declared there -- confirm.
 //
 // Computes the cost attributed to the scaled register of formula F when it
 // is used by LU.

 // A formula without a scale has no scaling cost.
1997 if (!F.Scale)
1998 return 0;
1999
2000 // If the use is not completely folded in that instruction, we will have to
2001 // pay an extra cost only for scale != 1.
2002 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
2003 LU.AccessTy, F, L))
2004 return F.Scale != 1;
2005
2006 switch (LU.Kind) {
2007 case LSRUse::Address: {
2008 // Check the scaling factor cost with both the min and max offsets.
 // Only the pair matching the kind of F.BaseOffset (scalable or fixed) is
 // filled in; the other pair stays 0.
2009 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
2010 if (F.BaseOffset.isScalable()) {
2011 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
2012 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
2013 } else {
2014 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
2015 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
2016 }
2017 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
2018 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
2019 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
2020 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
2021 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
2022 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
2023
 // The formula was just found to be completely folded, so the target is
 // expected to report a valid cost for both offsets.
2024 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
2025 "Legal addressing mode has an illegal cost!");
 // Report the larger of the two costs.
2026 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
2027 }
2028 case LSRUse::ICmpZero:
2029 case LSRUse::Basic:
2030 case LSRUse::Special:
2031 // The use is completely folded, i.e., everything is folded into the
2032 // instruction.
2033 return 0;
2034 }
2035
2036 llvm_unreachable("Invalid LSRUse Kind!");
2037}
2038
2040 LSRUse::KindType Kind, MemAccessTy AccessTy,
2041 GlobalValue *BaseGV, Immediate BaseOffset,
2042 bool HasBaseReg) {
 // NOTE(review): the first line of this signature (listing line 2039) is
 // absent from this listing; TTI is presumably declared there -- confirm.
 //
 // Returns whether BaseGV + BaseOffset is foldable when combined with a
 // conservatively chosen register shape (see the scale selection below).
2043 // Fast-path: zero is always foldable.
2044 if (BaseOffset.isZero() && !BaseGV)
2045 return true;
2046
2047 // Conservatively, create an address with an immediate and a
2048 // base and a scale.
 // ICmpZero uses are probed with a scale of -1, every other kind with 1.
2049 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2050
2051 // Canonicalize a scale of 1 to a base register if the formula doesn't
2052 // already have a base register.
2053 if (!HasBaseReg && Scale == 1) {
2054 Scale = 0;
2055 HasBaseReg = true;
2056 }
2057
2058 // FIXME: Try with + without a scale? Maybe based on TTI?
2059 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2060 // default for many architectures, not just AArch64 SVE. More investigation
2061 // needed later to determine if this should be used more widely than just
2062 // on scalable types.
 // DropScaledForVScale is defined outside this view (NOTE(review): presumably
 // a command-line option -- confirm). When it is set, scalable memory types
 // with a base register and a non-zero offset are probed without a scaled
 // register.
2063 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2064 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2065 Scale = 0;
2066
2067 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2068 HasBaseReg, Scale);
2069}
2070
2072 ScalarEvolution &SE, Immediate MinOffset,
2073 Immediate MaxOffset, LSRUse::KindType Kind,
2074 MemAccessTy AccessTy, const SCEV *S,
2075 bool HasBaseReg) {
 // NOTE(review): the first line of this signature (listing line 2071) is
 // absent from this listing; TTI is presumably declared there -- confirm.
 //
 // SCEV variant: returns true if S consists of nothing but an immediate
 // and/or a symbol and that combination is foldable over the offset range
 // [MinOffset, MaxOffset].
2076 // Fast-path: zero is always foldable.
2077 if (S->isZero()) return true;
2078
2079 // Conservatively, create an address with an immediate and a
2080 // base and a scale.
 // Work on a copy of S. NOTE(review): ExtractImmediate/ExtractSymbol are
 // defined elsewhere; they presumably strip the part they return from SCopy,
 // which is why SCopy is tested for zero afterwards -- confirm.
2081 SCEVUse SCopy = S;
2082 Immediate BaseOffset = ExtractImmediate(SCopy, SE);
2083 GlobalValue *BaseGV = ExtractSymbol(SCopy, SE);
2084
2085 // If there's anything else involved, it's not foldable.
2086 if (!SCopy->isZero())
2087 return false;
2088
2089 // Fast-path: zero is always foldable.
2090 if (BaseOffset.isZero() && !BaseGV)
2091 return true;
2092
 // Scalable immediates are conservatively rejected on this path.
2093 if (BaseOffset.isScalable())
2094 return false;
2095
2096 // Conservatively, create an address with an immediate and a
2097 // base and a scale.
2098 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2099
2100 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2101 BaseOffset, HasBaseReg, Scale);
2102}
2103
2104namespace {
2105
2106/// An individual increment in a Chain of IV increments. Relate an IV user to
2107/// an expression that computes the IV it uses from the IV used by the previous
2108/// link in the Chain.
2109///
2110/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2111/// original IVOperand. The head of the chain's IVOperand is only valid during
2112/// chain collection, before LSR replaces IV users. During chain generation,
2113/// IncExpr can be used to find the new IVOperand that computes the same
2114/// expression.
2115struct IVInc {
 // The instruction that uses the IV.
2116 Instruction *UserInst;
 // The IV value used by UserInst (see the validity caveat above for the
 // chain head).
2117 Value* IVOperand;
 // Increment relative to the previous link, or the absolute expression for
 // the head of a chain.
2118 const SCEV *IncExpr;
2119
 // Stores the three components as given; no validation is performed.
2120 IVInc(Instruction *U, Value *O, const SCEV *E)
2121 : UserInst(U), IVOperand(O), IncExpr(E) {}
2122};
2123
2124// The list of IV increments in program order. We typically add the head of a
2125// chain without finding subsequent links.
2126struct IVChain {
 // NOTE(review): listing line 2127 is absent from this view; it presumably
 // declares the Incs container used throughout this struct -- confirm.
 // Second argument of the head constructor below; its exact meaning is not
 // visible in this chunk.
2128 const SCEV *ExprBase = nullptr;
2129
2130 IVChain() = default;
 // Starts a chain whose only element is Head.
2131 IVChain(const IVInc &Head, const SCEV *Base)
2132 : Incs(1, Head), ExprBase(Base) {}
2133
2134 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2135
2136 // Return the first increment in the chain.
 // This skips Incs[0], the chain head, so [begin(), end()) covers only the
 // increments that follow the head. The chain must not be empty.
2137 const_iterator begin() const {
2138 assert(!Incs.empty());
2139 return std::next(Incs.begin());
2140 }
2141 const_iterator end() const {
2142 return Incs.end();
2143 }
2144
2145 // Returns true if this chain contains any increments.
 // That is, at least one element in addition to the head.
2146 bool hasIncs() const { return Incs.size() >= 2; }
2147
2148 // Add an IVInc to the end of this chain.
2149 void add(const IVInc &X) { Incs.push_back(X); }
2150
2151 // Returns the last UserInst in the chain.
2152 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2153
2154 // Returns true if IncExpr can be profitably added to this chain.
 // Declared here only; the definition is outside this view.
2155 bool isProfitableIncrement(const SCEV *OperExpr,
2156 const SCEV *IncExpr,
2157 ScalarEvolution&);
2158};
2159
2160/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2161/// between FarUsers that definitely cross IV increments and NearUsers that may
2162/// be used between IV increments.
2163struct ChainUsers {
 // Users that definitely cross IV increments.
2164 SmallPtrSet<Instruction*, 4> FarUsers;
 // Users that may be used between IV increments.
2165 SmallPtrSet<Instruction*, 4> NearUsers;
2166};
2167
2168/// This class holds state for the main loop strength reduction logic.
///
/// NOTE(review): this listing is missing several lines of the class (listing
/// lines 2179, 2206, 2217, 2310 and 2318); the affected spots are marked below.
2169class LSRInstance {
 // Analyses and context supplied through the constructor's parameters.
2170 IVUsers &IU;
2171 ScalarEvolution &SE;
2172 DominatorTree &DT;
2173 LoopInfo &LI;
2174 AssumptionCache &AC;
2175 TargetLibraryInfo &TLI;
2176 const TargetTransformInfo &TTI;
2177 Loop *const L;
2178 MemorySSAUpdater *MSSAU;
 // NOTE(review): listing line 2179 is absent from this view.
2180 mutable SCEVExpander Rewriter;
 // Reported to callers through getChanged().
2181 bool Changed = false;
2182 bool HardwareLoopProfitable = false;
2183
2184 /// This is the insert position at which the current loop's induction
2185 /// variable increment should be placed. In simple loops, this is the latch block's
2186 /// terminator. But in more complicated cases, this is a position which will
2187 /// dominate all the in-loop post-increment users.
2188 Instruction *IVIncInsertPos = nullptr;
2189
2190 /// Interesting factors between use strides.
2191 ///
2192 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2193 /// default, a SmallDenseSet, because we need to use the full range of
2194 /// int64_ts, and there's currently no good way of doing that with
2195 /// SmallDenseSet.
2196 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2197
2198 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2199 /// the solution is not profitable.
2200 Cost BaselineCost;
2201
2202 /// Interesting use types, to facilitate truncation reuse.
2203 SmallSetVector<Type *, 4> Types;
2204
2205 /// The list of interesting uses.
 // NOTE(review): the member declaration documented by the comment above
 // (listing line 2206) is absent from this view.
2207
2208 /// Track which uses use which register candidates.
2209 RegUseTracker RegUses;
2210
2211 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2212 // have more than a few IV increment chains in a loop. Missing a Chain falls
2213 // back to normal LSR behavior for those uses.
2214 static const unsigned MaxChains = 8;
2215
2216 /// IV users can form a chain of IV increments.
 // NOTE(review): the member declaration documented by the comment above
 // (listing line 2217) is absent from this view.
2218
2219 /// IV users that belong to profitable IVChains.
2220 SmallPtrSet<Use*, MaxChains> IVIncSet;
2221
2222 /// Induction variables that were generated and inserted by the SCEV Expander.
2223 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2224
2225 // Inserting instructions in the loop and using them as PHI's input could
2226 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2227 // corresponding incoming block is not loop exiting). So collect all such
2228 // instructions to form LCSSA for them later.
2229 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2230
 // Loop-condition and shadow-IV preprocessing.
2231 void OptimizeShadowIV();
2232 bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2233 Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
2234 void OptimizeLoopTermCond();
2235
 // IV increment chain collection and generation.
2236 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2237 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2238 void FinalizeChain(IVChain &Chain);
2239 void CollectChains();
2240 void GenerateIVChain(const IVChain &Chain,
2241 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2242
2243 void CollectInterestingTypesAndFactors();
2244 void CollectFixupsAndInitialFormulae();
2245
2246 // Support for sharing of LSRUses between LSRFixups.
2247 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2248 UseMapTy UseMap;
2249
2250 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2251 LSRUse::KindType Kind, MemAccessTy AccessTy);
2252
2253 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2254 MemAccessTy AccessTy);
2255
2256 void DeleteUse(LSRUse &LU, size_t LUIdx);
2257
2258 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2259
2260 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2261 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2262 void CountRegisters(const Formula &F, size_t LUIdx);
2263 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2264 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2265
2266 void CollectLoopInvariantFixupsAndFormulae();
2267
 // Formula generators.
2268 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2269 unsigned Depth = 0);
2270
2271 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2272 const Formula &Base, unsigned Depth,
2273 size_t Idx, bool IsScaledReg = false);
2274 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2275 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2276 const Formula &Base, size_t Idx,
2277 bool IsScaledReg = false);
2278 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2279 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2280 const Formula &Base,
2281 const SmallVectorImpl<Immediate> &Worklist,
2282 size_t Idx, bool IsScaledReg = false);
2283 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2284 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2285 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2286 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2287 void GenerateCrossUseConstantOffsets();
2288 void GenerateAllReuseFormulae();
2289
2290 void FilterOutUndesirableDedicatedRegisters();
2291
 // Search-space estimation and pruning.
2292 size_t EstimateSearchSpaceComplexity() const;
2293 void NarrowSearchSpaceByDetectingSupersets();
2294 void NarrowSearchSpaceByCollapsingUnrolledCode();
2295 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2296 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2297 void NarrowSearchSpaceByFilterPostInc();
2298 void NarrowSearchSpaceByDeletingCostlyFormulas();
2299 void NarrowSearchSpaceByPickingWinnerRegs();
2300 void NarrowSearchSpaceUsingHeuristics();
2301
 // Solver.
2302 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2303 Cost &SolutionCost,
2304 SmallVectorImpl<const Formula *> &Workspace,
2305 const Cost &CurCost,
2306 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2307 DenseSet<const SCEV *> &VisitedRegs) const;
2308 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2309
 // NOTE(review): listing line 2310 (presumably the return type of the
 // declaration below) is absent from this view.
2311 HoistInsertPosition(BasicBlock::iterator IP,
2312 const SmallVectorImpl<Instruction *> &Inputs) const;
2313 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2314 const LSRFixup &LF,
2315 const LSRUse &LU) const;
2316
 // NOTE(review): listing line 2318 (a middle line of Expand's parameter list)
 // is absent from this view.
2317 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2319 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2320 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2321 const Formula &F,
2322 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2323 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2324 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2325 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2326
2327public:
2328 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2329 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2330 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2331
 // Accessors for the results recorded in the members above.
2332 bool getChanged() const { return Changed; }
2333 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2334 return ScalarEvolutionIVs;
2335 }
2336
 // Printing and debugging helpers.
2337 void print_factors_and_types(raw_ostream &OS) const;
2338 void print_fixups(raw_ostream &OS) const;
2339 void print_uses(raw_ostream &OS) const;
2340 void print(raw_ostream &OS) const;
2341 void dump() const;
2342};
2343
2344} // end anonymous namespace
2345
2346/// If IV is used in an int-to-float cast inside the loop then try to eliminate
2347/// the cast operation.
///
/// At most one cast is rewritten per call: after the first successful
/// replacement the function sets Changed and stops scanning.
///
/// NOTE(review): listing lines 2421, 2430 and 2432 are absent from this view;
/// the affected spots are marked below.
2348void LSRInstance::OptimizeShadowIV() {
 // Nothing is attempted when the backedge-taken count is not computable.
2349 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2350 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2351 return;
2352
2353 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2354 UI != E; /* empty */) {
 // The iterator is advanced before the candidate is examined.
 // NOTE(review): presumably so that erasing ShadowUse further down cannot
 // disturb the iteration -- confirm.
2355 IVUsers::const_iterator CandidateUI = UI;
2356 ++UI;
2357 Instruction *ShadowUse = CandidateUI->getUser();
2358 Type *DestTy = nullptr;
2359 bool IsSigned = false;
2360
2361 /* If shadow use is an int->float cast then insert a second IV
2362 to eliminate this cast.
2363
2364 for (unsigned i = 0; i < n; ++i)
2365 foo((double)i);
2366
2367 is transformed into
2368
2369 double d = 0.0;
2370 for (unsigned i = 0; i < n; ++i, ++d)
2371 foo(d);
2372 */
2373 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2374 IsSigned = false;
2375 DestTy = UCast->getDestTy();
2376 }
2377 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2378 IsSigned = true;
2379 DestTy = SCast->getDestTy();
2380 }
 // DestTy stays null when the user is neither uitofp nor sitofp.
2381 if (!DestTy) continue;
2382
2383 // If target does not support DestTy natively then do not apply
2384 // this transformation.
2385 if (!TTI.isTypeLegal(DestTy)) continue;
2386
 // The value being cast must be a PHI with exactly two incoming values.
2387 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2388 if (!PH) continue;
2389 if (PH->getNumIncomingValues() != 2) continue;
2390
2391 // If the calculation in integers overflows, the result in FP type will
2392 // differ. So we only can do this transformation if we are guaranteed to not
2393 // deal with overflowing values
2394 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2395 if (!AR) continue;
2396 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2397 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2398
 // Skip destination types without a known mantissa width and integer types
 // wider than the mantissa.
2399 Type *SrcTy = PH->getType();
2400 int Mantissa = DestTy->getFPMantissaWidth();
2401 if (Mantissa == -1) continue;
2402 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2403 continue;
2404
 // Identify which incoming edge comes from the preheader; the other one is
 // taken to be the latch edge.
2405 unsigned Entry, Latch;
2406 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2407 Entry = 0;
2408 Latch = 1;
2409 } else {
2410 Entry = 1;
2411 Latch = 0;
2412 }
2413
 // The initial value must be a constant integer so that it can be converted
 // to an FP constant, honouring the signedness of the cast.
2414 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2415 if (!Init) continue;
2416 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2417 (double)Init->getSExtValue() :
2418 (double)Init->getZExtValue());
2419
2420 BinaryOperator *Incr =
 // NOTE(review): listing line 2421 (the initializer of Incr) is absent from
 // this view.
2422 if (!Incr) continue;
2423 if (Incr->getOpcode() != Instruction::Add
2424 && Incr->getOpcode() != Instruction::Sub)
2425 continue;
2426
2427 /* Initialize new IV, double d = 0.0 in above example. */
2428 ConstantInt *C = nullptr;
 // NOTE(review): listing lines 2430 and 2432 (the statements guarded by the
 // two conditions below, presumably assigning C) are absent from this view.
2429 if (Incr->getOperand(0) == PH)
2431 else if (Incr->getOperand(1) == PH)
2433 else
2434 continue;
2435
2436 if (!C) continue;
2437
2438 // Ignore negative constants, as the code below doesn't handle them
2439 // correctly. TODO: Remove this restriction.
2440 if (!C->getValue().isStrictlyPositive())
2441 continue;
2442
2443 /* Add new PHINode. */
2444 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2445 NewPH->setDebugLoc(PH->getDebugLoc());
2446
2447 /* create new increment. '++d' in above example. */
 // An integer add becomes an fadd and an integer sub becomes an fsub; the
 // new instruction is created at Incr's position.
2448 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2449 BinaryOperator *NewIncr = BinaryOperator::Create(
2450 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2451 : Instruction::FSub,
2452 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2453 NewIncr->setDebugLoc(Incr->getDebugLoc());
2454
 // Wire the new PHI to the same predecessor blocks as the integer PHI.
2455 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2456 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2457
2458 /* Remove cast operation */
2459 ShadowUse->replaceAllUsesWith(NewPH);
2460 ShadowUse->eraseFromParent();
2461 Changed = true;
 // Stop after the first successful rewrite.
2462 break;
2463 }
2464}
2465
2466/// If Cond has an operand that is an expression of an IV, set the IV user and
2467/// stride information and return true, otherwise return false.
///
/// The search is a linear scan over IU for an entry whose user is \p Cond.
/// On success \p CondUse points at the first such entry; on failure it is
/// left untouched.
2468bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2469 for (IVStrideUse &U : IU)
2470 if (U.getUser() == Cond) {
2471 // NOTE: we could handle setcc instructions with multiple uses here, but
2472 // InstCombine does it as well for simple uses, it's not clear that it
2473 // occurs enough in real life to handle.
2474 CondUse = &U;
2475 return true;
2476 }
2477 return false;
2478}
2479
2480/// Rewrite the loop's terminating condition if it uses a max computation.
2481///
2482/// This is a narrow solution to a specific, but acute, problem. For loops
2483/// like this:
2484///
2485/// i = 0;
2486/// do {
2487/// p[i] = 0.0;
2488/// } while (++i < n);
2489///
2490/// the trip count isn't just 'n', because 'n' might not be positive. And
2491/// unfortunately this can come up even for loops where the user didn't use
2492/// a C do-while loop. For example, seemingly well-behaved top-test loops
2493/// will commonly be lowered like this:
2494///
2495/// if (n > 0) {
2496/// i = 0;
2497/// do {
2498/// p[i] = 0.0;
2499/// } while (++i < n);
2500/// }
2501///
2502/// and then it's possible for subsequent optimization to obscure the if
2503/// test in such a way that indvars can't find it.
2504///
2505/// When indvars can't find the if test in loops like this, it creates a
2506/// max expression, which allows it to give the loop a canonical
2507/// induction variable:
2508///
2509/// i = 0;
2510/// max = n < 1 ? 1 : n;
2511/// do {
2512/// p[i] = 0.0;
2513/// } while (++i != max);
2514///
2515/// Canonical induction variables are necessary because the loop passes
2516/// are designed around them. The most obvious example of this is the
2517/// LoopInfo analysis, which doesn't remember trip count values. It
2518/// expects to be able to rediscover the trip count each time it is
2519/// needed, and it does this using a simple analysis that only succeeds if
2520/// the loop has a canonical induction variable.
2521///
2522/// However, when it comes time to generate code, the maximum operation
2523/// can be quite costly, especially if it's inside of an outer loop.
2524///
2525/// This function solves this problem by detecting this type of loop and
2526/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2527/// the instructions for the maximum computation.
///
/// Returns \p Cond unchanged whenever the pattern does not match. On success
/// the old compare and the select are erased, \p CondUse is redirected to the
/// replacement compare, and that new compare is returned.
///
/// NOTE(review): listing lines 2583 and 2628 are absent from this view; the
/// affected spots are marked below.
2528Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
2529 // Check that the loop matches the pattern we're looking for.
2530 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2531 Cond->getPredicate() != CmpInst::ICMP_NE)
2532 return Cond;
2533
 // The right-hand operand must be a select with no users other than Cond.
2534 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2535 if (!Sel || !Sel->hasOneUse()) return Cond;
2536
2537 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2538 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2539 return Cond;
2540 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2541
2542 // Add one to the backedge-taken count to get the trip count.
 // The select must compute exactly this trip count.
2543 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2544 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2545
2546 // Check for a max calculation that matches the pattern. There's no check
2547 // for ICMP_ULE here because the comparison would be with zero, which
2548 // isn't interesting.
 // An smax found on the backedge-taken count itself selects the inclusive
 // signed predicate; an smax/umax found on the trip count selects the strict
 // signed/unsigned predicate.
2549 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2550 const SCEVNAryExpr *Max = nullptr;
2551 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2552 Pred = ICmpInst::ICMP_SLE;
2553 Max = S;
2554 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2555 Pred = ICmpInst::ICMP_SLT;
2556 Max = S;
2557 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2558 Pred = ICmpInst::ICMP_ULT;
2559 Max = U;
2560 } else {
2561 // No match; bail.
2562 return Cond;
2563 }
2564
2565 // To handle a max with more than two operands, this optimization would
2566 // require additional checking and setup.
2567 if (Max->getNumOperands() != 2)
2568 return Cond;
2569
2570 const SCEV *MaxLHS = Max->getOperand(0);
2571 const SCEV *MaxRHS = Max->getOperand(1);
2572
2573 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2574 // for a comparison with 1. For <= and >=, a comparison with zero.
2575 if (!MaxLHS ||
2576 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2577 return Cond;
2578
2579 // Check the relevant induction variable for conformance to
2580 // the pattern.
2581 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
 // NOTE(review): listing line 2583 (the pattern argument that completes the
 // match() call below) is absent from this view.
2582 if (!match(IV,
2584 return Cond;
2585
2586 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2587 "Loop condition operand is an addrec in a different loop!");
2588
2589 // Check the right operand of the select, and remember it, as it will
2590 // be used in the new comparison instruction.
2591 Value *NewRHS = nullptr;
2592 if (ICmpInst::isTrueWhenEqual(Pred)) {
2593 // Look for n+1, and grab n.
 // Both select arms are inspected; a match on operand 2 overrides one on
 // operand 1.
2594 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2595 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2596 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2597 NewRHS = BO->getOperand(0);
2598 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2599 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2600 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2601 NewRHS = BO->getOperand(0);
2602 if (!NewRHS)
2603 return Cond;
2604 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2605 NewRHS = Sel->getOperand(1);
2606 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2607 NewRHS = Sel->getOperand(2);
2608 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
 // Neither arm matches, but MaxRHS wraps an IR value that can be used
 // directly.
2609 NewRHS = SU->getValue();
2610 else
2611 // Max doesn't match expected pattern.
2612 return Cond;
2613
2614 // Determine the new comparison opcode. It may be signed or unsigned,
2615 // and the original comparison may be either equality or inequality.
2616 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2617 Pred = CmpInst::getInversePredicate(Pred);
2618
2619 // Ok, everything looks ok to change the condition into an SLT or SGE and
2620 // delete the max calculation.
 // The replacement compare is created at Cond's position.
2621 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2622 Cond->getOperand(0), NewRHS, "scmp");
2623
2624 // Delete the max calculation instructions.
2625 NewCond->setDebugLoc(Cond->getDebugLoc());
2626 Cond->replaceAllUsesWith(NewCond);
2627 CondUse->setUser(NewCond);
 // NOTE(review): listing line 2628 is absent from this view; it presumably
 // declares the Cmp variable used below -- confirm.
2629 Cond->eraseFromParent();
2630 Sel->eraseFromParent();
 // Cmp is only removed once nothing uses it any more.
2631 if (Cmp->use_empty()) {
2632 salvageDebugInfo(*Cmp);
2633 Cmp->eraseFromParent();
2634 }
2635 return NewCond;
2636}
2637
2638/// Change loop terminating condition to use the postinc iv when possible.
2639void
2640LSRInstance::OptimizeLoopTermCond() {
2641 SmallPtrSet<Instruction *, 4> PostIncs;
2642
2643 // We need a different set of heuristics for rotated and non-rotated loops.
2644 // If a loop is rotated then the latch is also the backedge, so inserting
2645 // post-inc expressions just before the latch is ideal. To reduce live ranges
2646 // it also makes sense to rewrite terminating conditions to use post-inc
2647 // expressions.
2648 //
2649 // If the loop is not rotated then the latch is not a backedge; the latch
2650 // check is done in the loop head. Adding post-inc expressions before the
2651 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2652 // in the loop body. In this case we do *not* want to use post-inc expressions
2653 // in the latch check, and we want to insert post-inc expressions before
2654 // the backedge.
2655 BasicBlock *LatchBlock = L->getLoopLatch();
2656 SmallVector<BasicBlock*, 8> ExitingBlocks;
2657 L->getExitingBlocks(ExitingBlocks);
2658 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2659 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2660 IVIncInsertPos = LatchBlock->getTerminator();
2661 return;
2662 }
2663
2664 // Otherwise treat this as a rotated loop.
2665 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2666 // Get the terminating condition for the loop if possible. If we
2667 // can, we want to change it to use a post-incremented version of its
2668 // induction variable, to allow coalescing the live ranges for the IV into
2669 // one register value.
2670
2671 CondBrInst *TermBr = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
2672 if (!TermBr)
2673 continue;
2674
2676 // If the argument to TermBr is an extractelement, then the source of that
2677 // instruction is what's generated the condition.
2679 if (Extract)
2680 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2681 // FIXME: We could do more here, like handling logical operations where one
2682 // side is a cmp that uses an induction variable.
2683 if (!Cond)
2684 continue;
2685
2686 // Search IVUsesByStride to find Cond's IVUse if there is one.
2687 IVStrideUse *CondUse = nullptr;
2688 if (!FindIVUserForCond(Cond, CondUse))
2689 continue;
2690
2691 // If the trip count is computed in terms of a max (due to ScalarEvolution
2692 // being unable to find a sufficient guard, for example), change the loop
2693 // comparison to use SLT or ULT instead of NE.
2694 // One consequence of doing this now is that it disrupts the count-down
2695 // optimization. That's not always a bad thing though, because in such
2696 // cases it may still be worthwhile to avoid a max.
2697 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2698 Cond = OptimizeMax(Cmp, CondUse);
2699
2700 // If this exiting block dominates the latch block, it may also use
2701 // the post-inc value if it won't be shared with other uses.
2702 // Check for dominance.
2703 if (!DT.dominates(ExitingBlock, LatchBlock))
2704 continue;
2705
2706 // Conservatively avoid trying to use the post-inc value in non-latch
2707 // exits if there may be pre-inc users in intervening blocks.
2708 if (LatchBlock != ExitingBlock)
2709 for (const IVStrideUse &UI : IU)
2710 // Test if the use is reachable from the exiting block. This dominator
2711 // query is a conservative approximation of reachability.
2712 if (&UI != CondUse &&
2713 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2714 // Conservatively assume there may be reuse if the quotient of their
2715 // strides could be a legal scale.
2716 const SCEV *A = IU.getStride(*CondUse, L);
2717 const SCEV *B = IU.getStride(UI, L);
2718 if (!A || !B) continue;
2719 if (SE.getTypeSizeInBits(A->getType()) !=
2720 SE.getTypeSizeInBits(B->getType())) {
2721 if (SE.getTypeSizeInBits(A->getType()) >
2722 SE.getTypeSizeInBits(B->getType()))
2723 B = SE.getSignExtendExpr(B, A->getType());
2724 else
2725 A = SE.getSignExtendExpr(A, B->getType());
2726 }
2727 if (const SCEVConstant *D =
2729 const ConstantInt *C = D->getValue();
2730 // Stride of one or negative one can have reuse with non-addresses.
2731 if (C->isOne() || C->isMinusOne())
2732 goto decline_post_inc;
2733 // Avoid weird situations.
2734 if (C->getValue().getSignificantBits() >= 64 ||
2735 C->getValue().isMinSignedValue())
2736 goto decline_post_inc;
2737 // Check for possible scaled-address reuse.
2738 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2739 MemAccessTy AccessTy =
2740 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2741 int64_t Scale = C->getSExtValue();
2742 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2743 /*BaseOffset=*/0,
2744 /*HasBaseReg=*/true, Scale,
2745 AccessTy.AddrSpace))
2746 goto decline_post_inc;
2747 Scale = -Scale;
2748 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2749 /*BaseOffset=*/0,
2750 /*HasBaseReg=*/true, Scale,
2751 AccessTy.AddrSpace))
2752 goto decline_post_inc;
2753 }
2754 }
2755 }
2756
2757 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2758 << *Cond << '\n');
2759
2760 // It's possible for the setcc instruction to be anywhere in the loop, and
2761 // possible for it to have multiple users. If it is not immediately before
2762 // the exiting block branch, move it.
2763 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2764 !Extract) {
2765 if (Cond->hasOneUse()) {
2766 Cond->moveBefore(TermBr->getIterator());
2767 } else {
2768 // Clone the terminating condition and insert into the loopend.
2769 Instruction *OldCond = Cond;
2770 Cond = Cond->clone();
2771 Cond->setName(L->getHeader()->getName() + ".termcond");
2772 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2773
2774 // Clone the IVUse, as the old use still exists!
2775 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2776 TermBr->replaceUsesOfWith(OldCond, Cond);
2777 }
2778 }
2779
2780 // If we get to here, we know that we can transform the setcc instruction to
2781 // use the post-incremented version of the IV, allowing us to coalesce the
2782 // live ranges for the IV correctly.
2783 CondUse->transformToPostInc(L);
2784 Changed = true;
2785
2786 PostIncs.insert(Cond);
2787 decline_post_inc:;
2788 }
2789
2790 // Determine an insertion point for the loop induction variable increment. It
2791 // must dominate all the post-inc comparisons we just set up, and it must
2792 // dominate the loop latch edge.
2793 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2794 for (Instruction *Inst : PostIncs)
2795 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2796}
2797
2798/// Determine if the given use can accommodate a fixup at the given offset and
2799/// other details. If so, update the use and return true.
2800bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2801 bool HasBaseReg, LSRUse::KindType Kind,
2802 MemAccessTy AccessTy) {
2803 Immediate NewMinOffset = LU.MinOffset;
2804 Immediate NewMaxOffset = LU.MaxOffset;
2805 MemAccessTy NewAccessTy = AccessTy;
2806
2807 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2808 // something conservative, however this can pessimize in the case that one of
2809 // the uses will have all its uses outside the loop, for example.
2810 if (LU.Kind != Kind)
2811 return false;
2812
2813 // Check for a mismatched access type, and fall back conservatively as needed.
2814 // TODO: Be less conservative when the type is similar and can use the same
2815 // addressing modes.
2816 if (Kind == LSRUse::Address) {
2817 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2818 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2819 AccessTy.AddrSpace);
2820 }
2821 }
2822
2823 // Conservatively assume HasBaseReg is true for now.
2824 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2825 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2826 LU.MaxOffset - NewOffset, HasBaseReg))
2827 return false;
2828 NewMinOffset = NewOffset;
2829 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2830 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2831 NewOffset - LU.MinOffset, HasBaseReg))
2832 return false;
2833 NewMaxOffset = NewOffset;
2834 }
2835
2836 // FIXME: We should be able to handle some level of scalable offset support
2837 // for 'void', but in order to get basic support up and running this is
2838 // being left out.
2839 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2840 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2841 return false;
2842
2843 // Update the use.
2844 LU.MinOffset = NewMinOffset;
2845 LU.MaxOffset = NewMaxOffset;
2846 LU.AccessTy = NewAccessTy;
2847 return true;
2848}
2849
2850/// Return an LSRUse index and an offset value for a fixup which needs the given
2851/// expression, with the given kind and optional access type. Either reuse an
2852/// existing use or create a new one, as needed.
2853std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2854 LSRUse::KindType Kind,
2855 MemAccessTy AccessTy) {
2856 const SCEV *Copy = Expr;
2857 SCEVUse ExprUse = Expr;
2858 Immediate Offset = ExtractImmediate(
2859 ExprUse, SE, AccessTy.MemTy && AccessTy.MemTy->isScalableTy());
2860 Expr = ExprUse;
2861
2862 // Basic uses can't accept any offset, for example.
2863 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2864 Offset, /*HasBaseReg=*/ true)) {
2865 Expr = Copy;
2866 Offset = Immediate::getFixed(0);
2867 }
2868
2869 std::pair<UseMapTy::iterator, bool> P =
2870 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2871 if (!P.second) {
2872 // A use already existed with this base.
2873 size_t LUIdx = P.first->second;
2874 LSRUse &LU = Uses[LUIdx];
2875 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2876 // Reuse this use.
2877 return std::make_pair(LUIdx, Offset);
2878 }
2879
2880 // Create a new use.
2881 size_t LUIdx = Uses.size();
2882 P.first->second = LUIdx;
2883 Uses.push_back(LSRUse(Kind, AccessTy));
2884 LSRUse &LU = Uses[LUIdx];
2885
2886 LU.MinOffset = Offset;
2887 LU.MaxOffset = Offset;
2888 return std::make_pair(LUIdx, Offset);
2889}
2890
2891/// Delete the given use from the Uses list.
2892void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2893 if (&LU != &Uses.back())
2894 std::swap(LU, Uses.back());
2895 Uses.pop_back();
2896
2897 // Update RegUses.
2898 RegUses.swapAndDropUse(LUIdx, Uses.size());
2899}
2900
2901/// Look for a use distinct from OrigLU which is has a formula that has the same
2902/// registers as the given formula.
2903LSRUse *
2904LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2905 const LSRUse &OrigLU) {
2906 // Search all uses for the formula. This could be more clever.
2907 for (LSRUse &LU : Uses) {
2908 // Check whether this use is close enough to OrigLU, to see whether it's
2909 // worthwhile looking through its formulae.
2910 // Ignore ICmpZero uses because they may contain formulae generated by
2911 // GenerateICmpZeroScales, in which case adding fixup offsets may
2912 // be invalid.
2913 if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero &&
2914 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2915 LU.HasFormulaWithSameRegs(OrigF)) {
2916 // Scan through this use's formulae.
2917 for (const Formula &F : LU.Formulae) {
2918 // Check to see if this formula has the same registers and symbols
2919 // as OrigF.
2920 if (F.BaseRegs == OrigF.BaseRegs &&
2921 F.ScaledReg == OrigF.ScaledReg &&
2922 F.BaseGV == OrigF.BaseGV &&
2923 F.Scale == OrigF.Scale &&
2924 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2925 if (F.BaseOffset.isZero())
2926 return &LU;
2927 // This is the formula where all the registers and symbols matched;
2928 // there aren't going to be any others. Since we declined it, we
2929 // can skip the rest of the formulae and proceed to the next LSRUse.
2930 break;
2931 }
2932 }
2933 }
2934 }
2935
2936 // Nothing looked good.
2937 return nullptr;
2938}
2939
2940void LSRInstance::CollectInterestingTypesAndFactors() {
2941 SmallSetVector<const SCEV *, 4> Strides;
2942
2943 // Collect interesting types and strides.
2945 for (const IVStrideUse &U : IU) {
2946 const SCEV *Expr = IU.getExpr(U);
2947 if (!Expr)
2948 continue;
2949
2950 // Collect interesting types.
2951 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2952
2953 // Add strides for mentioned loops.
2954 Worklist.push_back(Expr);
2955 do {
2956 const SCEV *S = Worklist.pop_back_val();
2957 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2958 if (AR->getLoop() == L)
2959 Strides.insert(AR->getStepRecurrence(SE));
2960 Worklist.push_back(AR->getStart());
2961 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2962 append_range(Worklist, Add->operands());
2963 }
2964 } while (!Worklist.empty());
2965 }
2966
2967 // Compute interesting factors from the set of interesting strides.
2968 for (SmallSetVector<const SCEV *, 4>::const_iterator
2969 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2970 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2971 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2972 const SCEV *OldStride = *I;
2973 const SCEV *NewStride = *NewStrideIter;
2974
2975 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2976 SE.getTypeSizeInBits(NewStride->getType())) {
2977 if (SE.getTypeSizeInBits(OldStride->getType()) >
2978 SE.getTypeSizeInBits(NewStride->getType()))
2979 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2980 else
2981 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2982 }
2983 if (const SCEVConstant *Factor =
2984 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2985 SE, true))) {
2986 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2987 Factors.insert(Factor->getAPInt().getSExtValue());
2988 } else if (const SCEVConstant *Factor =
2990 NewStride,
2991 SE, true))) {
2992 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2993 Factors.insert(Factor->getAPInt().getSExtValue());
2994 }
2995 }
2996
2997 // If all uses use the same type, don't bother looking for truncation-based
2998 // reuse.
2999 if (Types.size() == 1)
3000 Types.clear();
3001
3002 LLVM_DEBUG(print_factors_and_types(dbgs()));
3003}
3004
3005/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
3006/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
3007/// IVStrideUses, we could partially skip this.
3008static User::op_iterator
3010 Loop *L, ScalarEvolution &SE) {
3011 for(; OI != OE; ++OI) {
3012 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
3013 if (!SE.isSCEVable(Oper->getType()))
3014 continue;
3015
3016 if (const SCEVAddRecExpr *AR =
3018 if (AR->getLoop() == L)
3019 break;
3020 }
3021 }
3022 }
3023 return OI;
3024}
3025
3026/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
3027/// a convenient helper.
3029 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
3030 return Trunc->getOperand(0);
3031 return Oper;
3032}
3033
3034/// Return an approximation of this SCEV expression's "base", or NULL for any
3035/// constant. Returning the expression itself is conservative. Returning a
3036/// deeper subexpression is more precise and valid as long as it isn't less
3037/// complex than another subexpression. For expressions involving multiple
3038/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3039/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3040/// IVInc==b-a.
3041///
3042/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3043/// SCEVUnknown, we simply return the rightmost SCEV operand.
3044static const SCEV *getExprBase(const SCEV *S) {
3045 switch (S->getSCEVType()) {
3046 default: // including scUnknown.
3047 return S;
3048 case scConstant:
3049 case scVScale:
3050 return nullptr;
3051 case scTruncate:
3052 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3053 case scZeroExtend:
3054 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3055 case scSignExtend:
3056 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3057 case scAddExpr: {
3058 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3059 // there's nothing more complex.
3060 // FIXME: not sure if we want to recognize negation.
3061 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3062 for (const SCEV *SubExpr : reverse(Add->operands())) {
3063 if (SubExpr->getSCEVType() == scAddExpr)
3064 return getExprBase(SubExpr);
3065
3066 if (SubExpr->getSCEVType() != scMulExpr)
3067 return SubExpr;
3068 }
3069 return S; // all operands are scaled, be conservative.
3070 }
3071 case scAddRecExpr:
3072 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3073 }
3074 llvm_unreachable("Unknown SCEV kind!");
3075}
3076
3077/// Return true if the chain increment is profitable to expand into a loop
3078/// invariant value, which may require its own register. A profitable chain
3079/// increment will be an offset relative to the same base. We allow such offsets
3080/// to potentially be used as chain increment as long as it's not obviously
3081/// expensive to expand using real instructions.
3082bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3083 const SCEV *IncExpr,
3084 ScalarEvolution &SE) {
3085 // Aggressively form chains when -stress-ivchain.
3086 if (StressIVChain)
3087 return true;
3088
3089 // Do not replace a constant offset from IV head with a nonconstant IV
3090 // increment.
3091 if (!isa<SCEVConstant>(IncExpr)) {
3092 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3093 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3094 return false;
3095 }
3096
3097 SmallPtrSet<const SCEV*, 8> Processed;
3098 return !isHighCostExpansion(IncExpr, Processed, SE);
3099}
3100
3101/// Return true if the number of registers needed for the chain is estimated to
3102/// be less than the number required for the individual IV users. First prohibit
3103/// any IV users that keep the IV live across increments (the Users set should
3104/// be empty). Next count the number and type of increments in the chain.
3105///
3106/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3107/// effectively use postinc addressing modes. Only consider it profitable it the
3108/// increments can be computed in fewer registers when chained.
3109///
3110/// TODO: Consider IVInc free if it's already used in another chains.
3111static bool isProfitableChain(IVChain &Chain,
3113 ScalarEvolution &SE,
3114 const TargetTransformInfo &TTI) {
3115 if (StressIVChain)
3116 return true;
3117
3118 if (!Chain.hasIncs())
3119 return false;
3120
3121 if (!Users.empty()) {
3122 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3123 for (Instruction *Inst
3124 : Users) { dbgs() << " " << *Inst << "\n"; });
3125 return false;
3126 }
3127 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3128
3129 // The chain itself may require a register, so initialize cost to 1.
3130 int cost = 1;
3131
3132 // A complete chain likely eliminates the need for keeping the original IV in
3133 // a register. LSR does not currently know how to form a complete chain unless
3134 // the header phi already exists.
3135 if (isa<PHINode>(Chain.tailUserInst())
3136 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3137 --cost;
3138 }
3139 const SCEV *LastIncExpr = nullptr;
3140 unsigned NumConstIncrements = 0;
3141 unsigned NumVarIncrements = 0;
3142 unsigned NumReusedIncrements = 0;
3143
3144 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3145 return true;
3146
3147 for (const IVInc &Inc : Chain) {
3148 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3149 return true;
3150 if (Inc.IncExpr->isZero())
3151 continue;
3152
3153 // Incrementing by zero or some constant is neutral. We assume constants can
3154 // be folded into an addressing mode or an add's immediate operand.
3155 if (isa<SCEVConstant>(Inc.IncExpr)) {
3156 ++NumConstIncrements;
3157 continue;
3158 }
3159
3160 if (Inc.IncExpr == LastIncExpr)
3161 ++NumReusedIncrements;
3162 else
3163 ++NumVarIncrements;
3164
3165 LastIncExpr = Inc.IncExpr;
3166 }
3167 // An IV chain with a single increment is handled by LSR's postinc
3168 // uses. However, a chain with multiple increments requires keeping the IV's
3169 // value live longer than it needs to be if chained.
3170 if (NumConstIncrements > 1)
3171 --cost;
3172
3173 // Materializing increment expressions in the preheader that didn't exist in
3174 // the original code may cost a register. For example, sign-extended array
3175 // indices can produce ridiculous increments like this:
3176 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3177 cost += NumVarIncrements;
3178
3179 // Reusing variable increments likely saves a register to hold the multiple of
3180 // the stride.
3181 cost -= NumReusedIncrements;
3182
3183 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3184 << "\n");
3185
3186 return cost < 0;
3187}
3188
3189/// Add this IV user to an existing chain or make it the head of a new chain.
3190void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3191 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3192 // When IVs are used as types of varying widths, they are generally converted
3193 // to a wider type with some uses remaining narrow under a (free) trunc.
3194 Value *const NextIV = getWideOperand(IVOper);
3195 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3196 const SCEV *const OperExprBase = getExprBase(OperExpr);
3197
3198 // Visit all existing chains. Check if its IVOper can be computed as a
3199 // profitable loop invariant increment from the last link in the Chain.
3200 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3201 const SCEV *LastIncExpr = nullptr;
3202 for (; ChainIdx < NChains; ++ChainIdx) {
3203 IVChain &Chain = IVChainVec[ChainIdx];
3204
3205 // Prune the solution space aggressively by checking that both IV operands
3206 // are expressions that operate on the same unscaled SCEVUnknown. This
3207 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3208 // first avoids creating extra SCEV expressions.
3209 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3210 continue;
3211
3212 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3213 if (PrevIV->getType() != NextIV->getType())
3214 continue;
3215
3216 // A phi node terminates a chain.
3217 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3218 continue;
3219
3220 // The increment must be loop-invariant so it can be kept in a register.
3221 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3222 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3223 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3224 continue;
3225
3226 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3227 LastIncExpr = IncExpr;
3228 break;
3229 }
3230 }
3231 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3232 // bother for phi nodes, because they must be last in the chain.
3233 if (ChainIdx == NChains) {
3234 if (isa<PHINode>(UserInst))
3235 return;
3236 if (NChains >= MaxChains && !StressIVChain) {
3237 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3238 return;
3239 }
3240 LastIncExpr = OperExpr;
3241 // IVUsers may have skipped over sign/zero extensions. We don't currently
3242 // attempt to form chains involving extensions unless they can be hoisted
3243 // into this loop's AddRec.
3244 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3245 return;
3246 ++NChains;
3247 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3248 OperExprBase));
3249 ChainUsersVec.resize(NChains);
3250 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3251 << ") IV=" << *LastIncExpr << "\n");
3252 } else {
3253 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3254 << ") IV+" << *LastIncExpr << "\n");
3255 // Add this IV user to the end of the chain.
3256 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3257 }
3258 IVChain &Chain = IVChainVec[ChainIdx];
3259
3260 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3261 // This chain's NearUsers become FarUsers.
3262 if (!LastIncExpr->isZero()) {
3263 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3264 NearUsers.clear();
3265 }
3266
3267 // All other uses of IVOperand become near uses of the chain.
3268 // We currently ignore intermediate values within SCEV expressions, assuming
3269 // they will eventually be used be the current chain, or can be computed
3270 // from one of the chain increments. To be more precise we could
3271 // transitively follow its user and only add leaf IV users to the set.
3272 for (User *U : IVOper->users()) {
3273 Instruction *OtherUse = dyn_cast<Instruction>(U);
3274 if (!OtherUse)
3275 continue;
3276 // Uses in the chain will no longer be uses if the chain is formed.
3277 // Include the head of the chain in this iteration (not Chain.begin()).
3278 IVChain::const_iterator IncIter = Chain.Incs.begin();
3279 IVChain::const_iterator IncEnd = Chain.Incs.end();
3280 for( ; IncIter != IncEnd; ++IncIter) {
3281 if (IncIter->UserInst == OtherUse)
3282 break;
3283 }
3284 if (IncIter != IncEnd)
3285 continue;
3286
3287 if (SE.isSCEVable(OtherUse->getType())
3288 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3289 && IU.isIVUserOrOperand(OtherUse)) {
3290 continue;
3291 }
3292 NearUsers.insert(OtherUse);
3293 }
3294
3295 // Since this user is part of the chain, it's no longer considered a use
3296 // of the chain.
3297 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3298}
3299
3300/// Populate the vector of Chains.
3301///
3302/// This decreases ILP at the architecture level. Targets with ample registers,
3303/// multiple memory ports, and no register renaming probably don't want
3304/// this. However, such targets should probably disable LSR altogether.
3305///
3306/// The job of LSR is to make a reasonable choice of induction variables across
3307/// the loop. Subsequent passes can easily "unchain" computation exposing more
3308/// ILP *within the loop* if the target wants it.
3309///
3310/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3311/// will not reorder memory operations, it will recognize this as a chain, but
3312/// will generate redundant IV increments. Ideally this would be corrected later
3313/// by a smart scheduler:
3314/// = A[i]
3315/// = A[i+x]
3316/// A[i] =
3317/// A[i+x] =
3318///
3319/// TODO: Walk the entire domtree within this loop, not just the path to the
3320/// loop latch. This will discover chains on side paths, but requires
3321/// maintaining multiple copies of the Chains state.
3322void LSRInstance::CollectChains() {
3323 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3324 SmallVector<ChainUsers, 8> ChainUsersVec;
3325
3326 SmallVector<BasicBlock *,8> LatchPath;
3327 BasicBlock *LoopHeader = L->getHeader();
3328 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3329 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3330 LatchPath.push_back(Rung->getBlock());
3331 }
3332 LatchPath.push_back(LoopHeader);
3333
3334 // Walk the instruction stream from the loop header to the loop latch.
3335 for (BasicBlock *BB : reverse(LatchPath)) {
3336 for (Instruction &I : *BB) {
3337 // Skip instructions that weren't seen by IVUsers analysis.
3338 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3339 continue;
3340
3341 // Skip ephemeral values, as they don't produce real code.
3342 if (IU.isEphemeral(&I))
3343 continue;
3344
3345 // Ignore users that are part of a SCEV expression. This way we only
3346 // consider leaf IV Users. This effectively rediscovers a portion of
3347 // IVUsers analysis but in program order this time.
3348 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3349 continue;
3350
3351 // Remove this instruction from any NearUsers set it may be in.
3352 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3353 ChainIdx < NChains; ++ChainIdx) {
3354 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3355 }
3356 // Search for operands that can be chained.
3357 SmallPtrSet<Instruction*, 4> UniqueOperands;
3358 User::op_iterator IVOpEnd = I.op_end();
3359 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3360 while (IVOpIter != IVOpEnd) {
3361 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3362 if (UniqueOperands.insert(IVOpInst).second)
3363 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3364 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3365 }
3366 } // Continue walking down the instructions.
3367 } // Continue walking down the domtree.
3368 // Visit phi backedges to determine if the chain can generate the IV postinc.
3369 for (PHINode &PN : L->getHeader()->phis()) {
3370 if (!SE.isSCEVable(PN.getType()))
3371 continue;
3372
3373 Instruction *IncV =
3374 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3375 if (IncV)
3376 ChainInstruction(&PN, IncV, ChainUsersVec);
3377 }
3378 // Remove any unprofitable chains.
3379 unsigned ChainIdx = 0;
3380 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3381 UsersIdx < NChains; ++UsersIdx) {
3382 if (!isProfitableChain(IVChainVec[UsersIdx],
3383 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3384 continue;
3385 // Preserve the chain at UsesIdx.
3386 if (ChainIdx != UsersIdx)
3387 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3388 FinalizeChain(IVChainVec[ChainIdx]);
3389 ++ChainIdx;
3390 }
3391 IVChainVec.resize(ChainIdx);
3392}
3393
3394void LSRInstance::FinalizeChain(IVChain &Chain) {
3395 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3396 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3397
3398 for (const IVInc &Inc : Chain) {
3399 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3400 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3401 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3402 IVIncSet.insert(UseI);
3403 }
3404}
3405
3406/// Return true if the IVInc can be folded into an addressing mode.
3407static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3408 Value *Operand, const TargetTransformInfo &TTI) {
3409 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3410 Immediate IncOffset = Immediate::getZero();
3411 if (IncConst) {
3412 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3413 return false;
3414 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3415 } else {
3416 // Look for mul(vscale, constant), to detect a scalable offset.
3417 const APInt *C;
3418 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3419 C->getSignificantBits() > 64)
3420 return false;
3421 IncOffset = Immediate::getScalable(C->getSExtValue());
3422 }
3423
3424 if (!isAddressUse(TTI, UserInst, Operand))
3425 return false;
3426
3427 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3428 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3429 IncOffset, /*HasBaseReg=*/false))
3430 return false;
3431
3432 return true;
3433}
3434
3435/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3436/// user's operand from the previous IV user's operand.
3437void LSRInstance::GenerateIVChain(const IVChain &Chain,
3438 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3439 // Find the new IVOperand for the head of the chain. It may have been replaced
3440 // by LSR.
3441 const IVInc &Head = Chain.Incs[0];
3442 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3443 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3444 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3445 IVOpEnd, L, SE);
3446 Value *IVSrc = nullptr;
3447 while (IVOpIter != IVOpEnd) {
3448 IVSrc = getWideOperand(*IVOpIter);
3449
3450 // If this operand computes the expression that the chain needs, we may use
3451 // it. (Check this after setting IVSrc which is used below.)
3452 //
3453 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3454 // narrow for the chain, so we can no longer use it. We do allow using a
3455 // wider phi, assuming the LSR checked for free truncation. In that case we
3456 // should already have a truncate on this operand such that
3457 // getSCEV(IVSrc) == IncExpr.
3458 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3459 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3460 break;
3461 }
3462 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3463 }
3464 if (IVOpIter == IVOpEnd) {
3465 // Gracefully give up on this chain.
3466 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3467 return;
3468 }
3469 assert(IVSrc && "Failed to find IV chain source");
3470
3471 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3472 Type *IVTy = IVSrc->getType();
3473 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3474 const SCEV *LeftOverExpr = nullptr;
3475 const SCEV *Accum = SE.getZero(IntTy);
3477 Bases.emplace_back(Accum, IVSrc);
3478
3479 for (const IVInc &Inc : Chain) {
3480 Instruction *InsertPt = Inc.UserInst;
3481 if (isa<PHINode>(InsertPt))
3482 InsertPt = L->getLoopLatch()->getTerminator();
3483
3484 // IVOper will replace the current IV User's operand. IVSrc is the IV
3485 // value currently held in a register.
3486 Value *IVOper = IVSrc;
3487 if (!Inc.IncExpr->isZero()) {
3488 // IncExpr was the result of subtraction of two narrow values, so must
3489 // be signed.
3490 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3491 Accum = SE.getAddExpr(Accum, IncExpr);
3492 LeftOverExpr = LeftOverExpr ?
3493 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3494 }
3495
3496 // Look through each base to see if any can produce a nice addressing mode.
3497 bool FoundBase = false;
3498 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3499 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3500 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3501 if (!Remainder->isZero()) {
3502 Rewriter.clearPostInc();
3503 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3504 const SCEV *IVOperExpr =
3505 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3506 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3507 } else {
3508 IVOper = MapIVOper;
3509 }
3510
3511 FoundBase = true;
3512 break;
3513 }
3514 }
3515 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3516 // Expand the IV increment.
3517 Rewriter.clearPostInc();
3518 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3519 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3520 SE.getUnknown(IncV));
3521 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3522
3523 // If an IV increment can't be folded, use it as the next IV value.
3524 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3525 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3526 Bases.emplace_back(Accum, IVOper);
3527 IVSrc = IVOper;
3528 LeftOverExpr = nullptr;
3529 }
3530 }
3531 Type *OperTy = Inc.IVOperand->getType();
3532 if (IVTy != OperTy) {
3533 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3534 "cannot extend a chained IV");
3535 IRBuilder<> Builder(InsertPt);
3536 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3537 }
3538 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3539 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3540 DeadInsts.emplace_back(OperandIsInstr);
3541 }
3542 // If LSR created a new, wider phi, we may also replace its postinc. We only
3543 // do this if we also found a wide value for the head of the chain.
3544 if (isa<PHINode>(Chain.tailUserInst())) {
3545 for (PHINode &Phi : L->getHeader()->phis()) {
3546 if (Phi.getType() != IVSrc->getType())
3547 continue;
3549 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3550 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3551 continue;
3552 Value *IVOper = IVSrc;
3553 Type *PostIncTy = PostIncV->getType();
3554 if (IVTy != PostIncTy) {
3555 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3556 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3557 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3558 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3559 }
3560 Phi.replaceUsesOfWith(PostIncV, IVOper);
3561 DeadInsts.emplace_back(PostIncV);
3562 }
3563 }
3564}
3565
/// Walk every IV user recorded in IU and, for each one that is not already
/// covered by a profitable IV chain, choose an LSRUse kind, find or create the
/// matching LSRUse via getUse(), record an LSRFixup on it, and give the use an
/// initial formula if it has none yet. While doing so, rate the unmodified
/// expression of each in-loop use once to accumulate BaselineCost.
3566void LSRInstance::CollectFixupsAndInitialFormulae() {
3567 CondBrInst *ExitBranch = nullptr;
3568 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3569
3570 // For calculating baseline cost
3571 SmallPtrSet<const SCEV *, 16> Regs;
3572 DenseSet<const SCEV *> VisitedRegs;
  // Indices of the LSRUses whose baseline formula has already been rated, so
  // each use contributes to BaselineCost at most once.
3573 DenseSet<size_t> VisitedLSRUse;
3574
3575 for (const IVStrideUse &U : IU) {
3576 Instruction *UserInst = U.getUser();
3577 // Skip IV users that are part of profitable IV Chains.
3578 User::op_iterator UseI =
3579 find(UserInst->operands(), U.getOperandValToReplace());
3580 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3581 if (IVIncSet.count(UseI)) {
3582 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3583 continue;
3584 }
3585
  // Default to a Basic use; upgrade to Address (with its access type) when
  // the operand is used as an address by UserInst.
3586 LSRUse::KindType Kind = LSRUse::Basic;
3587 MemAccessTy AccessTy;
3588 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3589 Kind = LSRUse::Address;
3590 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3591 }
3592
  // Users for which IU cannot provide an expression are skipped.
3593 const SCEV *S = IU.getExpr(U);
3594 if (!S)
3595 continue;
3596 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3597
3598 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3599 // (N - i == 0), and this allows (N - i) to be the expression that we work
3600 // with rather than just N or i, so we can consider the register
3601 // requirements for both N and i at the same time. Limiting this code to
3602 // equality icmps is not a problem because all interesting loops use
3603 // equality icmps, thanks to IndVarSimplify.
3604 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3605 // If CI can be saved in some target, like replaced inside hardware loop
3606 // in PowerPC, no need to generate initial formulae for it.
3607 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3608 continue;
3609 if (CI->isEquality()) {
3610 // Swap the operands if needed to put the OperandValToReplace on the
3611 // left, for consistency.
3612 Value *NV = CI->getOperand(1);
3613 if (NV == U.getOperandValToReplace()) {
3614 CI->setOperand(1, CI->getOperand(0));
3615 CI->setOperand(0, NV);
3616 NV = CI->getOperand(1);
      // The compare instruction itself was modified, so record that.
3617 Changed = true;
3618 }
3619
3620 // x == y --> x - y == 0
3621 const SCEV *N = SE.getSCEV(NV);
3622 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3623 (!NV->getType()->isPointerTy() ||
3624 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3625 // S is normalized, so normalize N before folding it into S
3626 // to keep the result normalized.
3627 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3628 if (!N)
3629 continue;
3630 Kind = LSRUse::ICmpZero;
3631 S = SE.getMinusSCEV(N, S);
3632 } else if (L->isLoopInvariant(NV) &&
3633 (!isa<Instruction>(NV) ||
3634 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3635 !NV->getType()->isPointerTy()) {
3636 // If we can't generally expand the expression (e.g. it contains
3637 // a divide), but it is already at a loop invariant point before the
3638 // loop, wrap it in an unknown (to prevent the expander from trying
3639 // to re-expand in a potentially unsafe way.) The restriction to
3640 // integer types is required because the unknown hides the base, and
3641 // SCEV can't compute the difference of two unknown pointers.
3642 N = SE.getUnknown(NV);
3643 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3644 if (!N)
3645 continue;
3646 Kind = LSRUse::ICmpZero;
3647 S = SE.getMinusSCEV(N, S);
3649 }
3650
3651 // -1 and the negations of all interesting strides (except the negation
3652 // of -1) are now also interesting.
      // The bound is captured before the loop, so entries inserted during the
      // loop are not themselves revisited.
3653 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3654 if (Factors[i] != -1)
3655 Factors.insert(-(uint64_t)Factors[i]);
3656 Factors.insert(-1);
3657 }
3658 }
3659
3660 // Get or create an LSRUse.
3661 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3662 size_t LUIdx = P.first;
3663 Immediate Offset = P.second;
3664 LSRUse &LU = Uses[LUIdx];
3665
3666 // Record the fixup.
3667 LSRFixup &LF = LU.getNewFixup();
3668 LF.UserInst = UserInst;
3669 LF.OperandValToReplace = U.getOperandValToReplace();
3670 LF.PostIncLoops = TmpPostIncLoops;
3671 LF.Offset = Offset;
  // Both flags are AND-accumulated over all fixups of the use: one fixup that
  // fails the test clears the flag for the whole LSRUse.
3672 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3673 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3674
3675 // Create SCEV as Formula for calculating baseline cost
3676 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3677 Formula F;
3678 F.initialMatch(S, L, SE);
3679 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3680 HardwareLoopProfitable);
3681 VisitedLSRUse.insert(LUIdx);
3682 }
3683
3684 // If this is the first use of this LSRUse, give it a formula.
3685 if (LU.Formulae.empty()) {
3686 InsertInitialFormula(S, LU, LUIdx);
3687 CountRegisters(LU.Formulae.back(), LUIdx);
3688 }
3689 }
3690
3691 LLVM_DEBUG(print_fixups(dbgs()));
3692}
3693
3694/// Insert a formula for the given expression into the given use, separating out
3695/// loop-variant portions from loop-invariant and loop-computable portions.
3696void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3697 size_t LUIdx) {
3698 // Mark uses whose expressions cannot be expanded.
3699 if (!Rewriter.isSafeToExpand(S))
3700 LU.RigidFormula = true;
3701
3702 Formula F;
3703 F.initialMatch(S, L, SE);
3704 bool Inserted = InsertFormula(LU, LUIdx, F);
3705 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3706}
3707
3708/// Insert a simple single-register formula for the given expression into the
3709/// given use.
3710void
3711LSRInstance::InsertSupplementalFormula(const SCEV *S,
3712 LSRUse &LU, size_t LUIdx) {
3713 Formula F;
3714 F.BaseRegs.push_back(S);
3715 F.HasBaseReg = true;
3716 bool Inserted = InsertFormula(LU, LUIdx, F);
3717 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3718}
3719
3720/// Note which registers are used by the given formula, updating RegUses.
3721void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3722 if (F.ScaledReg)
3723 RegUses.countRegister(F.ScaledReg, LUIdx);
3724 for (const SCEV *BaseReg : F.BaseRegs)
3725 RegUses.countRegister(BaseReg, LUIdx);
3726}
3727
3728/// If the given formula has not yet been inserted, add it to the list, and
3729/// return true. Return false otherwise.
3730bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3731 // Do not insert formula that we will not be able to expand.
3732 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3733 "Formula is illegal");
3734
3735 if (!LU.InsertFormula(F, *L))
3736 return false;
3737
3738 CountRegisters(F, LUIdx);
3739 return true;
3740}
3741
3742/// Test whether this fixup will be executed each time the corresponding IV
3743/// increment instruction is executed.
3744bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3745 // If the fixup block dominates the IV increment block then there is no path
3746 // through the loop to the increment that doesn't pass through the fixup.
3747 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3748}
3749
3750/// Check for other uses of loop-invariant values which we're tracking. These
3751/// other uses will pin these values in registers, making them less profitable
3752/// for elimination.
3753/// TODO: This currently misses non-constant addrec step registers.
3754/// TODO: Should this give more weight to users inside the loop?
3755void
3756LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3757 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3758 SmallPtrSet<const SCEV *, 32> Visited;
3759
3760 // Don't collect outside uses if we are favoring postinc - the instructions in
3761 // the loop are more important than the ones outside of it.
3762 if (AMK == TTI::AMK_PostIndexed)
3763 return;
3764
3765 while (!Worklist.empty()) {
3766 const SCEV *S = Worklist.pop_back_val();
3767
3768 // Don't process the same SCEV twice
3769 if (!Visited.insert(S).second)
3770 continue;
3771
3772 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3773 append_range(Worklist, N->operands());
3774 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3775 Worklist.push_back(C->getOperand());
3776 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3777 Worklist.push_back(D->getLHS());
3778 Worklist.push_back(D->getRHS());
3779 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3780 const Value *V = US->getValue();
3781 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3782 // Look for instructions defined outside the loop.
3783 if (L->contains(Inst)) continue;
3784 } else if (isa<Constant>(V))
3785 // Constants can be re-materialized.
3786 continue;
3787 for (const Use &U : V->uses()) {
3788 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3789 // Ignore non-instructions.
3790 if (!UserInst)
3791 continue;
3792 // Don't bother if the instruction is an EHPad.
3793 if (UserInst->isEHPad())
3794 continue;
3795 // Ignore instructions in other functions (as can happen with
3796 // Constants).
3797 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3798 continue;
3799 // Ignore instructions not dominated by the loop.
3800 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3801 UserInst->getParent() :
3802 cast<PHINode>(UserInst)->getIncomingBlock(
3804 if (!DT.dominates(L->getHeader(), UseBB))
3805 continue;
3806 // Don't bother if the instruction is in a BB which ends in an EHPad.
3807 if (UseBB->getTerminator()->isEHPad())
3808 continue;
3809
3810 // Ignore cases in which the currently-examined value could come from
3811 // a basic block terminated with an EHPad. This checks all incoming
3812 // blocks of the phi node since it is possible that the same incoming
3813 // value comes from multiple basic blocks, only some of which may end
3814 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3815 // pass would try to insert instructions into an EHPad, hitting an
3816 // assertion.
3817 if (isa<PHINode>(UserInst)) {
3818 const auto *PhiNode = cast<PHINode>(UserInst);
3819 bool HasIncompatibleEHPTerminatedBlock = false;
3820 llvm::Value *ExpectedValue = U;
3821 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3822 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3823 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3824 HasIncompatibleEHPTerminatedBlock = true;
3825 break;
3826 }
3827 }
3828 }
3829 if (HasIncompatibleEHPTerminatedBlock) {
3830 continue;
3831 }
3832 }
3833
3834 // Don't bother rewriting PHIs in catchswitch blocks.
3835 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3836 continue;
3837 // Ignore uses which are part of other SCEV expressions, to avoid
3838 // analyzing them multiple times.
3839 if (SE.isSCEVable(UserInst->getType())) {
3840 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3841 // If the user is a no-op, look through to its uses.
3842 if (!isa<SCEVUnknown>(UserS))
3843 continue;
3844 if (UserS == US) {
3845 Worklist.push_back(
3846 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3847 continue;
3848 }
3849 }
3850 // Ignore icmp instructions which are already being analyzed.
3851 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3852 unsigned OtherIdx = !U.getOperandNo();
3853 Value *OtherOp = ICI->getOperand(OtherIdx);
3854 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3855 continue;
3856 }
3857
3858 // Do not consider uses inside lifetime intrinsics. These are not
3859 // actually materialized.
3860 if (UserInst->isLifetimeStartOrEnd())
3861 continue;
3862
3863 std::pair<size_t, Immediate> P =
3864 getUse(S, LSRUse::Basic, MemAccessTy());
3865 size_t LUIdx = P.first;
3866 Immediate Offset = P.second;
3867 LSRUse &LU = Uses[LUIdx];
3868 LSRFixup &LF = LU.getNewFixup();
3869 LF.UserInst = const_cast<Instruction *>(UserInst);
3870 LF.OperandValToReplace = U;
3871 LF.Offset = Offset;
3872 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3873 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3874 InsertSupplementalFormula(US, LU, LUIdx);
3875 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3876 break;
3877 }
3878 }
3879 }
3880}
3881
3882/// Split S into subexpressions which can be pulled out into separate
3883/// registers. If C is non-null, multiply each subexpression by C.
3884///
3885/// Return remainder expression after factoring the subexpressions captured by
3886/// Ops. If Ops is complete, return NULL.
3887static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3889 const Loop *L,
3890 ScalarEvolution &SE,
3891 unsigned Depth = 0) {
3892 // Arbitrarily cap recursion to protect compile time.
3893 if (Depth >= 3)
3894 return S;
3895
3896 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3897 // Break out add operands.
3898 for (const SCEV *S : Add->operands()) {
3899 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3900 if (Remainder)
3901 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3902 }
3903 return nullptr;
3904 }
3905 const SCEV *Start, *Step;
3906 const SCEVConstant *Op0;
3907 const SCEV *Op1;
3908 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3909 // Split a non-zero base out of an addrec.
3910 if (Start->isZero())
3911 return S;
3912
3913 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3914 // Split the non-zero AddRec unless it is part of a nested recurrence that
3915 // does not pertain to this loop.
3916 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3917 !isa<SCEVAddRecExpr>(Remainder))) {
3918 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3919 Remainder = nullptr;
3920 }
3921 if (Remainder != Start) {
3922 if (!Remainder)
3923 Remainder = SE.getConstant(S->getType(), 0);
3924 return SE.getAddRecExpr(Remainder, Step,
3925 cast<SCEVAddRecExpr>(S)->getLoop(),
3926 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3928 }
3929 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3930 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3931 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3932 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3933 if (Remainder)
3934 Ops.push_back(SE.getMulExpr(C, Remainder));
3935 return nullptr;
3936 }
3937 return S;
3938}
3939
3940/// Return true if the SCEV represents a value that may end up as a
3941/// post-increment operation.
3943 LSRUse &LU, const SCEV *S, const Loop *L,
3944 ScalarEvolution &SE) {
3945 if (LU.Kind != LSRUse::Address ||
3946 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3947 return false;
3948 const SCEV *Start;
3949 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3950 return false;
3951 // Check if a post-indexed load/store can be used.
3952 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3953 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3954 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3955 return true;
3956 }
3957 return false;
3958}
3959
3960/// Helper function for LSRInstance::GenerateReassociations.
3961void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3962 const Formula &Base,
3963 unsigned Depth, size_t Idx,
3964 bool IsScaledReg) {
3965 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3966 // Don't generate reassociations for the base register of a value that
3967 // may generate a post-increment operator. The reason is that the
3968 // reassociations cause extra base+register formula to be created,
3969 // and possibly chosen, but the post-increment is more efficient.
3970 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3971 return;
3973 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3974 if (Remainder)
3975 AddOps.push_back(Remainder);
3976
3977 if (AddOps.size() == 1)
3978 return;
3979
3981 JE = AddOps.end();
3982 J != JE; ++J) {
3983 // Loop-variant "unknown" values are uninteresting; we won't be able to
3984 // do anything meaningful with them.
3985 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3986 continue;
3987
3988 // Don't pull a constant into a register if the constant could be folded
3989 // into an immediate field.
3990 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3991 LU.AccessTy, *J, Base.getNumRegs() > 1))
3992 continue;
3993
3994 // Collect all operands except *J.
3995 SmallVector<SCEVUse, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3996 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3997
3998 // Don't leave just a constant behind in a register if the constant could
3999 // be folded into an immediate field.
4000 if (InnerAddOps.size() == 1 &&
4001 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
4002 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
4003 continue;
4004
4005 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
4006 if (InnerSum->isZero())
4007 continue;
4008 Formula F = Base;
4009
4010 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
4011 continue;
4012
4013 // Add the remaining pieces of the add back into the new formula.
4014 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
4015 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
4016 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4017 InnerSumSC->getValue()->getZExtValue())) {
4018 F.UnfoldedOffset =
4019 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4020 InnerSumSC->getValue()->getZExtValue());
4021 if (IsScaledReg) {
4022 F.ScaledReg = nullptr;
4023 F.Scale = 0;
4024 } else
4025 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
4026 } else if (IsScaledReg)
4027 F.ScaledReg = InnerSum;
4028 else
4029 F.BaseRegs[Idx] = InnerSum;
4030
4031 // Add J as its own register, or an unfolded immediate.
4032 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
4033 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
4034 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4035 SC->getValue()->getZExtValue()))
4036 F.UnfoldedOffset =
4037 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4038 SC->getValue()->getZExtValue());
4039 else
4040 F.BaseRegs.push_back(*J);
4041 // We may have changed the number of register in base regs, adjust the
4042 // formula accordingly.
4043 F.canonicalize(*L);
4044
4045 if (InsertFormula(LU, LUIdx, F))
4046 // If that formula hadn't been seen before, recurse to find more like
4047 // it.
4048 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
4049 // Because just Depth is not enough to bound compile time.
4050 // This means that every time AddOps.size() is greater 16^x we will add
4051 // x to Depth.
4052 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4053 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4054 }
4055}
4056
4057/// Split out subexpressions from adds and the bases of addrecs.
4058void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4059 Formula Base, unsigned Depth) {
4060 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4061 // Arbitrarily cap recursion to protect compile time.
4062 if (Depth >= 3)
4063 return;
4064
4065 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4066 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4067
4068 if (Base.Scale == 1)
4069 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4070 /* Idx */ -1, /* IsScaledReg */ true);
4071}
4072
4073/// Generate a formula consisting of all of the loop-dominating registers added
4074/// into a single register.
4075void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4076 Formula Base) {
4077 // This method is only interesting on a plurality of registers.
4078 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4079 (Base.UnfoldedOffset.isNonZero()) <=
4080 1)
4081 return;
4082
4083 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4084 // processing the formula.
4085 Base.unscale();
4087 Formula NewBase = Base;
4088 NewBase.BaseRegs.clear();
4089 Type *CombinedIntegerType = nullptr;
4090 for (const SCEV *BaseReg : Base.BaseRegs) {
4091 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4092 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4093 if (!CombinedIntegerType)
4094 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4095 Ops.push_back(BaseReg);
4096 }
4097 else
4098 NewBase.BaseRegs.push_back(BaseReg);
4099 }
4100
4101 // If no register is relevant, we're done.
4102 if (Ops.size() == 0)
4103 return;
4104
4105 // Utility function for generating the required variants of the combined
4106 // registers.
4107 auto GenerateFormula = [&](const SCEV *Sum) {
4108 Formula F = NewBase;
4109
4110 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4111 // opportunity to fold something. For now, just ignore such cases
4112 // rather than proceed with zero in a register.
4113 if (Sum->isZero())
4114 return;
4115
4116 F.BaseRegs.push_back(Sum);
4117 F.canonicalize(*L);
4118 (void)InsertFormula(LU, LUIdx, F);
4119 };
4120
4121 // If we collected at least two registers, generate a formula combining them.
4122 if (Ops.size() > 1) {
4123 SmallVector<SCEVUse, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4124 GenerateFormula(SE.getAddExpr(OpsCopy));
4125 }
4126
4127 // If we have an unfolded offset, generate a formula combining it with the
4128 // registers collected.
4129 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4130 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4131 Ops.push_back(SE.getConstant(CombinedIntegerType,
4132 NewBase.UnfoldedOffset.getFixedValue(), true));
4133 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4134 GenerateFormula(SE.getAddExpr(Ops));
4135 }
4136}
4137
4138/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4139void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4140 const Formula &Base, size_t Idx,
4141 bool IsScaledReg) {
4142 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4143 GlobalValue *GV = ExtractSymbol(G, SE);
4144 if (G->isZero() || !GV)
4145 return;
4146 Formula F = Base;
4147 F.BaseGV = GV;
4148 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4149 return;
4150 if (IsScaledReg)
4151 F.ScaledReg = G;
4152 else
4153 F.BaseRegs[Idx] = G;
4154 (void)InsertFormula(LU, LUIdx, F);
4155}
4156
4157/// Generate reuse formulae using symbolic offsets.
4158void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4159 Formula Base) {
4160 // We can't add a symbolic offset if the address already contains one.
4161 if (Base.BaseGV) return;
4162
4163 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4164 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4165 if (Base.Scale == 1)
4166 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4167 /* IsScaledReg */ true);
4168}
4169
4170/// Helper function for LSRInstance::GenerateConstantOffsets.
4171void LSRInstance::GenerateConstantOffsetsImpl(
4172 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4173 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4174
4175 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4176 Formula F = Base;
4177 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4178 return;
4179 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4180
4181 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4182 // Add the offset to the base register.
4183 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4184 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4185 // If it cancelled out, drop the base register, otherwise update it.
4186 if (NewG->isZero()) {
4187 if (IsScaledReg) {
4188 F.Scale = 0;
4189 F.ScaledReg = nullptr;
4190 } else
4191 F.deleteBaseReg(F.BaseRegs[Idx]);
4192 F.canonicalize(*L);
4193 } else if (IsScaledReg)
4194 F.ScaledReg = NewG;
4195 else
4196 F.BaseRegs[Idx] = NewG;
4197
4198 (void)InsertFormula(LU, LUIdx, F);
4199 }
4200 };
4201
4202 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4203
4204 // With constant offsets and constant steps, we can generate pre-inc
4205 // accesses by having the offset equal the step. So, for access #0 with a
4206 // step of 8, we generate a G - 8 base which would require the first access
4207 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4208 // for itself and hopefully becomes the base for other accesses. This means
4209 // means that a single pre-indexed access can be generated to become the new
4210 // base pointer for each iteration of the loop, resulting in no extra add/sub
4211 // instructions for pointer updating.
4212 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4213 const APInt *StepInt;
4214 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4215 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4216 : StepInt->getZExtValue();
4217
4218 for (Immediate Offset : Worklist) {
4219 if (Offset.isFixed()) {
4220 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4221 GenerateOffset(G, Offset);
4222 }
4223 }
4224 }
4225 }
4226 for (Immediate Offset : Worklist)
4227 GenerateOffset(G, Offset);
4228
4229 // TODO: It likely makes sense to extract the immediate corresponding to the
4230 // access type (i.e., set PreferScalable to AccessTy.MemTy &&
4231 // AccessTy.MemTy->isScalableTy()).
4232 Immediate Imm = ExtractImmediate(G, SE, /*PreferScalable=*/false);
4233 if (G->isZero() || Imm.isZero() ||
4234 !Base.BaseOffset.isCompatibleImmediate(Imm))
4235 return;
4236 Formula F = Base;
4237 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4238 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4239 return;
4240 if (IsScaledReg) {
4241 F.ScaledReg = G;
4242 } else {
4243 F.BaseRegs[Idx] = G;
4244 // We may generate non canonical Formula if G is a recurrent expr reg
4245 // related with current loop while F.ScaledReg is not.
4246 F.canonicalize(*L);
4247 }
4248 (void)InsertFormula(LU, LUIdx, F);
4249}
4250
4251/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4252void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4253 Formula Base) {
4254 // TODO: For now, just add the min and max offset, because it usually isn't
4255 // worthwhile looking at everything inbetween.
4257 Worklist.push_back(LU.MinOffset);
4258 if (LU.MaxOffset != LU.MinOffset)
4259 Worklist.push_back(LU.MaxOffset);
4260
4261 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4262 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4263 if (Base.Scale == 1)
4264 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4265 /* IsScaledReg */ true);
4266}
4267
4268/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4269/// == y -> x*c == y*c.
4270void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4271 Formula Base) {
4272 if (LU.Kind != LSRUse::ICmpZero) return;
4273
4274 // Determine the integer type for the base formula.
4275 Type *IntTy = Base.getType();
4276 if (!IntTy) return;
4277 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4278
4279 // Don't do this if there is more than one offset.
4280 if (LU.MinOffset != LU.MaxOffset) return;
4281
4282 // Check if transformation is valid. It is illegal to multiply pointer.
4283 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4284 return;
4285 for (const SCEV *BaseReg : Base.BaseRegs)
4286 if (BaseReg->getType()->isPointerTy())
4287 return;
4288 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4289
4290 // Check each interesting stride.
4291 for (int64_t Factor : Factors) {
4292 // Check that Factor can be represented by IntTy
4293 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4294 continue;
4295 // Check that the multiplication doesn't overflow.
4296 if (Base.BaseOffset.isMin() && Factor == -1)
4297 continue;
4298 // Not supporting scalable immediates.
4299 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4300 continue;
4301 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4302 assert(Factor != 0 && "Zero factor not expected!");
4303 if (NewBaseOffset.getFixedValue() / Factor !=
4304 Base.BaseOffset.getFixedValue())
4305 continue;
4306 // If the offset will be truncated at this use, check that it is in bounds.
4307 if (!IntTy->isPointerTy() &&
4308 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4309 continue;
4310
4311 // Check that multiplying with the use offset doesn't overflow.
4312 Immediate Offset = LU.MinOffset;
4313 if (Offset.isMin() && Factor == -1)
4314 continue;
4315 Offset = Offset.mulUnsigned(Factor);
4316 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4317 continue;
4318 // If the offset will be truncated at this use, check that it is in bounds.
4319 if (!IntTy->isPointerTy() &&
4320 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4321 continue;
4322
4323 Formula F = Base;
4324 F.BaseOffset = NewBaseOffset;
4325
4326 // Check that this scale is legal.
4327 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4328 continue;
4329
4330 // Compensate for the use having MinOffset built into it.
4331 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4332
4333 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4334
4335 // Check that multiplying with each base register doesn't overflow.
4336 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4337 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4338 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4339 goto next;
4340 }
4341
4342 // Check that multiplying with the scaled register doesn't overflow.
4343 if (F.ScaledReg) {
4344 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4345 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4346 continue;
4347 }
4348
4349 // Check that multiplying with the unfolded offset doesn't overflow.
4350 if (F.UnfoldedOffset.isNonZero()) {
4351 if (F.UnfoldedOffset.isMin() && Factor == -1)
4352 continue;
4353 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4354 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4355 Base.UnfoldedOffset.getFixedValue())
4356 continue;
4357 // If the offset will be truncated, check that it is in bounds.
4359 IntTy, F.UnfoldedOffset.getFixedValue()))
4360 continue;
4361 }
4362
4363 // If we make it here and it's legal, add it.
4364 (void)InsertFormula(LU, LUIdx, F);
4365 next:;
4366 }
4367}
4368
4369/// Generate stride factor reuse formulae by making use of scaled-offset address
4370/// modes, for example.
4371void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4372 // Determine the integer type for the base formula.
4373 Type *IntTy = Base.getType();
4374 if (!IntTy) return;
4375
4376 // If this Formula already has a scaled register, we can't add another one.
4377 // Try to unscale the formula to generate a better scale.
4378 if (Base.Scale != 0 && !Base.unscale())
4379 return;
4380
4381 assert(Base.Scale == 0 && "unscale did not did its job!");
4382
4383 // Check each interesting stride.
4384 for (int64_t Factor : Factors) {
4385 Base.Scale = Factor;
4386 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4387 // Check whether this scale is going to be legal.
4388 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4389 Base)) {
4390 // As a special-case, handle special out-of-loop Basic users specially.
4391 // TODO: Reconsider this special case.
4392 if (LU.Kind == LSRUse::Basic &&
4393 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4394 LU.AccessTy, Base) &&
4395 LU.AllFixupsOutsideLoop)
4396 LU.Kind = LSRUse::Special;
4397 else
4398 continue;
4399 }
4400 // For an ICmpZero, negating a solitary base register won't lead to
4401 // new solutions.
4402 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4403 Base.BaseOffset.isZero() && !Base.BaseGV)
4404 continue;
4405 // For each addrec base reg, if its loop is current loop, apply the scale.
4406 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4407 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4408 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4409 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4410 if (FactorS->isZero())
4411 continue;
4412 // Divide out the factor, ignoring high bits, since we'll be
4413 // scaling the value back up in the end.
4414 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4415 if (!Quotient->isZero()) {
4416 // TODO: This could be optimized to avoid all the copying.
4417 Formula F = Base;
4418 F.ScaledReg = Quotient;
4419 F.deleteBaseReg(F.BaseRegs[i]);
4420 // The canonical representation of 1*reg is reg, which is already in
4421 // Base. In that case, do not try to insert the formula, it will be
4422 // rejected anyway.
4423 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4424 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4425 continue;
4426 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4427 // non canonical Formula with ScaledReg's loop not being L.
4428 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4429 F.canonicalize(*L);
4430 (void)InsertFormula(LU, LUIdx, F);
4431 }
4432 }
4433 }
4434 }
4435}
4436
4437/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4438/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4439/// perform the extension/truncate and normalize again, as the normalized form
4440/// can result in folds that are not valid in the post-inc use contexts. The
4441/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4442static const SCEV *
4444 const SCEV *Expr, Type *ToTy,
4445 ScalarEvolution &SE) {
4446 const SCEV *Result = nullptr;
4447 for (auto &L : Loops) {
4448 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4449 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4450 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4451 if (!New || (Result && New != Result))
4452 return nullptr;
4453 Result = New;
4454 }
4455
4456 assert(Result && "failed to create expression");
4457 return Result;
4458}
4459
4460/// Generate reuse formulae from different IV types.
4461void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4462 // Don't bother truncating symbolic values.
4463 if (Base.BaseGV) return;
4464
4465 // Determine the integer type for the base formula.
4466 Type *DstTy = Base.getType();
4467 if (!DstTy) return;
4468 if (DstTy->isPointerTy())
4469 return;
4470
4471 // It is invalid to extend a pointer type so exit early if ScaledReg or
4472 // any of the BaseRegs are pointers.
4473 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4474 return;
4475 if (any_of(Base.BaseRegs,
4476 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4477 return;
4478
4480 for (auto &LF : LU.Fixups)
4481 Loops.push_back(LF.PostIncLoops);
4482
4483 for (Type *SrcTy : Types) {
4484 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4485 Formula F = Base;
4486
4487 // Sometimes SCEV is able to prove zero during ext transform. It may
4488 // happen if SCEV did not do all possible transforms while creating the
4489 // initial node (maybe due to depth limitations), but it can do them while
4490 // taking ext.
4491 if (F.ScaledReg) {
4492 const SCEV *NewScaledReg =
4493 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4494 if (!NewScaledReg || NewScaledReg->isZero())
4495 continue;
4496 F.ScaledReg = NewScaledReg;
4497 }
4498 bool HasZeroBaseReg = false;
4499 for (const SCEV *&BaseReg : F.BaseRegs) {
4500 const SCEV *NewBaseReg =
4501 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4502 if (!NewBaseReg || NewBaseReg->isZero()) {
4503 HasZeroBaseReg = true;
4504 break;
4505 }
4506 BaseReg = NewBaseReg;
4507 }
4508 if (HasZeroBaseReg)
4509 continue;
4510
4511 // TODO: This assumes we've done basic processing on all uses and
4512 // have an idea what the register usage is.
4513 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4514 continue;
4515
4516 F.canonicalize(*L);
4517 (void)InsertFormula(LU, LUIdx, F);
4518 }
4519 }
4520}
4521
4522namespace {
4523
4524/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4525/// modifications so that the search phase doesn't have to worry about the data
4526/// structures moving underneath it.
4527struct WorkItem {
4528 size_t LUIdx;
4529 Immediate Imm;
4530 const SCEV *OrigReg;
4531
4532 WorkItem(size_t LI, Immediate I, const SCEV *R)
4533 : LUIdx(LI), Imm(I), OrigReg(R) {}
4534
4535 void print(raw_ostream &OS) const;
4536 void dump() const;
4537};
4538
4539} // end anonymous namespace
4540
4541#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4542void WorkItem::print(raw_ostream &OS) const {
4543 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4544 << " , add offset " << Imm;
4545}
4546
4547LLVM_DUMP_METHOD void WorkItem::dump() const {
4548 print(errs()); errs() << '\n';
4549}
4550#endif
4551
4552/// Look for registers which are a constant distance apart and try to form reuse
4553/// opportunities between them.
4554void LSRInstance::GenerateCrossUseConstantOffsets() {
4555 // Group the registers by their value without any added constant offset.
4556 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4557
4558 DenseMap<const SCEV *, ImmMapTy> Map;
4559 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4561 for (const SCEV *Use : RegUses) {
4562 SCEVUse Reg = Use; // Make a copy for ExtractImmediate to modify.
4563 // TODO: Extract both scalable and fixed immediates (if present)?
4564 Immediate Imm = ExtractImmediate(Reg, SE);
4565 auto Pair = Map.try_emplace(Reg);
4566 if (Pair.second)
4567 Sequence.push_back(Reg);
4568 Pair.first->second.insert(std::make_pair(Imm, Use));
4569 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4570 }
4571
4572 // Now examine each set of registers with the same base value. Build up
4573 // a list of work to do and do the work in a separate step so that we're
4574 // not adding formulae and register counts while we're searching.
4575 SmallVector<WorkItem, 32> WorkItems;
4576 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4577 UniqueItems;
4578 for (const SCEV *Reg : Sequence) {
4579 const ImmMapTy &Imms = Map.find(Reg)->second;
4580
4581 // It's not worthwhile looking for reuse if there's only one offset.
4582 if (Imms.size() == 1)
4583 continue;
4584
4585 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4586 for (const auto &Entry
4587 : Imms) dbgs()
4588 << ' ' << Entry.first;
4589 dbgs() << '\n');
4590
4591 // Examine each offset.
4592 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4593 J != JE; ++J) {
4594 const SCEV *OrigReg = J->second;
4595
4596 Immediate JImm = J->first;
4597 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4598
4599 if (!isa<SCEVConstant>(OrigReg) &&
4600 UsedByIndicesMap[Reg].count() == 1) {
4601 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4602 << '\n');
4603 continue;
4604 }
4605
4606 // Conservatively examine offsets between this orig reg a few selected
4607 // other orig regs.
4608 Immediate First = Imms.begin()->first;
4609 Immediate Last = std::prev(Imms.end())->first;
4610 if (!First.isCompatibleImmediate(Last)) {
4611 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4612 << "\n");
4613 continue;
4614 }
4615 // Only scalable if both terms are scalable, or if one is scalable and
4616 // the other is 0.
4617 bool Scalable = First.isScalable() || Last.isScalable();
4618 int64_t FI = First.getKnownMinValue();
4619 int64_t LI = Last.getKnownMinValue();
4620 // Compute (First + Last) / 2 without overflow using the fact that
4621 // First + Last = 2 * (First + Last) + (First ^ Last).
4622 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4623 // If the result is negative and FI is odd and LI even (or vice versa),
4624 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4625 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4626 ImmMapTy::const_iterator OtherImms[] = {
4627 Imms.begin(), std::prev(Imms.end()),
4628 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4629 for (const auto &M : OtherImms) {
4630 if (M == J || M == JE) continue;
4631 if (!JImm.isCompatibleImmediate(M->first))
4632 continue;
4633
4634 // Compute the difference between the two.
4635 Immediate Imm = JImm.subUnsigned(M->first);
4636 for (unsigned LUIdx : UsedByIndices.set_bits())
4637 // Make a memo of this use, offset, and register tuple.
4638 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4639 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4640 }
4641 }
4642 }
4643
4644 Map.clear();
4645 Sequence.clear();
4646 UsedByIndicesMap.clear();
4647 UniqueItems.clear();
4648
4649 // Now iterate through the worklist and add new formulae.
4650 for (const WorkItem &WI : WorkItems) {
4651 size_t LUIdx = WI.LUIdx;
4652 LSRUse &LU = Uses[LUIdx];
4653 Immediate Imm = WI.Imm;
4654 const SCEV *OrigReg = WI.OrigReg;
4655
4656 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4657 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4658 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4659
4660 // TODO: Use a more targeted data structure.
4661 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4662 Formula F = LU.Formulae[L];
4663 // FIXME: The code for the scaled and unscaled registers looks
4664 // very similar but slightly different. Investigate if they
4665 // could be merged. That way, we would not have to unscale the
4666 // Formula.
4667 F.unscale();
4668 // Use the immediate in the scaled register.
4669 if (F.ScaledReg == OrigReg) {
4670 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4671 continue;
4672 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4673 // Don't create 50 + reg(-50).
4674 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4675 if (F.referencesReg(S))
4676 continue;
4677 Formula NewF = F;
4678 NewF.BaseOffset = Offset;
4679 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4680 NewF))
4681 continue;
4682 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4683
4684 // If the new scale is a constant in a register, and adding the constant
4685 // value to the immediate would produce a value closer to zero than the
4686 // immediate itself, then the formula isn't worthwhile.
4687 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4688 // FIXME: Do we need to do something for scalable immediates here?
4689 // A scalable SCEV won't be constant, but we might still have
4690 // something in the offset? Bail out for now to be safe.
4691 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4692 continue;
4693 if (C->getValue()->isNegative() !=
4694 (NewF.BaseOffset.isLessThanZero()) &&
4695 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4696 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4697 continue;
4698 }
4699
4700 // OK, looks good.
4701 NewF.canonicalize(*this->L);
4702 (void)InsertFormula(LU, LUIdx, NewF);
4703 } else {
4704 // Use the immediate in a base register.
4705 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4706 const SCEV *BaseReg = F.BaseRegs[N];
4707 if (BaseReg != OrigReg)
4708 continue;
4709 Formula NewF = F;
4710 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4711 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4712 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4713 continue;
4714 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4715 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4716 LU.Kind, LU.AccessTy, NewF)) {
4717 if (AMK == TTI::AMK_PostIndexed &&
4718 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4719 continue;
4720 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4721 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4722 continue;
4723 NewF = F;
4724 NewF.UnfoldedOffset = NewUnfoldedOffset;
4725 }
4726 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4727
4728 // If the new formula has a constant in a register, and adding the
4729 // constant value to the immediate would produce a value closer to
4730 // zero than the immediate itself, then the formula isn't worthwhile.
4731 for (const SCEV *NewReg : NewF.BaseRegs)
4732 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4733 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4734 goto skip_formula;
4735 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4736 .abs()
4737 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4738 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4739 .countr_zero() >=
4741 NewF.BaseOffset.getFixedValue()))
4742 goto skip_formula;
4743 }
4744
4745 // Ok, looks good.
4746 NewF.canonicalize(*this->L);
4747 (void)InsertFormula(LU, LUIdx, NewF);
4748 break;
4749 skip_formula:;
4750 }
4751 }
4752 }
4753 }
4754}
4755
4756/// Generate formulae for each use.
4757void
4758LSRInstance::GenerateAllReuseFormulae() {
4759 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4760 // queries are more precise.
4761 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4762 LSRUse &LU = Uses[LUIdx];
4763 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4764 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4765 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4766 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4767 }
4768 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4769 LSRUse &LU = Uses[LUIdx];
4770 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4771 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4772 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4773 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4774 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4775 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4776 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4777 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4778 }
4779 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4780 LSRUse &LU = Uses[LUIdx];
4781 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4782 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4783 }
4784
4785 GenerateCrossUseConstantOffsets();
4786
4787 LLVM_DEBUG(dbgs() << "\n"
4788 "After generating reuse formulae:\n";
4789 print_uses(dbgs()));
4790}
4791
4792/// If there are multiple formulae with the same set of registers used
4793/// by other uses, pick the best one and delete the others.
4794void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4795 DenseSet<const SCEV *> VisitedRegs;
4796 SmallPtrSet<const SCEV *, 16> Regs;
4797 SmallPtrSet<const SCEV *, 16> LoserRegs;
4798#ifndef NDEBUG
4799 bool ChangedFormulae = false;
4800#endif
4801
4802 // Collect the best formula for each unique set of shared registers. This
4803 // is reset for each use.
4804 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4805
4806 BestFormulaeTy BestFormulae;
4807
4808 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4809 LSRUse &LU = Uses[LUIdx];
4810 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4811 dbgs() << '\n');
4812
4813 bool Any = false;
4814 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4815 FIdx != NumForms; ++FIdx) {
4816 Formula &F = LU.Formulae[FIdx];
4817
4818 // Some formulas are instant losers. For example, they may depend on
4819 // nonexistent AddRecs from other loops. These need to be filtered
4820 // immediately, otherwise heuristics could choose them over others leading
4821 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4822 // avoids the need to recompute this information across formulae using the
4823 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4824 // the corresponding bad register from the Regs set.
4825 Cost CostF(L, SE, TTI, AMK);
4826 Regs.clear();
4827 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4828 &LoserRegs);
4829 if (CostF.isLoser()) {
4830 // During initial formula generation, undesirable formulae are generated
4831 // by uses within other loops that have some non-trivial address mode or
4832 // use the postinc form of the IV. LSR needs to provide these formulae
4833 // as the basis of rediscovering the desired formula that uses an AddRec
4834 // corresponding to the existing phi. Once all formulae have been
4835 // generated, these initial losers may be pruned.
4836 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4837 dbgs() << "\n");
4838 }
4839 else {
4841 for (const SCEV *Reg : F.BaseRegs) {
4842 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4843 Key.push_back(Reg);
4844 }
4845 if (F.ScaledReg &&
4846 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4847 Key.push_back(F.ScaledReg);
4848 // Unstable sort by host order ok, because this is only used for
4849 // uniquifying.
4850 llvm::sort(Key);
4851
4852 std::pair<BestFormulaeTy::const_iterator, bool> P =
4853 BestFormulae.insert(std::make_pair(Key, FIdx));
4854 if (P.second)
4855 continue;
4856
4857 Formula &Best = LU.Formulae[P.first->second];
4858
4859 Cost CostBest(L, SE, TTI, AMK);
4860 Regs.clear();
4861 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4862 HardwareLoopProfitable);
4863 if (CostF.isLess(CostBest))
4864 std::swap(F, Best);
4865 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4866 dbgs() << "\n"
4867 " in favor of formula ";
4868 Best.print(dbgs()); dbgs() << '\n');
4869 }
4870#ifndef NDEBUG
4871 ChangedFormulae = true;
4872#endif
4873 LU.DeleteFormula(F);
4874 --FIdx;
4875 --NumForms;
4876 Any = true;
4877 }
4878
4879 // Now that we've filtered out some formulae, recompute the Regs set.
4880 if (Any)
4881 LU.RecomputeRegs(LUIdx, RegUses);
4882
4883 // Reset this to prepare for the next use.
4884 BestFormulae.clear();
4885 }
4886
4887 LLVM_DEBUG(if (ChangedFormulae) {
4888 dbgs() << "\n"
4889 "After filtering out undesirable candidates:\n";
4890 print_uses(dbgs());
4891 });
4892}
4893
4894/// Estimate the worst-case number of solutions the solver might have to
4895/// consider. It almost never considers this many solutions because it prune the
4896/// search space, but the pruning isn't always sufficient.
4897size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4898 size_t Power = 1;
4899 for (const LSRUse &LU : Uses) {
4900 size_t FSize = LU.Formulae.size();
4901 if (FSize >= ComplexityLimit) {
4902 Power = ComplexityLimit;
4903 break;
4904 }
4905 Power *= FSize;
4906 if (Power >= ComplexityLimit)
4907 break;
4908 }
4909 return Power;
4910}
4911
4912/// When one formula uses a superset of the registers of another formula, it
4913/// won't help reduce register pressure (though it may not necessarily hurt
4914/// register pressure); remove it to simplify the system.
4915void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4916 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4917 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4918
4919 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4920 "which use a superset of registers used by other "
4921 "formulae.\n");
4922
4923 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4924 LSRUse &LU = Uses[LUIdx];
4925 bool Any = false;
4926 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4927 Formula &F = LU.Formulae[i];
4928 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4929 continue;
4930 // Look for a formula with a constant or GV in a register. If the use
4931 // also has a formula with that same value in an immediate field,
4932 // delete the one that uses a register.
4934 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4935 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4936 Formula NewF = F;
4937 //FIXME: Formulas should store bitwidth to do wrapping properly.
4938 // See PR41034.
4939 NewF.BaseOffset =
4940 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4941 (uint64_t)C->getValue()->getSExtValue());
4942 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4943 (I - F.BaseRegs.begin()));
4944 if (LU.HasFormulaWithSameRegs(NewF)) {
4945 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4946 dbgs() << '\n');
4947 LU.DeleteFormula(F);
4948 --i;
4949 --e;
4950 Any = true;
4951 break;
4952 }
4953 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4954 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4955 if (!F.BaseGV) {
4956 Formula NewF = F;
4957 NewF.BaseGV = GV;
4958 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4959 (I - F.BaseRegs.begin()));
4960 if (LU.HasFormulaWithSameRegs(NewF)) {
4961 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4962 dbgs() << '\n');
4963 LU.DeleteFormula(F);
4964 --i;
4965 --e;
4966 Any = true;
4967 break;
4968 }
4969 }
4970 }
4971 }
4972 }
4973 if (Any)
4974 LU.RecomputeRegs(LUIdx, RegUses);
4975 }
4976
4977 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4978 }
4979}
4980
4981/// When there are many registers for expressions like A, A+1, A+2, etc.,
4982/// allocate a single register for them.
4983void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4984 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4985 return;
4986
4987 LLVM_DEBUG(
4988 dbgs() << "The search space is too complex.\n"
4989 "Narrowing the search space by assuming that uses separated "
4990 "by a constant offset will use the same registers.\n");
4991
4992 // This is especially useful for unrolled loops.
4993
4994 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4995 LSRUse &LU = Uses[LUIdx];
4996 for (const Formula &F : LU.Formulae) {
4997 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4998 continue;
4999 assert((LU.Kind == LSRUse::Address || LU.Kind == LSRUse::ICmpZero) &&
5000 "Only address and cmp uses expected to have nonzero BaseOffset");
5001
5002 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
5003 if (!LUThatHas)
5004 continue;
5005
5006 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
5007 LU.Kind, LU.AccessTy))
5008 continue;
5009
5010 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
5011
5012 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
5013 LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
5014
5015 // Transfer the fixups of LU to LUThatHas.
5016 for (LSRFixup &Fixup : LU.Fixups) {
5017 Fixup.Offset += F.BaseOffset;
5018 LUThatHas->pushFixup(Fixup);
5019 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
5020 }
5021
5022#ifndef NDEBUG
5023 Type *FixupType = LUThatHas->Fixups[0].OperandValToReplace->getType();
5024 for (LSRFixup &Fixup : LUThatHas->Fixups)
5025 assert(Fixup.OperandValToReplace->getType() == FixupType &&
5026 "Expected all fixups to have the same type");
5027#endif
5028
5029 // Delete formulae from the new use which are no longer legal.
5030 bool Any = false;
5031 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
5032 Formula &F = LUThatHas->Formulae[i];
5033 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
5034 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
5035 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5036 LUThatHas->DeleteFormula(F);
5037 --i;
5038 --e;
5039 Any = true;
5040 }
5041 }
5042
5043 if (Any)
5044 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
5045
5046 // Delete the old use.
5047 DeleteUse(LU, LUIdx);
5048 --LUIdx;
5049 --NumUses;
5050 break;
5051 }
5052 }
5053
5054 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5055}
5056
5057/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5058/// we've done more filtering, as it may be able to find more formulae to
5059/// eliminate.
5060void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5061 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5062 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5063
5064 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5065 "undesirable dedicated registers.\n");
5066
5067 FilterOutUndesirableDedicatedRegisters();
5068
5069 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5070 }
5071}
5072
5073/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
5074/// Pick the best one and delete the others.
5075/// This narrowing heuristic is to keep as many formulae with different
5076/// Scale and ScaledReg pair as possible while narrowing the search space.
5077/// The benefit is that it is more likely to find out a better solution
5078/// from a formulae set with more Scale and ScaledReg variations than
5079/// a formulae set with the same Scale and ScaledReg. The picking winner
5080/// reg heuristic will often keep the formulae with the same Scale and
5081/// ScaledReg and filter others, and we want to avoid that if possible.
5082void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5083 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5084 return;
5085
5086 LLVM_DEBUG(
5087 dbgs() << "The search space is too complex.\n"
5088 "Narrowing the search space by choosing the best Formula "
5089 "from the Formulae with the same Scale and ScaledReg.\n");
5090
5091 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5092 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5093
5094 BestFormulaeTy BestFormulae;
5095#ifndef NDEBUG
5096 bool ChangedFormulae = false;
5097#endif
5098 DenseSet<const SCEV *> VisitedRegs;
5099 SmallPtrSet<const SCEV *, 16> Regs;
5100
5101 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5102 LSRUse &LU = Uses[LUIdx];
5103 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5104 dbgs() << '\n');
5105
5106 // Return true if Formula FA is better than Formula FB.
5107 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5108 // First we will try to choose the Formula with fewer new registers.
5109 // For a register used by current Formula, the more the register is
5110 // shared among LSRUses, the less we increase the register number
5111 // counter of the formula.
5112 size_t FARegNum = 0;
5113 for (const SCEV *Reg : FA.BaseRegs) {
5114 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5115 FARegNum += (NumUses - UsedByIndices.count() + 1);
5116 }
5117 size_t FBRegNum = 0;
5118 for (const SCEV *Reg : FB.BaseRegs) {
5119 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5120 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5121 }
5122 if (FARegNum != FBRegNum)
5123 return FARegNum < FBRegNum;
5124
5125 // If the new register numbers are the same, choose the Formula with
5126 // less Cost.
5127 Cost CostFA(L, SE, TTI, AMK);
5128 Cost CostFB(L, SE, TTI, AMK);
5129 Regs.clear();
5130 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5131 Regs.clear();
5132 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5133 return CostFA.isLess(CostFB);
5134 };
5135
5136 bool Any = false;
5137 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5138 ++FIdx) {
5139 Formula &F = LU.Formulae[FIdx];
5140 if (!F.ScaledReg)
5141 continue;
5142 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5143 if (P.second)
5144 continue;
5145
5146 Formula &Best = LU.Formulae[P.first->second];
5147 if (IsBetterThan(F, Best))
5148 std::swap(F, Best);
5149 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5150 dbgs() << "\n"
5151 " in favor of formula ";
5152 Best.print(dbgs()); dbgs() << '\n');
5153#ifndef NDEBUG
5154 ChangedFormulae = true;
5155#endif
5156 LU.DeleteFormula(F);
5157 --FIdx;
5158 --NumForms;
5159 Any = true;
5160 }
5161 if (Any)
5162 LU.RecomputeRegs(LUIdx, RegUses);
5163
5164 // Reset this to prepare for the next use.
5165 BestFormulae.clear();
5166 }
5167
5168 LLVM_DEBUG(if (ChangedFormulae) {
5169 dbgs() << "\n"
5170 "After filtering out undesirable candidates:\n";
5171 print_uses(dbgs());
5172 });
5173}
5174
5175/// If we are over the complexity limit, filter out any post-inc prefering
5176/// variables to only post-inc values.
5177void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5178 if (AMK != TTI::AMK_PostIndexed)
5179 return;
5180 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5181 return;
5182
5183 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5184 "Narrowing the search space by choosing the lowest "
5185 "register Formula for PostInc Uses.\n");
5186
5187 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5188 LSRUse &LU = Uses[LUIdx];
5189
5190 if (LU.Kind != LSRUse::Address)
5191 continue;
5192 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5193 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5194 continue;
5195
5196 size_t MinRegs = std::numeric_limits<size_t>::max();
5197 for (const Formula &F : LU.Formulae)
5198 MinRegs = std::min(F.getNumRegs(), MinRegs);
5199
5200 bool Any = false;
5201 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5202 ++FIdx) {
5203 Formula &F = LU.Formulae[FIdx];
5204 if (F.getNumRegs() > MinRegs) {
5205 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5206 dbgs() << "\n");
5207 LU.DeleteFormula(F);
5208 --FIdx;
5209 --NumForms;
5210 Any = true;
5211 }
5212 }
5213 if (Any)
5214 LU.RecomputeRegs(LUIdx, RegUses);
5215
5216 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5217 break;
5218 }
5219
5220 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5221}
5222
5223/// The function delete formulas with high registers number expectation.
5224/// Assuming we don't know the value of each formula (already delete
5225/// all inefficient), generate probability of not selecting for each
5226/// register.
5227/// For example,
5228/// Use1:
5229/// reg(a) + reg({0,+,1})
5230/// reg(a) + reg({-1,+,1}) + 1
5231/// reg({a,+,1})
5232/// Use2:
5233/// reg(b) + reg({0,+,1})
5234/// reg(b) + reg({-1,+,1}) + 1
5235/// reg({b,+,1})
5236/// Use3:
5237/// reg(c) + reg(b) + reg({0,+,1})
5238/// reg(c) + reg({b,+,1})
5239///
5240/// Probability of not selecting
5241/// Use1 Use2 Use3
5242/// reg(a) (1/3) * 1 * 1
5243/// reg(b) 1 * (1/3) * (1/2)
5244/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5245/// reg({-1,+,1}) (2/3) * (2/3) * 1
5246/// reg({a,+,1}) (2/3) * 1 * 1
5247/// reg({b,+,1}) 1 * (2/3) * (2/3)
5248/// reg(c) 1 * 1 * 0
5249///
5250/// Now count registers number mathematical expectation for each formula:
5251/// Note that for each use we exclude probability if not selecting for the use.
5252/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
5253/// probabilty 1/3 of not selecting for Use1).
5254/// Use1:
5255/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5256/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5257/// reg({a,+,1}) 1
5258/// Use2:
5259/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5260/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5261/// reg({b,+,1}) 2/3
5262/// Use3:
5263/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5264/// reg(c) + reg({b,+,1}) 1 + 2/3
5265void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5266 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5267 return;
5268 // Ok, we have too many of formulae on our hands to conveniently handle.
5269 // Use a rough heuristic to thin out the list.
5270
5271 // Set of Regs wich will be 100% used in final solution.
5272 // Used in each formula of a solution (in example above this is reg(c)).
5273 // We can skip them in calculations.
5274 SmallPtrSet<const SCEV *, 4> UniqRegs;
5275 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5276
5277 // Map each register to probability of not selecting
5278 DenseMap <const SCEV *, float> RegNumMap;
5279 for (const SCEV *Reg : RegUses) {
5280 if (UniqRegs.count(Reg))
5281 continue;
5282 float PNotSel = 1;
5283 for (const LSRUse &LU : Uses) {
5284 if (!LU.Regs.count(Reg))
5285 continue;
5286 float P = LU.getNotSelectedProbability(Reg);
5287 if (P != 0.0)
5288 PNotSel *= P;
5289 else
5290 UniqRegs.insert(Reg);
5291 }
5292 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5293 }
5294
5295 LLVM_DEBUG(
5296 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5297
5298 // Delete formulas where registers number expectation is high.
5299 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5300 LSRUse &LU = Uses[LUIdx];
5301 // If nothing to delete - continue.
5302 if (LU.Formulae.size() < 2)
5303 continue;
5304 // This is temporary solution to test performance. Float should be
5305 // replaced with round independent type (based on integers) to avoid
5306 // different results for different target builds.
5307 float FMinRegNum = LU.Formulae[0].getNumRegs();
5308 float FMinARegNum = LU.Formulae[0].getNumRegs();
5309 size_t MinIdx = 0;
5310 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5311 Formula &F = LU.Formulae[i];
5312 float FRegNum = 0;
5313 float FARegNum = 0;
5314 for (const SCEV *BaseReg : F.BaseRegs) {
5315 if (UniqRegs.count(BaseReg))
5316 continue;
5317 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5318 if (isa<SCEVAddRecExpr>(BaseReg))
5319 FARegNum +=
5320 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5321 }
5322 if (const SCEV *ScaledReg = F.ScaledReg) {
5323 if (!UniqRegs.count(ScaledReg)) {
5324 FRegNum +=
5325 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5326 if (isa<SCEVAddRecExpr>(ScaledReg))
5327 FARegNum +=
5328 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5329 }
5330 }
5331 if (FMinRegNum > FRegNum ||
5332 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5333 FMinRegNum = FRegNum;
5334 FMinARegNum = FARegNum;
5335 MinIdx = i;
5336 }
5337 }
5338 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5339 dbgs() << " with min reg num " << FMinRegNum << '\n');
5340 if (MinIdx != 0)
5341 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5342 while (LU.Formulae.size() != 1) {
5343 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5344 dbgs() << '\n');
5345 LU.Formulae.pop_back();
5346 }
5347 LU.RecomputeRegs(LUIdx, RegUses);
5348 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5349 Formula &F = LU.Formulae[0];
5350 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5351 // When we choose the formula, the regs become unique.
5352 UniqRegs.insert_range(F.BaseRegs);
5353 if (F.ScaledReg)
5354 UniqRegs.insert(F.ScaledReg);
5355 }
5356 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5357}
5358
5359// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5360// whether the addressing offset +C would be legal where the negative offset -C
5361// is not.
//
// Returns false when the two SCEVs have different types, when the rest of the
// first guard rejects them, or when ScalarEvolution cannot compute a constant
// difference between them.
// NOTE(review): the embedded listing numbers skip 5362 and 5367, so the line
// carrying the function header and part of the first condition appear to be
// missing from this copy -- confirm against the upstream file.
5363 ScalarEvolution &SE, const SCEV *Best,
5364 const SCEV *Reg,
5365 MemAccessTy AccessType) {
5366 if (Best->getType() != Reg->getType() ||
5368 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5369 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5370 return false;
// Constant distance between the two expressions, if SE can determine one.
5371 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5372 if (!Diff)
5373 return false;
5374
// True only if +Diff is a legal base offset (with a base register, no global
// and no scale) for this access type while -Diff is not.
5375 return TTI.isLegalAddressingMode(
5376 AccessType.MemTy, /*BaseGV=*/nullptr,
5377 /*BaseOffset=*/Diff->getSExtValue(),
5378 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5379 !TTI.isLegalAddressingMode(
5380 AccessType.MemTy, /*BaseGV=*/nullptr,
5381 /*BaseOffset=*/-Diff->getSExtValue(),
5382 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5383}
5384
5385/// Pick a register which seems likely to be profitable, and then in any use
5386/// which has any reference to that register, delete all formulae which do not
5387/// reference that register.
///
/// Repeats until EstimateSearchSpaceComplexity() drops below ComplexityLimit.
/// Registers picked in earlier iterations are remembered in Taken so that each
/// iteration selects a different one.
/// NOTE(review): the embedded listing numbers skip 5420, so the start of the
/// call inside the tie-break condition appears to be missing from this copy --
/// confirm against the upstream file.
5388void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5389 // With all other options exhausted, loop until the system is simple
5390 // enough to handle.
5391 SmallPtrSet<const SCEV *, 4> Taken;
5392 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5393 // Ok, we have too many formulae on our hands to conveniently handle.
5394 // Use a rough heuristic to thin out the list.
5395 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5396
5397 // Pick the register which is used by the most LSRUses, which is likely
5398 // to be a good reuse register candidate.
5399 const SCEV *Best = nullptr;
5400 unsigned BestNum = 0;
5401 for (const SCEV *Reg : RegUses) {
// Skip registers that were already chosen as winners.
5402 if (Taken.count(Reg))
5403 continue;
5404 if (!Best) {
// The first untaken register becomes the initial candidate.
5405 Best = Reg;
5406 BestNum = RegUses.getUsedByIndices(Reg).count();
5407 } else {
5408 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5409 if (Count > BestNum) {
5410 Best = Reg;
5411 BestNum = Count;
5412 }
5413
5414 // If the scores are the same, but the Reg is simpler for the target
5415 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5416 // handle +C but not -C), opt for the simpler formula.
// NOTE(review): this check also fires right after the Count > BestNum
// branch above has set Best = Reg and BestNum = Count, in which case the
// comparison runs with Best == Reg.
5417 if (Count == BestNum) {
// Only the first use that references Reg is consulted for the access type.
5418 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5419 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5421 Uses[LUIdx].AccessTy)) {
5422 Best = Reg;
5423 BestNum = Count;
5424 }
5425 }
5426 }
5427 }
5428 assert(Best && "Failed to find best LSRUse candidate");
5429
5430 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5431 << " will yield profitable reuse.\n");
5432 Taken.insert(Best);
5433
5434 // In any use with formulae which references this register, delete formulae
5435 // which don't reference it.
5436 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5437 LSRUse &LU = Uses[LUIdx];
5438 if (!LU.Regs.count(Best)) continue;
5439
5440 bool Any = false;
5441 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5442 Formula &F = LU.Formulae[i];
5443 if (!F.referencesReg(Best)) {
5444 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5445 LU.DeleteFormula(F);
// Shrink the bound and step back so the same slot is visited again.
5446 --e;
5447 --i;
5448 Any = true;
5449 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5450 continue;
5451 }
5452 }
5453
// Refresh the use's register set only if something was deleted.
5454 if (Any)
5455 LU.RecomputeRegs(LUIdx, RegUses);
5456 }
5457
5458 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5459 }
5460}
5461
5462/// If there are an extraordinary number of formulae to choose from, use some
5463/// rough heuristics to prune down the number of formulae. This keeps the main
5464/// solver from taking an extraordinary amount of time in some worst-case
5465/// scenarios.
///
/// The narrowing passes run in a fixed order. The ones whose bodies are visible
/// in this part of the file (FilterPostInc, DeletingCostlyFormulas and
/// PickingWinnerRegs) each re-check the complexity limit themselves.
5466void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5467 NarrowSearchSpaceByDetectingSupersets();
5468 NarrowSearchSpaceByCollapsingUnrolledCode();
5469 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
// NOTE(review): the embedded listing numbers skip 5470, so a line (possibly a
// guard on the next call) appears to be missing from this copy -- confirm
// against the upstream file.
5471 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5472 NarrowSearchSpaceByFilterPostInc();
// LSRExpNarrow selects between the expectation-based pruning and the
// winner-register pruning as the final step.
5473 if (LSRExpNarrow)
5474 NarrowSearchSpaceByDeletingCostlyFormulas();
5475 else
5476 NarrowSearchSpaceByPickingWinnerRegs();
5477}
5478
5479/// This is the recursive solver.
5480void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5481 Cost &SolutionCost,
5482 SmallVectorImpl<const Formula *> &Workspace,
5483 const Cost &CurCost,
5484 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5485 DenseSet<const SCEV *> &VisitedRegs) const {
5486 // Some ideas:
5487 // - prune more:
5488 // - use more aggressive filtering
5489 // - sort the formula so that the most profitable solutions are found first
5490 // - sort the uses too
5491 // - search faster:
5492 // - don't compute a cost, and then compare. compare while computing a cost
5493 // and bail early.
5494 // - track register sets with SmallBitVector
5495
5496 const LSRUse &LU = Uses[Workspace.size()];
5497
5498 // If this use references any register that's already a part of the
5499 // in-progress solution, consider it a requirement that a formula must
5500 // reference that register in order to be considered. This prunes out
5501 // unprofitable searching.
5502 SmallSetVector<const SCEV *, 4> ReqRegs;
5503 for (const SCEV *S : CurRegs)
5504 if (LU.Regs.count(S))
5505 ReqRegs.insert(S);
5506
5507 SmallPtrSet<const SCEV *, 16> NewRegs;
5508 Cost NewCost(L, SE, TTI, AMK);
5509 for (const Formula &F : LU.Formulae) {
5510 // Ignore formulae which may not be ideal in terms of register reuse of
5511 // ReqRegs. The formula should use all required registers before
5512 // introducing new ones.
5513 // This can sometimes (notably when trying to favour postinc) lead to
5514 // sub-optimial decisions. There it is best left to the cost modelling to
5515 // get correct.
5516 if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
5517 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5518 for (const SCEV *Reg : ReqRegs) {
5519 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5520 is_contained(F.BaseRegs, Reg)) {
5521 --NumReqRegsToFind;
5522 if (NumReqRegsToFind == 0)
5523 break;
5524 }
5525 }
5526 if (NumReqRegsToFind != 0) {
5527 // If none of the formulae satisfied the required registers, then we could
5528 // clear ReqRegs and try again. Currently, we simply give up in this case.
5529 continue;
5530 }
5531 }
5532
5533 // Evaluate the cost of the current formula. If it's already worse than
5534 // the current best, prune the search at that point.
5535 NewCost = CurCost;
5536 NewRegs = CurRegs;
5537 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5538 if (NewCost.isLess(SolutionCost)) {
5539 Workspace.push_back(&F);
5540 if (Workspace.size() != Uses.size()) {
5541 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5542 NewRegs, VisitedRegs);
5543 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5544 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5545 } else {
5546 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5547 dbgs() << ".\nRegs:\n";
5548 for (const SCEV *S : NewRegs) dbgs()
5549 << "- " << *S << "\n";
5550 dbgs() << '\n');
5551
5552 SolutionCost = NewCost;
5553 Solution = Workspace;
5554 }
5555 Workspace.pop_back();
5556 }
5557 }
5558}
5559
5560/// Choose one formula from each use. Return the results in the given Solution
5561/// vector.
///
/// Solution is left empty when SolveRecurse finds nothing, and is cleared again
/// when the baseline cost is lower than the chosen solution's cost and dropping
/// unprofitable solutions is enabled.
/// NOTE(review): the embedded listing numbers skip 5563, 5595 and 5601, so the
/// declaration of Workspace, the head of the switch and the BOU_UNSET result
/// appear to be missing from this copy -- confirm against the upstream file.
5562void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
// Start from a losing cost so that any real candidate compares as better.
5564 Cost SolutionCost(L, SE, TTI, AMK);
5565 SolutionCost.Lose();
5566 Cost CurCost(L, SE, TTI, AMK);
5567 SmallPtrSet<const SCEV *, 16> CurRegs;
5568 DenseSet<const SCEV *> VisitedRegs;
5569 Workspace.reserve(Uses.size());
5570
5571 // SolveRecurse does all the work.
5572 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5573 CurRegs, VisitedRegs);
5574 if (Solution.empty()) {
5575 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5576 return;
5577 }
5578
5579 // Ok, we've now made all our decisions.
5580 LLVM_DEBUG(dbgs() << "\n"
5581 "The chosen solution requires ";
5582 SolutionCost.print(dbgs()); dbgs() << ":\n";
5583 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5584 dbgs() << " ";
5585 Uses[i].print(dbgs());
5586 dbgs() << "\n"
5587 " ";
5588 Solution[i]->print(dbgs());
5589 dbgs() << '\n';
5590 });
5591
5592 assert(Solution.size() == Uses.size() && "Malformed solution!");
5593
// Resolve the tri-state setting into a plain bool: an explicit true or false
// is used as given; the unset case is handled by a line that is not visible
// here.
5594 const bool EnableDropUnprofitableSolution = [&] {
5596 case cl::BOU_TRUE:
5597 return true;
5598 case cl::BOU_FALSE:
5599 return false;
5600 case cl::BOU_UNSET:
5602 }
5603 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5604 }();
5605
// Compare against the cost of leaving the loop unchanged. The solution is only
// discarded when dropping is enabled; otherwise just a debug note is emitted.
5606 if (BaselineCost.isLess(SolutionCost)) {
5607 if (!EnableDropUnprofitableSolution)
5608 LLVM_DEBUG(
5609 dbgs() << "Baseline is more profitable than chosen solution, "
5610 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5611 else {
5612 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5613 "solution, dropping LSR solution.\n";);
5614 Solution.clear();
5615 }
5616 }
5617}
5618
5619/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5620/// as we can go while still being dominated by the input positions. This helps
5621/// canonicalize the insert position, which encourages sharing.
///
/// IP is the starting insert position and Inputs are the instructions that must
/// dominate the result. Returns the last position that was accepted, which is
/// IP itself if no hoisting was possible.
/// NOTE(review): the embedded listing numbers skip 5622, so the line carrying
/// the return type appears to be missing from this copy -- confirm against the
/// upstream file.
5623LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5624 const SmallVectorImpl<Instruction *> &Inputs)
5625 const {
// Tentative is the candidate position being tested; IP is only updated once a
// candidate has been accepted.
5626 Instruction *Tentative = &*IP;
5627 while (true) {
5628 bool AllDominate = true;
5629 Instruction *BetterPos = nullptr;
5630 // Don't bother attempting to insert before a catchswitch, their basic block
5631 // cannot have other non-PHI instructions.
5632 if (isa<CatchSwitchInst>(Tentative))
5633 return IP;
5634
5635 for (Instruction *Inst : Inputs) {
// An input that is the candidate itself, or does not dominate it, rules the
// candidate out.
5636 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5637 AllDominate = false;
5638 break;
5639 }
5640 // Attempt to find an insert position in the middle of the block,
5641 // instead of at the end, so that it can be used for other expansions.
5642 if (Tentative->getParent() == Inst->getParent() &&
5643 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5644 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5645 }
5646 if (!AllDominate)
5647 break;
// Accept the candidate, preferring the position right after an input in the
// same block when one was found.
5648 if (BetterPos)
5649 IP = BetterPos->getIterator();
5650 else
5651 IP = Tentative->getIterator();
5652
5653 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5654 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5655
// Walk up the immediate dominators until reaching a block that is acceptable
// with respect to loop nesting; give up and return IP if the tree runs out.
5656 BasicBlock *IDom;
5657 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5658 if (!Rung) return IP;
5659 Rung = Rung->getIDom();
5660 if (!Rung) return IP;
5661 IDom = Rung->getBlock();
5662
5663 // Don't climb into a loop though.
5664 const Loop *IDomLoop = LI.getLoopFor(IDom);
5665 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
// Stop at a dominator that is either at a shallower loop depth than IP, or at
// the same depth and in the same loop.
5666 if (IDomDepth <= IPLoopDepth &&
5667 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5668 break;
5669 }
5670
// The next candidate is the end of the chosen dominating block.
5671 Tentative = IDom->getTerminator();
5672 }
5673
5674 return IP;
5675}
5676
5677/// Determine an input position which will be dominated by the operands and
5678/// which will dominate the result.
5679BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5680 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5681 // Collect some instructions which must be dominated by the
5682 // expanding replacement. These must be dominated by any operands that
5683 // will be required in the expansion.
5684 SmallVector<Instruction *, 4> Inputs;
5685 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5686 Inputs.push_back(I);
5687 if (LU.Kind == LSRUse::ICmpZero)
5688 if (Instruction *I =
5689 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5690 Inputs.push_back(I);
5691 if (LF.PostIncLoops.count(L)) {
5692 if (LF.isUseFullyOutsideLoop(L))
5693 Inputs.push_back(L->getLoopLatch()->getTerminator());
5694 else
5695 Inputs.push_back(IVIncInsertPos);
5696 }
5697 // The expansion must also be dominated by the increment positions of any
5698 // loops it for which it is using post-inc mode.
5699 for (const Loop *PIL : LF.PostIncLoops) {
5700 if (PIL == L) continue;
5701
5702 // Be dominated by the loop exit.
5703 SmallVector<BasicBlock *, 4> ExitingBlocks;
5704 PIL->getExitingBlocks(ExitingBlocks);
5705 if (!ExitingBlocks.empty()) {
5706 BasicBlock *BB = ExitingBlocks[0];
5707 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5708 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5709 Inputs.push_back(BB->getTerminator());
5710 }
5711 }
5712
5713 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5714 "Insertion point must be a normal instruction");
5715
5716 // Then, climb up the immediate dominator tree as far as we can go while
5717 // still being dominated by the input positions.
5718 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5719
5720 // Don't insert instructions before PHI nodes.
5721 while (isa<PHINode>(IP)) ++IP;
5722
5723 // Ignore landingpad instructions.
5724 while (IP->isEHPad()) ++IP;
5725
5726 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5727 // IP consistent across expansions and allows the previously inserted
5728 // instructions to be reused by subsequent expansion.
5729 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5730 ++IP;
5731
5732 return IP;
5733}
5734
5735/// Emit instructions for the leading candidate expression for this LSRUse (this
5736/// is called "expanding").
///
/// LU and LF identify the use and the fixup being rewritten, F is the formula
/// to materialize and IP the lowest allowed insert position. Returns the
/// expanded value, or LF.OperandValToReplace unchanged when LU.RigidFormula is
/// set. For ICmpZero uses, operand 1 of the icmp is rewritten here as well and
/// its previous value, if it is an instruction, is appended to DeadInsts.
/// NOTE(review): the embedded listing numbers skip 5766, 5893, 5907 and 5911,
/// so the declaration of Ops and the starts of the statements that define Cast
/// and C appear to be missing from this copy -- confirm against the upstream
/// file.
5737Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5738 const Formula &F, BasicBlock::iterator IP,
5739 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
// A rigid formula is not expanded; the existing operand is kept as is.
5740 if (LU.RigidFormula)
5741 return LF.OperandValToReplace;
5742
5743 // Determine an input position which will be dominated by the operands and
5744 // which will dominate the result.
5745 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5746 Rewriter.setInsertPoint(&*IP);
5747
5748 // Inform the Rewriter if we have a post-increment use, so that it can
5749 // perform an advantageous expansion.
5750 Rewriter.setPostInc(LF.PostIncLoops);
5751
5752 // This is the type that the user actually needs.
5753 Type *OpTy = LF.OperandValToReplace->getType();
5754 // This will be the type that we'll initially expand to.
5755 Type *Ty = F.getType();
5756 if (!Ty)
5757 // No type known; just expand directly to the ultimate type.
5758 Ty = OpTy;
5759 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5760 // Expand directly to the ultimate type if it's the right size.
5761 Ty = OpTy;
5762 // This is the type to do integer arithmetic in.
5763 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5764
5765 // Build up a list of operands to add together to form the full base.
5767
5768 // Expand the BaseRegs portion.
5769 for (const SCEV *Reg : F.BaseRegs) {
5770 assert(!Reg->isZero() && "Zero allocated in a base register!");
5771
5772 // If we're expanding for a post-inc user, make the post-inc adjustment.
5773 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
// Each expanded register is wrapped as an unknown SCEV before being added to
// the operand list.
5774 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5775 }
5776
5777 // Expand the ScaledReg portion.
// ICmpScaledV, when set, becomes the new second operand of an ICmpZero user
// (see the end of this function).
5778 Value *ICmpScaledV = nullptr;
5779 if (F.Scale != 0) {
5780 const SCEV *ScaledS = F.ScaledReg;
5781
5782 // If we're expanding for a post-inc user, make the post-inc adjustment.
// NOTE(review): the BaseRegs loop above passes LF.PostIncLoops to the same
// function without a cast, so this const_cast looks unnecessary -- confirm.
5783 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5784 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5785
5786 if (LU.Kind == LSRUse::ICmpZero) {
5787 // Expand ScaleReg as if it was part of the base regs.
5788 if (F.Scale == 1)
5789 Ops.push_back(
5790 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5791 else {
5792 // An interesting way of "folding" with an icmp is to use a negated
5793 // scale, which we'll implement by inserting it into the other operand
5794 // of the icmp.
5795 assert(F.Scale == -1 &&
5796 "The only scale supported by ICmpZero uses is -1!");
5797 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5798 }
5799 } else {
5800 // Otherwise just expand the scaled register and an explicit scale,
5801 // which is expected to be matched as part of the address.
5802
5803 // Flush the operand list to suppress SCEVExpander hoisting address modes.
5804 // Unless the addressing mode will not be folded.
5805 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5806 isAMCompletelyFolded(TTI, LU, F)) {
5807 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5808 Ops.clear();
5809 Ops.push_back(SE.getUnknown(FullV));
5810 }
5811 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
// A scale of 1 needs no explicit multiply.
5812 if (F.Scale != 1)
5813 ScaledS =
5814 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5815 Ops.push_back(ScaledS);
5816 }
5817 }
5818
5819 // Expand the GV portion.
5820 if (F.BaseGV) {
5821 // Flush the operand list to suppress SCEVExpander hoisting.
5822 if (!Ops.empty()) {
5823 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5824 Ops.clear();
5825 Ops.push_back(SE.getUnknown(FullV));
5826 }
5827 Ops.push_back(SE.getUnknown(F.BaseGV));
5828 }
5829
5830 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5831 // unfolded offsets. LSR assumes they both live next to their uses.
5832 if (!Ops.empty()) {
5833 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5834 Ops.clear();
5835 Ops.push_back(SE.getUnknown(FullV));
5836 }
5837
5838 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5839 // out at this point, or should we generate a SCEV adding together mixed
5840 // offsets?
5841 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5842 "Expanding mismatched offsets\n");
5843 // Expand the immediate portion.
// The immediate is the sum of the formula's base offset and the fixup's own
// offset.
5844 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5845 if (Offset.isNonZero()) {
5846 if (LU.Kind == LSRUse::ICmpZero) {
5847 // The other interesting way of "folding" with an ICmpZero is to use a
5848 // negated immediate.
5849 if (!ICmpScaledV) {
5850 // TODO: Avoid implicit trunc?
5851 // See https://github.com/llvm/llvm-project/issues/112510.
5852 ICmpScaledV = ConstantInt::getSigned(
5853 IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
5854 } else {
// A negated scaled register was already chosen for the icmp operand: move it
// into the sum and put the immediate in the icmp operand instead.
5855 Ops.push_back(SE.getUnknown(ICmpScaledV));
5856 ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
5857 /*ImplicitTrunc=*/true);
5858 }
5859 } else {
5860 // Just add the immediate values. These again are expected to be matched
5861 // as part of the address.
5862 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5863 }
5864 }
5865
5866 // Expand the unfolded offset portion.
5867 Immediate UnfoldedOffset = F.UnfoldedOffset;
5868 if (UnfoldedOffset.isNonZero()) {
5869 // Just add the immediate values.
5870 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5871 }
5872
5873 // Emit instructions summing all the operands.
// With no operands at all the expression is the constant zero of IntTy.
5874 const SCEV *FullS = Ops.empty() ?
5875 SE.getConstant(IntTy, 0) :
5876 SE.getAddExpr(Ops);
5877 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5878
5879 // We're done expanding now, so reset the rewriter.
5880 Rewriter.clearPostInc();
5881
5882 // An ICmpZero Formula represents an ICmp which we're handling as a
5883 // comparison against zero. Now that we've expanded an expression for that
5884 // form, update the ICmp's other operand.
5885 if (LU.Kind == LSRUse::ICmpZero) {
5886 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
// The operand about to be replaced may become dead; queue it for cleanup.
5887 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5888 DeadInsts.emplace_back(OperandIsInstr);
5889 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5890 "a scale at the same time!");
5891 if (F.Scale == -1) {
// Cast the chosen operand to the type the icmp expects, if it differs.
5892 if (ICmpScaledV->getType() != OpTy) {
5894 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5895 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5896 ICmpScaledV = Cast;
5897 }
5898 CI->setOperand(1, ICmpScaledV);
5899 } else {
5900 // A scale of 1 means that the scale has been expanded as part of the
5901 // base regs.
5902 assert((F.Scale == 0 || F.Scale == 1) &&
5903 "ICmp does not support folding a global value and "
5904 "a scale at the same time!");
5905 // TODO: Avoid implicit trunc?
5906 // See https://github.com/llvm/llvm-project/issues/112510.
5908 -(uint64_t)Offset.getFixedValue(),
5909 /*ImplicitTrunc=*/true);
5910 if (C->getType() != OpTy) {
5912 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5913 CI->getDataLayout());
5914 assert(C && "Cast of ConstantInt should have folded");
5915 }
5916
5917 CI->setOperand(1, C);
5918 }
5919 }
5920
5921 return FullV;
5922}
5923
5924/// Helper for Rewrite. PHI nodes are special because the use of their operands
5925/// effectively happens in their predecessor blocks, so the expression may need
5926/// to be expanded in multiple places.
///
/// For every incoming value of PN equal to LF.OperandValToReplace, expands F at
/// the end of the corresponding incoming block (splitting a critical edge first
/// where the conditions below allow it) and stores the result as the new
/// incoming value. One expansion is shared per incoming block.
/// NOTE(review): the embedded listing numbers skip 5942, 5943 and 5956, so the
/// rest of the critical-edge condition and the declaration of NewBBs appear to
/// be missing from this copy -- confirm against the upstream file.
5927void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5928 const LSRFixup &LF, const Formula &F,
5929 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
// Expansion already produced for each incoming block, so that repeated entries
// for the same block reuse one value.
5930 DenseMap<BasicBlock *, Value *> Inserted;
5931
5932 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5933 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5934 bool needUpdateFixups = false;
5935 BasicBlock *BB = PN->getIncomingBlock(i);
5936
5937 // If this is a critical edge, split the edge so that we do not insert
5938 // the code on all predecessor/successor paths. We do this unless this
5939 // is the canonical backedge for this loop, which complicates post-inc
5940 // users.
5941 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5944 BasicBlock *Parent = PN->getParent();
5945 Loop *PNLoop = LI.getLoopFor(Parent);
// Only split when the PHI's block is not the header of its loop.
5946 if (!PNLoop || Parent != PNLoop->getHeader()) {
5947 // Split the critical edge.
5948 BasicBlock *NewBB = nullptr;
5949 if (!Parent->isLandingPad()) {
5950 NewBB =
5951 SplitCriticalEdge(BB, Parent,
5952 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5953 .setMergeIdenticalEdges()
5954 .setKeepOneInputPHIs());
5955 } else {
// Landing pads are split with the dedicated helper; the first block it
// produces is used as the new predecessor.
5957 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5958 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5959 NewBB = NewBBs[0];
5960 }
5961 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5962 // phi predecessors are identical. The simple thing to do is skip
5963 // splitting in this case rather than complicate the API.
5964 if (NewBB) {
5965 // If PN is outside of the loop and BB is in the loop, we want to
5966 // move the block to be immediately before the PHI block, not
5967 // immediately after BB.
5968 if (L->contains(BB) && !L->contains(PN))
5969 NewBB->moveBefore(PN->getParent());
5970
5971 // Splitting the edge can reduce the number of PHI entries we have.
// Refresh the loop bound and continue from the entry for the new block.
5972 e = PN->getNumIncomingValues();
5973 BB = NewBB;
5974 i = PN->getBasicBlockIndex(BB);
5975
5976 needUpdateFixups = true;
5977 }
5978 }
5979 }
5980
// Reuse an earlier expansion for this block if there is one; otherwise expand
// now and remember the result.
5981 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5982 Inserted.try_emplace(BB);
5983 if (!Pair.second)
5984 PN->setIncomingValue(i, Pair.first->second);
5985 else {
5986 Value *FullV =
5987 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5988
5989 // If this is reuse-by-noop-cast, insert the noop cast.
5990 Type *OpTy = LF.OperandValToReplace->getType();
5991 if (FullV->getType() != OpTy)
5992 FullV = CastInst::Create(
5993 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5994 LF.OperandValToReplace->getType(), "tmp",
5995 BB->getTerminator()->getIterator());
5996
5997 // If the incoming block for this value is not in the loop, it means the
5998 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5999 // the inserted value.
6000 if (auto *I = dyn_cast<Instruction>(FullV))
6001 if (L->contains(I) && !L->contains(BB))
6002 InsertedNonLCSSAInsts.insert(I);
6003
6004 PN->setIncomingValue(i, FullV);
6005 Pair.first->second = FullV;
6006 }
6007
6008 // If LSR splits critical edge and phi node has other pending
6009 // fixup operands, we need to update those pending fixups. Otherwise
6010 // formulae will not be implemented completely and some instructions
6011 // will not be eliminated.
6012 if (needUpdateFixups) {
6013 for (LSRUse &LU : Uses)
6014 for (LSRFixup &Fixup : LU.Fixups)
6015 // If fixup is supposed to rewrite some operand in the phi
6016 // that was just updated, it may be already moved to
6017 // another phi node. Such fixup requires update.
6018 if (Fixup.UserInst == PN) {
6019 // Check if the operand we try to replace still exists in the
6020 // original phi.
6021 bool foundInOriginalPHI = false;
6022 for (const auto &val : PN->incoming_values())
6023 if (val == Fixup.OperandValToReplace) {
6024 foundInOriginalPHI = true;
6025 break;
6026 }
6027
6028 // If fixup operand found in original PHI - nothing to do.
6029 if (foundInOriginalPHI)
6030 continue;
6031
6032 // Otherwise it might be moved to another PHI and requires update.
6033 // If fixup operand not found in any of the incoming blocks that
6034 // means we have already rewritten it - nothing to do.
// Scan the PHIs at the start of each incoming block of PN and retarget the
// fixup to a PHI that has the operand as an incoming value.
6035 for (const auto &Block : PN->blocks())
6036 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
6037 ++I) {
6038 PHINode *NewPN = cast<PHINode>(I);
6039 for (const auto &val : NewPN->incoming_values())
6040 if (val == Fixup.OperandValToReplace)
6041 Fixup.UserInst = NewPN;
6042 }
6043 }
6044 }
6045 }
6046}
6047
6048/// Emit instructions for the leading candidate expression for this LSRUse (this
6049/// is called "expanding"), and update the UserInst to reference the newly
6050/// expanded value.
6051void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6052 const Formula &F,
6053 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6054 // First, find an insertion point that dominates UserInst. For PHI nodes,
6055 // find the nearest block which dominates all the relevant uses.
6056 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6057 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6058 } else {
6059 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6060
6061 // If this is reuse-by-noop-cast, insert the noop cast.
6062 Type *OpTy = LF.OperandValToReplace->getType();
6063 if (FullV->getType() != OpTy) {
6064 Instruction *Cast =
6065 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6066 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6067 FullV = Cast;
6068 }
6069
6070 // Update the user. ICmpZero is handled specially here (for now) because
6071 // Expand may have updated one of the operands of the icmp already, and
6072 // its new value may happen to be equal to LF.OperandValToReplace, in
6073 // which case doing replaceUsesOfWith leads to replacing both operands
6074 // with the same value. TODO: Reorganize this.
6075 if (LU.Kind == LSRUse::ICmpZero)
6076 LF.UserInst->setOperand(0, FullV);
6077 else
6078 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6079 }
6080
6081 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6082 DeadInsts.emplace_back(OperandIsInstr);
6083}
6084
6085// Determine where to insert the transformed IV increment instruction for this
6086// fixup. By default this is the default insert position, but if this is a
6087// postincrement opportunity then we try to insert it in the same block as the
6088// fixup user instruction, as this is needed for a postincrement instruction to
6089// be generated.
6091 const LSRFixup &Fixup, const LSRUse &LU,
6092 Instruction *IVIncInsertPos,
6093 DominatorTree &DT) {
6094 // Only address uses can be postincremented
6095 if (LU.Kind != LSRUse::Address)
6096 return IVIncInsertPos;
6097
6098 // Don't try to postincrement if it's not legal
6099 Instruction *I = Fixup.UserInst;
6100 Type *Ty = I->getType();
6101 if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
6102 !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
6103 return IVIncInsertPos;
6104
6105 // It's only legal to hoist to the user block if it dominates the default
6106 // insert position.
6107 BasicBlock *HoistBlock = I->getParent();
6108 BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
6109 if (!DT.dominates(I, IVIncBlock))
6110 return IVIncInsertPos;
6111
6112 return HoistBlock->getTerminator();
6113}
6114
6115/// Rewrite all the fixup locations with new values, following the chosen
6116/// solution.
6117void LSRInstance::ImplementSolution(
6118 const SmallVectorImpl<const Formula *> &Solution) {
6119 // Keep track of instructions we may have made dead, so that
6120 // we can remove them after we are done working.
6122
6123 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6124 for (const IVChain &Chain : IVChainVec) {
6125 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6126 Rewriter.setChainedPhi(PN);
6127 }
6128
6129 // Expand the new value definitions and update the users.
6130 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6131 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6132 Instruction *InsertPos =
6133 getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
6134 Rewriter.setIVIncInsertPos(L, InsertPos);
6135 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6136 Changed = true;
6137 }
6138
6139 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6140 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6141
6142 for (const IVChain &Chain : IVChainVec) {
6143 GenerateIVChain(Chain, DeadInsts);
6144 Changed = true;
6145 }
6146
6147 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6148 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6149 ScalarEvolutionIVs.push_back(IV);
6150
6151 // Clean up after ourselves. This must be done before deleting any
6152 // instructions.
6153 Rewriter.clear();
6154
6156 &TLI, MSSAU);
6157
6158 // In our cost analysis above, we assume that each addrec consumes exactly
6159 // one register, and arrange to have increments inserted just before the
6160 // latch to maximimize the chance this is true. However, if we reused
6161 // existing IVs, we now need to move the increments to match our
6162 // expectations. Otherwise, our cost modeling results in us having a
6163 // chosen a non-optimal result for the actual schedule. (And yes, this
6164 // scheduling decision does impact later codegen.)
6165 for (PHINode &PN : L->getHeader()->phis()) {
6166 BinaryOperator *BO = nullptr;
6167 Value *Start = nullptr, *Step = nullptr;
6168 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6169 continue;
6170
6171 switch (BO->getOpcode()) {
6172 case Instruction::Sub:
6173 if (BO->getOperand(0) != &PN)
6174 // sub is non-commutative - match handling elsewhere in LSR
6175 continue;
6176 break;
6177 case Instruction::Add:
6178 break;
6179 default:
6180 continue;
6181 };
6182
6183 if (!isa<Constant>(Step))
6184 // If not a constant step, might increase register pressure
6185 // (We assume constants have been canonicalized to RHS)
6186 continue;
6187
6188 if (BO->getParent() == IVIncInsertPos->getParent())
6189 // Only bother moving across blocks. Isel can handle block local case.
6190 continue;
6191
6192 // Can we legally schedule inc at the desired point?
6193 if (!llvm::all_of(BO->uses(),
6194 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6195 continue;
6196 BO->moveBefore(IVIncInsertPos->getIterator());
6197 Changed = true;
6198 }
6199
6200
6201}
6202
6203LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6204 DominatorTree &DT, LoopInfo &LI,
6205 const TargetTransformInfo &TTI, AssumptionCache &AC,
6206 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6207 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6208 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6210 : TTI.getPreferredAddressingMode(L, &SE)),
6211 Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
6212 // If LoopSimplify form is not available, stay out of trouble.
6213 if (!L->isLoopSimplifyForm())
6214 return;
6215
6216 // If there's no interesting work to be done, bail early.
6217 if (IU.empty()) return;
6218
6219 // If there's too much analysis to be done, bail early. We won't be able to
6220 // model the problem anyway.
6221 unsigned NumUsers = 0;
6222 for (const IVStrideUse &U : IU) {
6223 if (++NumUsers > MaxIVUsers) {
6224 (void)U;
6225 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6226 << "\n");
6227 return;
6228 }
6229 // Bail out if we have a PHI on an EHPad that gets a value from a
6230 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6231 // no good place to stick any instructions.
6232 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6233 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6234 if (isa<FuncletPadInst>(FirstNonPHI) ||
6235 isa<CatchSwitchInst>(FirstNonPHI))
6236 for (BasicBlock *PredBB : PN->blocks())
6237 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6238 return;
6239 }
6240 }
6241
6242 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6243 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6244 dbgs() << ":\n");
6245
6246 // Check if we expect this loop to use a hardware loop instruction, which will
6247 // be used when calculating the costs of formulas.
6248 HardwareLoopInfo HWLoopInfo(L);
6249 HardwareLoopProfitable =
6250 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6251
6252 // Configure SCEVExpander already now, so the correct mode is used for
6253 // isSafeToExpand() checks.
6254#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6255 Rewriter.setDebugType(DEBUG_TYPE);
6256#endif
6257 Rewriter.disableCanonicalMode();
6258 Rewriter.enableLSRMode();
6259
6260 // First, perform some low-level loop optimizations.
6261 OptimizeShadowIV();
6262 OptimizeLoopTermCond();
6263
6264 // If loop preparation eliminates all interesting IV users, bail.
6265 if (IU.empty()) return;
6266
6267 // Skip nested loops until we can model them better with formulae.
6268 if (!L->isInnermost()) {
6269 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6270 return;
6271 }
6272
6273 // Start collecting data and preparing for the solver.
6274 // If number of registers is not the major cost, we cannot benefit from the
6275 // current profitable chain optimization which is based on number of
6276 // registers.
6277 // FIXME: add profitable chain optimization for other kinds major cost, for
6278 // example number of instructions.
6279 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6280 CollectChains();
6281 CollectInterestingTypesAndFactors();
6282 CollectFixupsAndInitialFormulae();
6283 CollectLoopInvariantFixupsAndFormulae();
6284
6285 if (Uses.empty())
6286 return;
6287
6288 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6289 print_uses(dbgs()));
6290 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6291 BaselineCost.print(dbgs()); dbgs() << "\n");
6292
6293 // Now use the reuse data to generate a bunch of interesting ways
6294 // to formulate the values needed for the uses.
6295 GenerateAllReuseFormulae();
6296
6297 FilterOutUndesirableDedicatedRegisters();
6298 NarrowSearchSpaceUsingHeuristics();
6299
6301 Solve(Solution);
6302
6303 // Release memory that is no longer needed.
6304 Factors.clear();
6305 Types.clear();
6306 RegUses.clear();
6307
6308 if (Solution.empty())
6309 return;
6310
6311#ifndef NDEBUG
6312 // Formulae should be legal.
6313 for (const LSRUse &LU : Uses) {
6314 for (const Formula &F : LU.Formulae)
6315 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6316 F) && "Illegal formula generated!");
6317 };
6318#endif
6319
6320 // Now that we've decided what we want, make it so.
6321 ImplementSolution(Solution);
6322}
6323
6324#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6325void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6326 if (Factors.empty() && Types.empty()) return;
6327
6328 OS << "LSR has identified the following interesting factors and types: ";
6329 ListSeparator LS;
6330
6331 for (int64_t Factor : Factors)
6332 OS << LS << '*' << Factor;
6333
6334 for (Type *Ty : Types)
6335 OS << LS << '(' << *Ty << ')';
6336 OS << '\n';
6337}
6338
6339void LSRInstance::print_fixups(raw_ostream &OS) const {
6340 OS << "LSR is examining the following fixup sites:\n";
6341 for (const LSRUse &LU : Uses)
6342 for (const LSRFixup &LF : LU.Fixups) {
6343 dbgs() << " ";
6344 LF.print(OS);
6345 OS << '\n';
6346 }
6347}
6348
6349void LSRInstance::print_uses(raw_ostream &OS) const {
6350 OS << "LSR is examining the following uses:\n";
6351 for (const LSRUse &LU : Uses) {
6352 dbgs() << " ";
6353 LU.print(OS);
6354 OS << '\n';
6355 for (const Formula &F : LU.Formulae) {
6356 OS << " ";
6357 F.print(OS);
6358 OS << '\n';
6359 }
6360 }
6361}
6362
6363void LSRInstance::print(raw_ostream &OS) const {
6364 print_factors_and_types(OS);
6365 print_fixups(OS);
6366 print_uses(OS);
6367}
6368
6369LLVM_DUMP_METHOD void LSRInstance::dump() const {
6370 print(errs()); errs() << '\n';
6371}
6372#endif
6373
6374namespace {
6375
6376class LoopStrengthReduce : public LoopPass {
6377public:
6378 static char ID; // Pass ID, replacement for typeid
6379
6380 LoopStrengthReduce();
6381
6382private:
6383 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6384 void getAnalysisUsage(AnalysisUsage &AU) const override;
6385};
6386
6387} // end anonymous namespace
6388
// Construct the pass, forwarding the static pass ID to the LoopPass base.
6389LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
// NOTE(review): listing line 6390 is absent from this text, so the body shown
// here may be incomplete -- confirm against the upstream file.
6391}
6392
6393void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6394 // We split critical edges, so we change the CFG. However, we do update
6395 // many analyses if they are around.
6397
6398 AU.addRequired<LoopInfoWrapperPass>();
6399 AU.addPreserved<LoopInfoWrapperPass>();
6401 AU.addRequired<DominatorTreeWrapperPass>();
6402 AU.addPreserved<DominatorTreeWrapperPass>();
6403 AU.addRequired<ScalarEvolutionWrapperPass>();
6404 AU.addPreserved<ScalarEvolutionWrapperPass>();
6405 AU.addRequired<AssumptionCacheTracker>();
6406 AU.addRequired<TargetLibraryInfoWrapperPass>();
6407 // Requiring LoopSimplify a second time here prevents IVUsers from running
6408 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6410 AU.addRequired<IVUsersWrapperPass>();
6411 AU.addPreserved<IVUsersWrapperPass>();
6412 AU.addRequired<TargetTransformInfoWrapperPass>();
6413 AU.addPreserved<MemorySSAWrapperPass>();
6414}
6415
6416namespace {
6417
6418/// Enables more convenient iteration over a DWARF expression vector.
6420ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6421 llvm::DIExpression::expr_op_iterator Begin =
6422 llvm::DIExpression::expr_op_iterator(Expr.begin());
6423 llvm::DIExpression::expr_op_iterator End =
6424 llvm::DIExpression::expr_op_iterator(Expr.end());
6425 return {Begin, End};
6426}
6427
6428struct SCEVDbgValueBuilder {
6429 SCEVDbgValueBuilder() = default;
6430 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6431
6432 void clone(const SCEVDbgValueBuilder &Base) {
6433 LocationOps = Base.LocationOps;
6434 Expr = Base.Expr;
6435 }
6436
6437 void clear() {
6438 LocationOps.clear();
6439 Expr.clear();
6440 }
6441
6442 /// The DIExpression as we translate the SCEV.
6444 /// The location ops of the DIExpression.
6445 SmallVector<Value *, 2> LocationOps;
6446
6447 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6448 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6449
6450 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6451 /// in the set of values referenced by the expression.
6452 void pushLocation(llvm::Value *V) {
6454 auto *It = llvm::find(LocationOps, V);
6455 unsigned ArgIndex = 0;
6456 if (It != LocationOps.end()) {
6457 ArgIndex = std::distance(LocationOps.begin(), It);
6458 } else {
6459 ArgIndex = LocationOps.size();
6460 LocationOps.push_back(V);
6461 }
6462 Expr.push_back(ArgIndex);
6463 }
6464
6465 void pushValue(const SCEVUnknown *U) {
6466 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6467 pushLocation(V);
6468 }
6469
6470 bool pushConst(const SCEVConstant *C) {
6471 if (C->getAPInt().getSignificantBits() > 64)
6472 return false;
6473 Expr.push_back(llvm::dwarf::DW_OP_consts);
6474 Expr.push_back(C->getAPInt().getSExtValue());
6475 return true;
6476 }
6477
6478 // Iterating the expression as DWARF ops is convenient when updating
6479 // DWARF_OP_LLVM_args.
6481 return ToDwarfOpIter(Expr);
6482 }
6483
6484 /// Several SCEV types are sequences of the same arithmetic operator applied
6485 /// to constants and values that may be extended or truncated.
6486 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6487 uint64_t DwarfOp) {
6488 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6489 "Expected arithmetic SCEV type");
6490 bool Success = true;
6491 unsigned EmitOperator = 0;
6492 for (const auto &Op : CommExpr->operands()) {
6493 Success &= pushSCEV(Op);
6494
6495 if (EmitOperator >= 1)
6496 pushOperator(DwarfOp);
6497 ++EmitOperator;
6498 }
6499 return Success;
6500 }
6501
6502 // TODO: Identify and omit noop casts.
6503 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6504 const llvm::SCEV *Inner = C->getOperand(0);
6505 const llvm::Type *Type = C->getType();
6506 uint64_t ToWidth = Type->getIntegerBitWidth();
6507 bool Success = pushSCEV(Inner);
6508 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6509 IsSigned ? llvm::dwarf::DW_ATE_signed
6510 : llvm::dwarf::DW_ATE_unsigned};
6511 for (const auto &Op : CastOps)
6512 pushOperator(Op);
6513 return Success;
6514 }
6515
6516 // TODO: MinMax - although these haven't been encountered in the test suite.
6517 bool pushSCEV(const llvm::SCEV *S) {
6518 bool Success = true;
6519 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6520 Success &= pushConst(StartInt);
6521
6522 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6523 if (!U->getValue())
6524 return false;
6525 pushLocation(U->getValue());
6526
6527 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6528 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6529
6530 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6531 Success &= pushSCEV(UDiv->getLHS());
6532 Success &= pushSCEV(UDiv->getRHS());
6533 pushOperator(llvm::dwarf::DW_OP_div);
6534
6535 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6536 // Assert if a new and unknown SCEVCastEXpr type is encountered.
6539 isa<SCEVSignExtendExpr>(Cast)) &&
6540 "Unexpected cast type in SCEV.");
6541 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6542
6543 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6544 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6545
6546 } else if (isa<SCEVAddRecExpr>(S)) {
6547 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6548 // unsupported.
6549 return false;
6550
6551 } else {
6552 return false;
6553 }
6554 return Success;
6555 }
6556
6557 /// Return true if the combination of arithmetic operator and underlying
6558 /// SCEV constant value is an identity function.
6559 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6560 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6561 if (C->getAPInt().getSignificantBits() > 64)
6562 return false;
6563 int64_t I = C->getAPInt().getSExtValue();
6564 switch (Op) {
6565 case llvm::dwarf::DW_OP_plus:
6566 case llvm::dwarf::DW_OP_minus:
6567 return I == 0;
6568 case llvm::dwarf::DW_OP_mul:
6569 case llvm::dwarf::DW_OP_div:
6570 return I == 1;
6571 }
6572 }
6573 return false;
6574 }
6575
6576 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6577 /// builder's expression stack. The stack should already contain an
6578 /// expression for the iteration count, so that it can be multiplied by
6579 /// the stride and added to the start.
6580 /// Components of the expression are omitted if they are an identity function.
6581 /// Chain (non-affine) SCEVs are not supported.
6582 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6583 assert(SAR.isAffine() && "Expected affine SCEV");
6584 const SCEV *Start = SAR.getStart();
6585 const SCEV *Stride = SAR.getStepRecurrence(SE);
6586
6587 // Skip pushing arithmetic noops.
6588 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6589 if (!pushSCEV(Stride))
6590 return false;
6591 pushOperator(llvm::dwarf::DW_OP_mul);
6592 }
6593 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6594 if (!pushSCEV(Start))
6595 return false;
6596 pushOperator(llvm::dwarf::DW_OP_plus);
6597 }
6598 return true;
6599 }
6600
6601 /// Create an expression that is an offset from a value (usually the IV).
6602 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6603 pushLocation(OffsetValue);
6605 LLVM_DEBUG(
6606 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6607 << std::to_string(Offset) << "\n");
6608 }
6609
6610 /// Combine a translation of the SCEV and the IV to create an expression that
6611 /// recovers a location's value.
6612 /// returns true if an expression was created.
6613 bool createIterCountExpr(const SCEV *S,
6614 const SCEVDbgValueBuilder &IterationCount,
6615 ScalarEvolution &SE) {
6616 // SCEVs for SSA values are most frquently of the form
6617 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6618 // This is because %a is a PHI node that is not the IV. However, these
6619 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6620 // so its not expected this point will be reached.
6621 if (!isa<SCEVAddRecExpr>(S))
6622 return false;
6623
6624 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6625 << '\n');
6626
6627 const auto *Rec = cast<SCEVAddRecExpr>(S);
6628 if (!Rec->isAffine())
6629 return false;
6630
6632 return false;
6633
6634 // Initialise a new builder with the iteration count expression. In
6635 // combination with the value's SCEV this enables recovery.
6636 clone(IterationCount);
6637 if (!SCEVToValueExpr(*Rec, SE))
6638 return false;
6639
6640 return true;
6641 }
6642
6643 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6644 /// builder's expression stack. The stack should already contain an
6645 /// expression for the iteration count, so that it can be multiplied by
6646 /// the stride and added to the start.
6647 /// Components of the expression are omitted if they are an identity function.
6648 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6649 ScalarEvolution &SE) {
6650 assert(SAR.isAffine() && "Expected affine SCEV");
6651 const SCEV *Start = SAR.getStart();
6652 const SCEV *Stride = SAR.getStepRecurrence(SE);
6653
6654 // Skip pushing arithmetic noops.
6655 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6656 if (!pushSCEV(Start))
6657 return false;
6658 pushOperator(llvm::dwarf::DW_OP_minus);
6659 }
6660 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6661 if (!pushSCEV(Stride))
6662 return false;
6663 pushOperator(llvm::dwarf::DW_OP_div);
6664 }
6665 return true;
6666 }
6667
6668 // Append the current expression and locations to a location list and an
6669 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6670 // the locations already present in the destination list.
6671 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6672 SmallVectorImpl<Value *> &DestLocations) {
6673 assert(!DestLocations.empty() &&
6674 "Expected the locations vector to contain the IV");
6675 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6676 // modified to account for the locations already in the destination vector.
6677 // All builders contain the IV as the first location op.
6678 assert(!LocationOps.empty() &&
6679 "Expected the location ops to contain the IV.");
6680 // DestIndexMap[n] contains the index in DestLocations for the nth
6681 // location in this SCEVDbgValueBuilder.
6682 SmallVector<uint64_t, 2> DestIndexMap;
6683 for (const auto &Op : LocationOps) {
6684 auto It = find(DestLocations, Op);
6685 if (It != DestLocations.end()) {
6686 // Location already exists in DestLocations, reuse existing ArgIndex.
6687 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6688 continue;
6689 }
6690 // Location is not in DestLocations, add it.
6691 DestIndexMap.push_back(DestLocations.size());
6692 DestLocations.push_back(Op);
6693 }
6694
6695 for (const auto &Op : expr_ops()) {
6696 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6697 Op.appendToVector(DestExpr);
6698 continue;
6699 }
6700
6702 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6703 // DestIndexMap[n] contains its new index in DestLocations.
6704 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6705 DestExpr.push_back(NewIndex);
6706 }
6707 }
6708};
6709
6710/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6711/// and DIExpression.
6712struct DVIRecoveryRec {
6713 DVIRecoveryRec(DbgVariableRecord *DVR)
6714 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6715
6716 DbgVariableRecord *DbgRef;
6717 DIExpression *Expr;
6718 bool HadLocationArgList;
6719 SmallVector<WeakVH, 2> LocationOps;
6722
6723 void clear() {
6724 for (auto &RE : RecoveryExprs)
6725 RE.reset();
6726 RecoveryExprs.clear();
6727 }
6728
6729 ~DVIRecoveryRec() { clear(); }
6730};
6731} // namespace
6732
6733/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6734/// This helps in determining if a DIArglist is necessary or can be omitted from
6735/// the dbg.value.
6737 auto expr_ops = ToDwarfOpIter(Expr);
6738 unsigned Count = 0;
6739 for (auto Op : expr_ops)
6740 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6741 Count++;
6742 return Count;
6743}
6744
6745/// Overwrites DVI with the location and Ops as the DIExpression. This will
6746/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6747/// because a DIArglist is not created for the first argument of the dbg.value.
6748template <typename T>
6749static void updateDVIWithLocation(T &DbgVal, Value *Location,
6751 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6752 "contain any DW_OP_llvm_arg operands.");
6753 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6754 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6755 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6756}
6757
6758/// Overwrite DVI with locations placed into a DIArglist.
6759template <typename T>
6760static void updateDVIWithLocations(T &DbgVal,
6761 SmallVectorImpl<Value *> &Locations,
6763 assert(numLLVMArgOps(Ops) != 0 &&
6764 "Expected expression that references DIArglist locations using "
6765 "DW_OP_llvm_arg operands.");
6767 for (Value *V : Locations)
6768 MetadataLocs.push_back(ValueAsMetadata::get(V));
6769 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6770 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6771 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6772}
6773
6774/// Write the new expression and new location ops for the dbg.value. If possible
6775/// reduce the szie of the dbg.value by omitting DIArglist. This
6776/// can be omitted if:
6777/// 1. There is only a single location, refenced by a single DW_OP_llvm_arg.
6778/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6779static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6780 SmallVectorImpl<Value *> &NewLocationOps,
6782 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6783 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6784 if (NumLLVMArgs == 0) {
6785 // Location assumed to be on the stack.
6786 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6787 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6788 // There is only a single DW_OP_llvm_arg at the start of the expression,
6789 // so it can be omitted along with DIArglist.
6790 assert(NewExpr[1] == 0 &&
6791 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6793 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6794 } else {
6795 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6796 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6797 }
6798
6799 // If the DIExpression was previously empty then add the stack terminator.
6800 // Non-empty expressions have only had elements inserted into them and so
6801 // the terminator should already be present e.g. stack_value or fragment.
6802 DIExpression *SalvageExpr = DbgVal->getExpression();
6803 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6804 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6805 DbgVal->setExpression(SalvageExpr);
6806 }
6807}
6808
6809/// Cached location ops may be erased during LSR, in which case a poison is
6810/// required when restoring from the cache. The type of that location is no
6811/// longer available, so just use int8. The poison will be replaced by one or
6812/// more locations later when a SCEVDbgValueBuilder selects alternative
6813/// locations to use for the salvage.
6815 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6816}
6817
6818/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6819static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6820 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6821 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6822 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6823 assert(DVIRec.Expr && "Expected an expression");
6824 DbgVal->setExpression(DVIRec.Expr);
6825
6826 // Even a single location-op may be inside a DIArgList and referenced with
6827 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6828 if (!DVIRec.HadLocationArgList) {
6829 assert(DVIRec.LocationOps.size() == 1 &&
6830 "Unexpected number of location ops.");
6831 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6832 // this case was not present before, so force the location back to a
6833 // single uncontained Value.
6834 Value *CachedValue =
6835 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6836 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6837 } else {
6839 for (WeakVH VH : DVIRec.LocationOps) {
6840 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6841 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6842 }
6843 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6844 DbgVal->setRawLocation(
6845 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6846 }
6847 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6848}
6849
6851 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6852 const SCEV *SCEVInductionVar,
6853 SCEVDbgValueBuilder IterCountExpr) {
6854
6855 if (!DVIRec.DbgRef->isKillLocation())
6856 return false;
6857
6858 // LSR may have caused several changes to the dbg.value in the failed salvage
6859 // attempt. So restore the DIExpression, the location ops and also the
6860 // location ops format, which is always DIArglist for multiple ops, but only
6861 // sometimes for a single op.
6863
6864 // LocationOpIndexMap[i] will store the post-LSR location index of
6865 // the non-optimised out location at pre-LSR index i.
6866 SmallVector<int64_t, 2> LocationOpIndexMap;
6867 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6868 SmallVector<Value *, 2> NewLocationOps;
6869 NewLocationOps.push_back(LSRInductionVar);
6870
6871 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6872 WeakVH VH = DVIRec.LocationOps[i];
6873 // Place the locations not optimised out in the list first, avoiding
6874 // inserts later. The map is used to update the DIExpression's
6875 // DW_OP_LLVM_arg arguments as the expression is updated.
6876 if (VH && !isa<UndefValue>(VH)) {
6877 NewLocationOps.push_back(VH);
6878 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6879 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6880 << " now at index " << LocationOpIndexMap[i] << "\n");
6881 continue;
6882 }
6883
6884 // It's possible that a value referred to in the SCEV may have been
6885 // optimised out by LSR.
6886 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6887 SE.containsUndefs(DVIRec.SCEVs[i])) {
6888 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6889 << " refers to a location that is now undef or erased. "
6890 "Salvage abandoned.\n");
6891 return false;
6892 }
6893
6894 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6895 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6896
6897 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6898 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6899
6900 // Create an offset-based salvage expression if possible, as it requires
6901 // less DWARF ops than an iteration count-based expression.
6902 if (std::optional<APInt> Offset =
6903 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6904 if (Offset->getSignificantBits() <= 64)
6905 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6906 else
6907 return false;
6908 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6909 SE))
6910 return false;
6911 }
6912
6913 // Merge the DbgValueBuilder generated expressions and the original
6914 // DIExpression, place the result into an new vector.
6916 if (DVIRec.Expr->getNumElements() == 0) {
6917 assert(DVIRec.RecoveryExprs.size() == 1 &&
6918 "Expected only a single recovery expression for an empty "
6919 "DIExpression.");
6920 assert(DVIRec.RecoveryExprs[0] &&
6921 "Expected a SCEVDbgSalvageBuilder for location 0");
6922 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6923 B->appendToVectors(NewExpr, NewLocationOps);
6924 }
6925 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6926 // Most Ops needn't be updated.
6927 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6928 Op.appendToVector(NewExpr);
6929 continue;
6930 }
6931
6932 uint64_t LocationArgIndex = Op.getArg(0);
6933 SCEVDbgValueBuilder *DbgBuilder =
6934 DVIRec.RecoveryExprs[LocationArgIndex].get();
6935 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6936 // optimise it away. So just translate the argument to the updated
6937 // location index.
6938 if (!DbgBuilder) {
6939 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6940 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6941 "Expected a positive index for the location-op position.");
6942 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6943 continue;
6944 }
6945 // The location has a recovery expression.
6946 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6947 }
6948
6949 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
6950 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
6951 return true;
6952}
6953
6954/// Obtain an expression for the iteration count, then attempt to salvage the
6955/// dbg.value intrinsics.
6957 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6958 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6959 if (DVIToUpdate.empty())
6960 return;
6961
6962 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6963 assert(SCEVInductionVar &&
6964 "Anticipated a SCEV for the post-LSR induction variable");
6965
6966 if (const SCEVAddRecExpr *IVAddRec =
6967 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6968 if (!IVAddRec->isAffine())
6969 return;
6970
6971 // Prevent translation using excessive resources.
6972 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6973 return;
6974
6975 // The iteration count is required to recover location values.
6976 SCEVDbgValueBuilder IterCountExpr;
6977 IterCountExpr.pushLocation(LSRInductionVar);
6978 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6979 return;
6980
6981 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6982 << '\n');
6983
6984 for (auto &DVIRec : DVIToUpdate) {
6985 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6986 IterCountExpr);
6987 }
6988 }
6989}
6990
6991/// Identify and cache salvageable DVI locations and expressions along with the
6992/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6993/// cacheing and salvaging.
6995 Loop *L, ScalarEvolution &SE,
6996 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6997 for (const auto &B : L->getBlocks()) {
6998 for (auto &I : *B) {
6999 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
7000 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
7001 continue;
7002
7003 // Ensure that if any location op is undef that the dbg.vlue is not
7004 // cached.
7005 if (DbgVal.isKillLocation())
7006 continue;
7007
7008 // Check that the location op SCEVs are suitable for translation to
7009 // DIExpression.
7010 const auto &HasTranslatableLocationOps =
7011 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
7012 for (const auto LocOp : DbgValToTranslate.location_ops()) {
7013 if (!LocOp)
7014 return false;
7015
7016 if (!SE.isSCEVable(LocOp->getType()))
7017 return false;
7018
7019 const SCEV *S = SE.getSCEV(LocOp);
7020 if (SE.containsUndefs(S))
7021 return false;
7022 }
7023 return true;
7024 };
7025
7026 if (!HasTranslatableLocationOps(DbgVal))
7027 continue;
7028
7029 std::unique_ptr<DVIRecoveryRec> NewRec =
7030 std::make_unique<DVIRecoveryRec>(&DbgVal);
7031 // Each location Op may need a SCEVDbgValueBuilder in order to recover
7032 // it. Pre-allocating a vector will enable quick lookups of the builder
7033 // later during the salvage.
7034 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
7035 for (const auto LocOp : DbgVal.location_ops()) {
7036 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7037 NewRec->LocationOps.push_back(LocOp);
7038 NewRec->HadLocationArgList = DbgVal.hasArgList();
7039 }
7040 SalvageableDVISCEVs.push_back(std::move(NewRec));
7041 }
7042 }
7043 }
7044}
7045
7046/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7047/// any PHi from the loop header is usable, but may have less chance of
7048/// surviving subsequent transforms.
7050 const LSRInstance &LSR) {
7051
7052 auto IsSuitableIV = [&](PHINode *P) {
7053 if (!SE.isSCEVable(P->getType()))
7054 return false;
7055 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7056 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7057 return false;
7058 };
7059
7060 // For now, just pick the first IV that was generated and inserted by
7061 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7062 // by subsequent transforms.
7063 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7064 if (!IV)
7065 continue;
7066
7067 // There should only be PHI node IVs.
7068 PHINode *P = cast<PHINode>(&*IV);
7069
7070 if (IsSuitableIV(P))
7071 return P;
7072 }
7073
7074 for (PHINode &P : L.getHeader()->phis()) {
7075 if (IsSuitableIV(&P))
7076 return &P;
7077 }
7078 return nullptr;
7079}
7080
7082 DominatorTree &DT, LoopInfo &LI,
7083 const TargetTransformInfo &TTI,
7085 MemorySSA *MSSA) {
// Runs the main LSR transformation on L, performs follow-up clean-up (dead
// PHIs, congruent IVs, loop exit values), and finally tries to salvage the
// debug records that were cached before the transformation. The returned flag
// is true when any of those steps reported a change.
// NOTE(review): this listing does not show every line of the function (the
// start of the signature and, e.g., the declaration of DeadInsts used below
// are absent) - consult the full source before editing.
7086
7087 // Debug preservation - before we start removing anything identify which DVI
7088 // meet the salvageable criteria and store their DIExpression and SCEVs.
7089 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7090 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7091
7092 bool Changed = false;
// An updater is created only when MemorySSA was supplied; otherwise MSSAU
// stays empty and MSSAU.get() hands nullptr to the calls below.
7093 std::unique_ptr<MemorySSAUpdater> MSSAU;
7094 if (MSSA)
7095 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7096
7097 // Run the main LSR transformation.
// Binding the temporary to a const reference extends its lifetime to the end
// of this function; it is consulted again by GetInductionVariable below.
7098 const LSRInstance &Reducer =
7099 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7100 Changed |= Reducer.getChanged();
7101
7102 // Remove any extra phis created by processing inner loops.
7103 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
// Optional step, gated on the EnablePhiElim option and on the loop being in
// loop-simplify form: replace congruent IVs, then clean up if any were folded.
7104 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7106 SCEVExpander Rewriter(SE, "lsr", false);
7107#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7108 Rewriter.setDebugType(DEBUG_TYPE);
7109#endif
7110 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7111 Rewriter.clear();
7112 if (numFolded) {
7113 Changed = true;
7115 MSSAU.get());
7116 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7117 }
7118 }
7119 // LSR may at times remove all uses of an induction variable from a loop.
7120 // The only remaining use is the PHI in the exit block.
7121 // When this is the case, if the exit value of the IV can be calculated using
7122 // SCEV, we can replace the exit block PHI with the final value of the IV and
7123 // skip the updates in each loop iteration.
// Attempted only when the loop is in recursive LCSSA form and getExitBlock()
// returns a block.
7124 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7126 SCEVExpander Rewriter(SE, "lsr", true);
7127 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7128 UnusedIndVarInLoop, DeadInsts);
7129 Rewriter.clear();
7130 if (Rewrites) {
7131 Changed = true;
7133 MSSAU.get());
7134 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7135 }
7136 }
7137
// No debug records were cached up front, so there is nothing to salvage.
7138 if (SalvageableDVIRecords.empty())
7139 return Changed;
7140
7141 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7142 // expressions composed using the derived iteration count.
7143 // TODO: Allow for multiple IV references for nested AddRecSCEVs
// NOTE: the loop variable `L` below shadows the function parameter `L`; the
// range-for visits the loops yielded by iterating LI (presumably its
// top-level loops - confirm), not only the loop that was passed in.
7144 for (const auto &L : LI) {
7145 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7146 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7147 else {
7148 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7149 "could not be identified.\n");
7150 }
7151 }
7152
// Call clear() on every record before emptying the vector that owns them.
7153 for (auto &Rec : SalvageableDVIRecords)
7154 Rec->clear();
7155 SalvageableDVIRecords.clear();
7156 return Changed;
7157}
7158
7159bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7160 if (skipLoop(L))
7161 return false;
7162
7163 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7164 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7165 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7166 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7167 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7168 *L->getHeader()->getParent());
7169 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7170 *L->getHeader()->getParent());
7171 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7172 *L->getHeader()->getParent());
7173 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7174 MemorySSA *MSSA = nullptr;
7175 if (MSSAAnalysis)
7176 MSSA = &MSSAAnalysis->getMSSA();
7177 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7178}
7179
7182 LPMUpdater &) {
// Run LSR on the loop using the analyses bundled in AR plus the IVUsers
// result. When nothing changed, every analysis is reported as preserved.
7183 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7184 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7185 return PreservedAnalyses::all();
7186
// Something changed: start from the standard loop-pass preserved set and
// additionally mark MemorySSA as preserved when it was supplied (AR.MSSA is
// non-null), since it was handed to ReduceLoopStrength above.
7187 auto PA = getLoopPassPreservedAnalyses();
7188 if (AR.MSSA)
7189 PA.preserve<MemorySSAAnalysis>();
7190 return PA;
7191}
7192
// Definition of the pass's static ID member.
7193char LoopStrengthReduce::ID = 0;
7194
// Pass registration. The BEGIN/END macros take
// (passName, arg, name, cfg, analysis): "loop-reduce" is the arg string,
// "Loop Strength Reduction" is the name, and cfg and analysis are both false.
// NOTE(review): only the LoopSimplify dependency is visible in this listing;
// the embedded line numbers jump between the BEGIN macro and that line, so
// verify the dependency list against the full source.
7195INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7196 "Loop Strength Reduction", false, false)
7202INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7203INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7204 "Loop Strength Reduction", false, false)
7205
7206Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< SCEVUse > &Good, SmallVectorImpl< SCEVUse > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE, bool PreferScalable=false)
If S involves the addition of a constant integer value, return that integer value,...
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static Immediate ExtractImmediateOperand(MutableArrayRef< SCEVUse > Ops, ScalarEvolution &SE, bool PreferScalable)
Extracts an immediate operand from Ops and replaces the operand with zero.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static GlobalValue * ExtractSymbol(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1687
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1554
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1788
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
Value * getCondition() const
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:254
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:314
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:36
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:55
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:49
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:187
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:143
iterator end()
Definition IVUsers.h:145
iterator begin()
Definition IVUsers.h:144
bool empty() const
Definition IVUsers.h:148
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
LLVM_ABI PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< SCEVUse > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
static constexpr auto FlagAnyWrap
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAddRecExpr(SCEVUse Start, SCEVUse Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:237
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Use * op_iterator
Definition User.h:254
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
iterator_range< use_iterator > uses()
Definition Value.h:380
A Value handle that is nullable.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
match_bind< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1687
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:550
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
@ UnusedIndVarInLoop
Definition LoopUtils.h:600
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
SCEVUseT< const SCEV * > SCEVUse
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.