//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
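//
// For example, a simple zero-initialization loop over a byte array, such as
//
//   for (size_t i = 0; i != n; ++i)
//     a[i] = 0;
//
// is replaced with a single call equivalent to memset(a, 0, n), emitted in
// the loop preheader.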
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe
/// to turn into a memset_pattern16, return a ConstantArray of 16 bytes that
/// should be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
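///
/// For example, a strided store of the i32 constant 0x01020304 produces a
/// 16-byte pattern containing four copies of that constant, which
/// memset_pattern16 then replicates across the destination.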
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then
    // we know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the
    // current loop, which indicates a strided load. If we have something
    // else, it's a random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
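  // For example, a hand-unrolled loop that stores to both p[2*i] and p[2*i+1]
  // on each iteration forms one consecutive chain that a single memset covers.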
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
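    // For example, with stores S0..S3 and i == 1, the candidates are visited
    // in the order S2, S3, S0.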
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instruction that starts a chain. Now follow the chain
    // and try to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRecs like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare the positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access
  // is strided positively through memory, we say that the modified location
  // starts at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize.
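  // For example, BECount = 9 with a 4-byte store size refines the location to
  // exactly (9 + 1) * 4 = 40 bytes.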
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] in a loop of 100 iterations will
  // always return MayAlias with a store to &A[100]; we need StoreLoc to be
  // "A" with a size of 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory
// location we're trying to memset. Therefore, we need to recompute the base
// pointer, which is just Start - BECount * Size.
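//
// For example, a loop that stores 4-byte elements downward from &A[BECount]
// to &A[0] has Start == &A[BECount]; the base is recomputed as
// Start - BECount * 4 == &A[0].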
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index
  return SE->getMinusSCEV(Start, Index);
}

/// Compute trip count from the backedge taken count.
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The # stored bytes is (BECount+1). Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
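  //
  // For example, if BECount is the i32 value 0xFFFFFFFF, BECount + 1 wraps to
  // 0 in 32 bits, so zext(BECount + 1) would be wrong; the guard below rules
  // that value out, letting the +1 happen safely before the zero extend.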
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do
/// so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header. This allows us to insert code for them in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);

    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in the default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to
    // be a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same
/// stride, this may be transformable into a memcpy. This kicks in for stuff
/// like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
                StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
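
// For example, when widening a memcpy with a positive stride and LoadOff = 8,
// StoreOff = 0, each iteration reads ahead of what it writes, so a single
// forward-copying memmove is equivalent; with LoadOff = 0, StoreOff = 8 the
// widened copy would re-read bytes it had already overwritten, so it is
// rejected by the check above.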
} // namespace

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header. This allows us to insert code for them in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant sizes; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size,
    // then we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}
1508
1509/// Check if the given conditional branch is based on the comparison between
1510/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1511/// true), the control yields to the loop entry. If the branch matches the
1512/// behavior, the variable involved in the comparison is returned. This function
1513/// will be called to see if the precondition and postcondition of the loop are
1514/// in desirable form.
1515 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1516 bool JmpOnZero = false) {
1517 if (!BI || !BI->isConditional())
1518 return nullptr;
1519
1520 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1521 if (!Cond)
1522 return nullptr;
1523
1524 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1525 if (!CmpZero || !CmpZero->isZero())
1526 return nullptr;
1527
1528 BasicBlock *TrueSucc = BI->getSuccessor(0);
1529 BasicBlock *FalseSucc = BI->getSuccessor(1);
1530 if (JmpOnZero)
1531 std::swap(TrueSucc, FalseSucc);
1532
1533 ICmpInst::Predicate Pred = Cond->getPredicate();
1534 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1535 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1536 return Cond->getOperand(0);
1537
1538 return nullptr;
1539}
1540
1541// Check if the recurrence variable `VarX` is in the right form to create
1542// the idiom. Returns the value coerced to a PHINode if so.
1543 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1544 BasicBlock *LoopEntry) {
1545 auto *PhiX = dyn_cast<PHINode>(VarX);
1546 if (PhiX && PhiX->getParent() == LoopEntry &&
1547 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1548 return PhiX;
1549 return nullptr;
1550}
1551
1552/// Return true iff the idiom is detected in the loop.
1553///
1554/// Additionally:
1555/// 1) \p CntInst is set to the instruction counting the population bit.
1556/// 2) \p CntPhi is set to the corresponding phi node.
1557/// 3) \p Var is set to the value whose population bits are being counted.
1558///
1559/// The core idiom we are trying to detect is:
1560/// \code
1561 /// if (x0 == 0)
1562 /// goto loop-exit // the precondition of the loop
1563/// cnt0 = init-val;
1564/// do {
1565/// x1 = phi (x0, x2);
1566/// cnt1 = phi(cnt0, cnt2);
1567///
1568/// cnt2 = cnt1 + 1;
1569/// ...
1570/// x2 = x1 & (x1 - 1);
1571/// ...
1572/// } while(x != 0);
1573///
1574/// loop-exit:
1575/// \endcode
1576static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1577 Instruction *&CntInst, PHINode *&CntPhi,
1578 Value *&Var) {
1579 // Check that the loop-back branch matches the pattern
1580 // "if (a != 0) goto loop-entry".
1581 BasicBlock *LoopEntry;
1582 Instruction *DefX2, *CountInst;
1583 Value *VarX1, *VarX0;
1584 PHINode *PhiX, *CountPhi;
1585
1586 DefX2 = CountInst = nullptr;
1587 VarX1 = VarX0 = nullptr;
1588 PhiX = CountPhi = nullptr;
1589 LoopEntry = *(CurLoop->block_begin());
1590
1591 // step 1: Check if the loop-back branch is in desirable form.
1592 {
1593 if (Value *T = matchCondition(
1594 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1595 DefX2 = dyn_cast<Instruction>(T);
1596 else
1597 return false;
1598 }
1599
1600 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1601 {
1602 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1603 return false;
1604
1605 BinaryOperator *SubOneOp;
1606
1607 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1608 VarX1 = DefX2->getOperand(1);
1609 else {
1610 VarX1 = DefX2->getOperand(0);
1611 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1612 }
1613 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1614 return false;
1615
1616 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1617 if (!Dec ||
1618 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1619 (SubOneOp->getOpcode() == Instruction::Add &&
1620 Dec->isMinusOne()))) {
1621 return false;
1622 }
1623 }
1624
1625 // step 3: Check the recurrence of variable X
1626 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1627 if (!PhiX)
1628 return false;
1629
1630 // step 4: Find the instruction that counts the population: cnt2 = cnt1 + 1
1631 {
1632 CountInst = nullptr;
1633 for (Instruction &Inst : llvm::make_range(
1634 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1635 if (Inst.getOpcode() != Instruction::Add)
1636 continue;
1637
1638 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1639 if (!Inc || !Inc->isOne())
1640 continue;
1641
1642 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1643 if (!Phi)
1644 continue;
1645
1646 // Check if the result of the instruction is live out of the loop.
1647 bool LiveOutLoop = false;
1648 for (User *U : Inst.users()) {
1649 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1650 LiveOutLoop = true;
1651 break;
1652 }
1653 }
1654
1655 if (LiveOutLoop) {
1656 CountInst = &Inst;
1657 CountPhi = Phi;
1658 break;
1659 }
1660 }
1661
1662 if (!CountInst)
1663 return false;
1664 }
1665
1666 // step 5: check if the precondition is in this form:
1667 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1668 {
1669 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1670 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1671 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1672 return false;
1673
1674 CntInst = CountInst;
1675 CntPhi = CountPhi;
1676 Var = T;
1677 }
1678
1679 return true;
1680}
1681
1682/// Return true if the idiom is detected in the loop.
1683///
1684/// Additionally:
1685/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1686/// or nullptr if there is no such.
1687/// 2) \p CntPhi is set to the corresponding phi node
1688/// or nullptr if there is no such.
1689/// 3) \p Var is set to the value whose CTLZ could be used.
1690/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1691///
1692/// The core idiom we are trying to detect is:
1693/// \code
1694/// if (x0 == 0)
1695/// goto loop-exit // the precondition of the loop
1696/// cnt0 = init-val;
1697/// do {
1698/// x = phi (x0, x.next); //PhiX
1699/// cnt = phi(cnt0, cnt.next);
1700///
1701/// cnt.next = cnt + 1;
1702/// ...
1703/// x.next = x >> 1; // DefX
1704/// ...
1705/// } while(x.next != 0);
1706///
1707/// loop-exit:
1708/// \endcode
1709static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1710 Intrinsic::ID &IntrinID, Value *&InitX,
1711 Instruction *&CntInst, PHINode *&CntPhi,
1712 Instruction *&DefX) {
1713 BasicBlock *LoopEntry;
1714 Value *VarX = nullptr;
1715
1716 DefX = nullptr;
1717 CntInst = nullptr;
1718 CntPhi = nullptr;
1719 LoopEntry = *(CurLoop->block_begin());
1720
1721 // step 1: Check if the loop-back branch is in desirable form.
1722 if (Value *T = matchCondition(
1723 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1724 DefX = dyn_cast<Instruction>(T);
1725 else
1726 return false;
1727
1728 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1729 if (!DefX || !DefX->isShift())
1730 return false;
1731 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1732 Intrinsic::ctlz;
1733 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1734 if (!Shft || !Shft->isOne())
1735 return false;
1736 VarX = DefX->getOperand(0);
1737
1738 // step 3: Check the recurrence of variable X
1739 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1740 if (!PhiX)
1741 return false;
1742
1743 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1744
1745 // Make sure the initial value can't be negative; otherwise the ashr in the
1746 // loop might never reach zero, which would make the loop infinite.
1747 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1748 return false;
1749
1750 // step 4: Find the instruction that does the counting: cnt.next = cnt + 1
1751 // or cnt.next = cnt + -1.
1752 // TODO: We can skip this step. If the loop trip count is known (CTLZ),
1753 // then all uses of "cnt.next" could be optimized to the trip count
1754 // plus "cnt0". Currently it is not optimized.
1755 // This step could be used to detect POPCNT instruction:
1756 // cnt.next = cnt + (x.next & 1)
1757 for (Instruction &Inst : llvm::make_range(
1758 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1759 if (Inst.getOpcode() != Instruction::Add)
1760 continue;
1761
1762 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1763 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1764 continue;
1765
1766 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1767 if (!Phi)
1768 continue;
1769
1770 CntInst = &Inst;
1771 CntPhi = Phi;
1772 break;
1773 }
1774 if (!CntInst)
1775 return false;
1776
1777 return true;
1778}
1779
1780/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1781 /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ is inserted as a
1782 /// new trip count, returns true; otherwise, returns false.
1783bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1784 // Give up if the loop has multiple blocks or multiple backedges.
1785 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1786 return false;
1787
1788 Intrinsic::ID IntrinID;
1789 Value *InitX;
1790 Instruction *DefX = nullptr;
1791 PHINode *CntPhi = nullptr;
1792 Instruction *CntInst = nullptr;
1793 // Helps decide if the transformation is profitable. For the ShiftUntilZero
1794 // idiom, this is always 6.
1795 size_t IdiomCanonicalSize = 6;
1796
1797 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1798 CntInst, CntPhi, DefX))
1799 return false;
1800
1801 bool IsCntPhiUsedOutsideLoop = false;
1802 for (User *U : CntPhi->users())
1803 if (!CurLoop->contains(cast<Instruction>(U))) {
1804 IsCntPhiUsedOutsideLoop = true;
1805 break;
1806 }
1807 bool IsCntInstUsedOutsideLoop = false;
1808 for (User *U : CntInst->users())
1809 if (!CurLoop->contains(cast<Instruction>(U))) {
1810 IsCntInstUsedOutsideLoop = true;
1811 break;
1812 }
1813 // If both CntInst and CntPhi are used outside the loop, the profitability
1814 // is questionable.
1815 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1816 return false;
1817
1818 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1819 // when X is 0. If we cannot guarantee X != 0, we need to check this
1820 // when we expand the intrinsic.
1821 bool ZeroCheck = false;
1822 // It is safe to assume the preheader exists, as it was checked in
1823 // the parent function runOnLoop.
1824 BasicBlock *PH = CurLoop->getLoopPreheader();
1825
1826 // If we are using the count instruction outside the loop, make sure we
1827 // have a zero check as a precondition. Without the check the loop would run
1828 // one iteration before any check of the input value. This means 0 and 1 would
1829 // behave identically in the original loop, so a zero check is needed to tell them apart.
1830 if (!IsCntPhiUsedOutsideLoop) {
1831 auto *PreCondBB = PH->getSinglePredecessor();
1832 if (!PreCondBB)
1833 return false;
1834 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1835 if (!PreCondBI)
1836 return false;
1837 if (matchCondition(PreCondBI, PH) != InitX)
1838 return false;
1839 ZeroCheck = true;
1840 }
1841
1842 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1843 // profitable if we delete the loop.
1844
1845 // The canonical loop has only 6 instructions:
1846 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1847 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1848 // %shr = ashr %n.addr.0, 1
1849 // %tobool = icmp eq %shr, 0
1850 // %inc = add nsw %i.0, 1
1851 // br i1 %tobool
1852
1853 const Value *Args[] = {InitX,
1854 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1855
1856 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1857 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1858 uint32_t HeaderSize =
1859 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1860
1861 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1862 InstructionCost Cost =
1863 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1864 if (HeaderSize != IdiomCanonicalSize &&
1865 Cost > TargetTransformInfo::TCC_Basic)
1866 return false;
1867
1868 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1869 DefX->getDebugLoc(), ZeroCheck,
1870 IsCntPhiUsedOutsideLoop);
1871 return true;
1872}
1873
1874/// Recognizes a population count idiom in a non-countable loop.
1875///
1876/// If detected, transforms the relevant code to issue the popcount intrinsic
1877/// function call, and returns true; otherwise, returns false.
1878bool LoopIdiomRecognize::recognizePopcount() {
1879 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1880 return false;
1881
1882 // Counting the population is usually done with a few arithmetic instructions.
1883 // Such instructions can be easily "absorbed" by vacant slots in a
1884 // non-compact loop. Therefore, recognizing the popcount idiom only makes
1885 // sense in a compact loop.
1886
1887 // Give up if the loop has multiple blocks or multiple backedges.
1888 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1889 return false;
1890
1891 BasicBlock *LoopBody = *(CurLoop->block_begin());
1892 if (LoopBody->size() >= 20) {
1893 // The loop is too big, bail out.
1894 return false;
1895 }
1896
1897 // It should have a preheader containing nothing but an unconditional branch.
1898 BasicBlock *PH = CurLoop->getLoopPreheader();
1899 if (!PH || &PH->front() != PH->getTerminator())
1900 return false;
1901 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1902 if (!EntryBI || EntryBI->isConditional())
1903 return false;
1904
1905 // It should have a precondition block where the generated popcount intrinsic
1906 // function can be inserted.
1907 auto *PreCondBB = PH->getSinglePredecessor();
1908 if (!PreCondBB)
1909 return false;
1910 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1911 if (!PreCondBI || PreCondBI->isUnconditional())
1912 return false;
1913
1914 Instruction *CntInst;
1915 PHINode *CntPhi;
1916 Value *Val;
1917 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1918 return false;
1919
1920 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1921 return true;
1922}
1923
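/// Insert a call to the @llvm.ctpop intrinsic; the emitted IR looks roughly
/// like (an illustrative sketch):
/// \code
///   %cnt = call i32 @llvm.ctpop.i32(i32 %val)
/// \endcode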
1924 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1925 const DebugLoc &DL) {
1926 Value *Ops[] = {Val};
1927 Type *Tys[] = {Val->getType()};
1928
1929 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1930 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1931 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1932 CI->setDebugLoc(DL);
1933
1934 return CI;
1935}
1936
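/// Insert a call to the @llvm.ctlz / @llvm.cttz intrinsic; the emitted IR
/// looks roughly like (an illustrative sketch; the i1 flag is ZeroCheck):
/// \code
///   %cnt = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
/// \endcode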
1937 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1938 const DebugLoc &DL, bool ZeroCheck,
1939 Intrinsic::ID IID) {
1940 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1941 Type *Tys[] = {Val->getType()};
1942
1943 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1944 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1945 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1946 CI->setDebugLoc(DL);
1947
1948 return CI;
1949}
1950
1951/// Transform the following loop (Using CTLZ, CTTZ is similar):
1952/// loop:
1953/// CntPhi = PHI [Cnt0, CntInst]
1954/// PhiX = PHI [InitX, DefX]
1955/// CntInst = CntPhi + 1
1956/// DefX = PhiX >> 1
1957/// LOOP_BODY
1958/// Br: loop if (DefX != 0)
1959/// Use(CntPhi) or Use(CntInst)
1960///
1961/// Into:
1962/// If CntPhi used outside the loop:
1963/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1964/// Count = CountPrev + 1
1965/// else
1966/// Count = BitWidth(InitX) - CTLZ(InitX)
1967/// loop:
1968/// CntPhi = PHI [Cnt0, CntInst]
1969/// PhiX = PHI [InitX, DefX]
1970/// PhiCount = PHI [Count, Dec]
1971/// CntInst = CntPhi + 1
1972/// DefX = PhiX >> 1
1973/// Dec = PhiCount - 1
1974/// LOOP_BODY
1975/// Br: loop if (Dec != 0)
1976/// Use(CountPrev + Cnt0) // Use(CntPhi)
1977/// or
1978/// Use(Count + Cnt0) // Use(CntInst)
1979///
1980/// If LOOP_BODY is empty the loop will be deleted.
1981/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1982void LoopIdiomRecognize::transformLoopToCountable(
1983 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1984 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1985 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1986 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1987
1988 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1989 IRBuilder<> Builder(PreheaderBr);
1990 Builder.SetCurrentDebugLocation(DL);
1991
1992 // If there are no uses of CntPhi, create:
1993 // Count = BitWidth - CTLZ(InitX);
1994 // NewCount = Count;
1995 // If there are uses of CntPhi, create:
1996 // NewCount = BitWidth - CTLZ(InitX >> 1);
1997 // Count = NewCount + 1;
1998 Value *InitXNext;
1999 if (IsCntPhiUsedOutsideLoop) {
2000 if (DefX->getOpcode() == Instruction::AShr)
2001 InitXNext = Builder.CreateAShr(InitX, 1);
2002 else if (DefX->getOpcode() == Instruction::LShr)
2003 InitXNext = Builder.CreateLShr(InitX, 1);
2004 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2005 InitXNext = Builder.CreateShl(InitX, 1);
2006 else
2007 llvm_unreachable("Unexpected opcode!");
2008 } else
2009 InitXNext = InitX;
2010 Value *Count =
2011 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2012 Type *CountTy = Count->getType();
2013 Count = Builder.CreateSub(
2014 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2015 Value *NewCount = Count;
2016 if (IsCntPhiUsedOutsideLoop)
2017 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2018
2019 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2020
2021 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2022 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2023 // If the counter was being incremented in the loop, add NewCount to the
2024 // counter's initial value, but only if the initial value is not zero.
2025 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2026 if (!InitConst || !InitConst->isZero())
2027 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2028 } else {
2029 // If the count was being decremented in the loop, subtract NewCount from
2030 // the counter's initial value.
2031 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2032 }
2033
2034 // Step 2: Insert new IV and loop condition:
2035 // loop:
2036 // ...
2037 // PhiCount = PHI [Count, Dec]
2038 // ...
2039 // Dec = PhiCount - 1
2040 // ...
2041 // Br: loop if (Dec != 0)
2042 BasicBlock *Body = *(CurLoop->block_begin());
2043 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2044 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2045
2046 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2047
2048 Builder.SetInsertPoint(LbCond);
2049 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2050 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2051
2052 TcPhi->addIncoming(Count, Preheader);
2053 TcPhi->addIncoming(TcDec, Body);
2054
2055 CmpInst::Predicate Pred =
2056 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2057 LbCond->setPredicate(Pred);
2058 LbCond->setOperand(0, TcDec);
2059 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2060
2061 // Step 3: All the references to the original counter outside
2062 // the loop are replaced with the NewCount
2063 if (IsCntPhiUsedOutsideLoop)
2064 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2065 else
2066 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2067
2068 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2069 // loop. The loop would otherwise not be deleted even if it becomes empty.
2070 SE->forgetLoop(CurLoop);
2071}
2072
2073void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2074 Instruction *CntInst,
2075 PHINode *CntPhi, Value *Var) {
2076 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2077 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2078 const DebugLoc &DL = CntInst->getDebugLoc();
2079
2080 // Assuming before transformation, the loop looks like this:
2081 // if (x) // the precondition
2082 // do { cnt++; x &= x - 1; } while(x);
2083
2084 // Step 1: Insert the ctpop instruction at the end of the precondition block
2085 IRBuilder<> Builder(PreCondBr);
2086 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2087 {
2088 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2089 NewCount = PopCntZext =
2090 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2091
2092 if (NewCount != PopCnt)
2093 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2094
2095 // TripCnt is exactly the number of iterations the loop has
2096 TripCnt = NewCount;
2097
2098 // If the population counter's initial value is not zero, insert Add Inst.
2099 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2100 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2101 if (!InitConst || !InitConst->isZero()) {
2102 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2103 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2104 }
2105 }
2106
2107 // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
2108 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2109 // function would be partially dead code, and downstream passes would drag
2110 // it back from the precondition block to the preheader.
2111 {
2112 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2113
2114 Value *Opnd0 = PopCntZext;
2115 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2116 if (PreCond->getOperand(0) != Var)
2117 std::swap(Opnd0, Opnd1);
2118
2119 ICmpInst *NewPreCond = cast<ICmpInst>(
2120 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2121 PreCondBr->setCondition(NewPreCond);
2122
2122
2123 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2124 }
2125
2126 // Step 3: Note that the population count is exactly the trip count of the
2127 // loop in question, which enables us to convert the loop from a noncountable
2128 // loop into a countable one. The benefit is twofold:
2129 //
2130 // - If the loop only counts population, the entire loop becomes dead after
2131 // the transformation. It is a lot easier to prove a countable loop dead
2132 // than to prove a noncountable one. (In some C dialects, an infinite loop
2133 // isn't dead even if it computes nothing useful. In general, DCE needs
2134 // to prove a noncountable loop finite before safely deleting it.)
2135 //
2136 // - If the loop also performs something else, it remains alive.
2137 // Since it is transformed to countable form, it can be aggressively
2138 // optimized by some optimizations which are in general not applicable
2139 // to a noncountable loop.
2140 //
2141 // After this step, this loop (conceptually) would look like the following:
2142 // newcnt = __builtin_ctpop(x);
2143 // t = newcnt;
2144 // if (x)
2145 // do { cnt++; x &= x-1; t--; } while (t > 0);
2146 BasicBlock *Body = *(CurLoop->block_begin());
2147 {
2148 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2149 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2150 Type *Ty = TripCnt->getType();
2151
2152 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2153
2154 Builder.SetInsertPoint(LbCond);
2155 Instruction *TcDec = cast<Instruction>(
2156 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2157 "tcdec", false, true));
2158
2159 TcPhi->addIncoming(TripCnt, PreHead);
2160 TcPhi->addIncoming(TcDec, Body);
2161
2162 CmpInst::Predicate Pred =
2163 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2164 LbCond->setPredicate(Pred);
2165 LbCond->setOperand(0, TcDec);
2166 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2167 }
2168
2169 // Step 4: All the references to the original population counter outside
2170 // the loop are replaced with the NewCount -- the value returned from
2171 // __builtin_ctpop().
2172 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2173
2174 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2175 // loop. The loop would otherwise not be deleted even if it becomes empty.
2176 SE->forgetLoop(CurLoop);
2177}
2178
2179/// Match loop-invariant value.
2180template <typename SubPattern_t> struct match_LoopInvariant {
2181 SubPattern_t SubPattern;
2182 const Loop *L;
2183
2184 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2185 : SubPattern(SP), L(L) {}
2186
2187 template <typename ITy> bool match(ITy *V) {
2188 return L->isLoopInvariant(V) && SubPattern.match(V);
2189 }
2190};
2191
2192/// Matches if the value is loop-invariant.
2193template <typename Ty>
2194inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2195 return match_LoopInvariant<Ty>(M, L);
2196}
2197
2198/// Return true if the idiom is detected in the loop.
2199///
2200/// The core idiom we are trying to detect is:
2201/// \code
2202/// entry:
2203/// <...>
2204/// %bitmask = shl i32 1, %bitpos
2205/// br label %loop
2206///
2207/// loop:
2208/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2209/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2210/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2211/// %x.next = shl i32 %x.curr, 1
2212/// <...>
2213/// br i1 %x.curr.isbitunset, label %loop, label %end
2214///
2215/// end:
2216/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2217/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2218/// <...>
2219/// \endcode
2220static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2221 Value *&BitMask, Value *&BitPos,
2222 Value *&CurrX, Instruction *&NextX) {
2224 " Performing shift-until-bittest idiom detection.\n");
2225
2226 // Give up if the loop has multiple blocks or multiple backedges.
2227 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2228 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2229 return false;
2230 }
2231
2232 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2233 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2234 assert(LoopPreheaderBB && "There is always a loop preheader.");
2235
2236 using namespace PatternMatch;
2237
2238 // Step 1: Check if the loop backedge is in desirable form.
2239
2240 ICmpInst::Predicate Pred;
2241 Value *CmpLHS, *CmpRHS;
2242 BasicBlock *TrueBB, *FalseBB;
2243 if (!match(LoopHeaderBB->getTerminator(),
2244 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2245 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2246 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2247 return false;
2248 }
2249
2250 // Step 2: Check if the backedge's condition is in desirable form.
2251
2252 auto MatchVariableBitMask = [&]() {
2253 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2254 match(CmpLHS,
2255 m_c_And(m_Value(CurrX),
2256 m_CombineAnd(
2257 m_Value(BitMask),
2258 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2259 CurLoop))));
2260 };
2261 auto MatchConstantBitMask = [&]() {
2262 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2263 match(CmpLHS, m_And(m_Value(CurrX),
2264 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2265 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2266 };
2267 auto MatchDecomposableConstantBitMask = [&]() {
2268 APInt Mask;
2269 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2270 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2271 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2272 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2273 };
2274
2275 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2276 !MatchDecomposableConstantBitMask()) {
2277 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2278 return false;
2279 }
2280
2281 // Step 3: Check if the recurrence is in desirable form.
2282 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2283 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2284 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2285 return false;
2286 }
2287
2288 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2289 NextX =
2290 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2291
2292 assert(CurLoop->isLoopInvariant(BaseX) &&
2293 "Expected BaseX to be avaliable in the preheader!");
2294
2295 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2296 // FIXME: support right-shift?
2297 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2298 return false;
2299 }
2300
2301 // Step 4: Check if the backedge's destinations are in desirable form.
2302
2303 assert(ICmpInst::isEquality(Pred) &&
2304 "Should only get equality predicates here.");
2305
2306 // cmp-br is commutative, so canonicalize to a single variant.
2307 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2308 Pred = ICmpInst::getInversePredicate(Pred);
2309 std::swap(TrueBB, FalseBB);
2310 }
2311
2312 // We expect to exit the loop when the comparison yields false,
2313 // so when it yields true we should branch back to the loop header.
2314 if (TrueBB != LoopHeaderBB) {
2315 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2316 return false;
2317 }
2318
2319 // Okay, idiom checks out.
2320 return true;
2321}
2322
2323/// Look for the following loop:
2324/// \code
2325/// entry:
2326/// <...>
2327/// %bitmask = shl i32 1, %bitpos
2328/// br label %loop
2329///
2330/// loop:
2331/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2332/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2333/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2334/// %x.next = shl i32 %x.curr, 1
2335/// <...>
2336/// br i1 %x.curr.isbitunset, label %loop, label %end
2337///
2338/// end:
2339/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2340/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2341/// <...>
2342/// \endcode
2343///
2344/// And transform it into:
2345/// \code
2346/// entry:
2347/// %bitmask = shl i32 1, %bitpos
2348/// %lowbitmask = add i32 %bitmask, -1
2349/// %mask = or i32 %lowbitmask, %bitmask
2350/// %x.masked = and i32 %x, %mask
2351/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2352/// i1 true)
2353/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2354/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2355/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2356/// %tripcount = add i32 %backedgetakencount, 1
2357/// %x.curr = shl i32 %x, %backedgetakencount
2358/// %x.next = shl i32 %x, %tripcount
2359/// br label %loop
2360///
2361/// loop:
2362/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2363/// %loop.iv.next = add nuw i32 %loop.iv, 1
2364/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2365/// <...>
2366/// br i1 %loop.ivcheck, label %end, label %loop
2367///
2368/// end:
2369/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2370/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2371/// <...>
2372/// \endcode
2373bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2374 bool MadeChange = false;
2375
2376 Value *X, *BitMask, *BitPos, *XCurr;
2377 Instruction *XNext;
2378 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2379 XNext)) {
2381 " shift-until-bittest idiom detection failed.\n");
2382 return MadeChange;
2383 }
2384 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2385
2386 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2387 // but is it profitable to transform?
2388
2389 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2390 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2391 assert(LoopPreheaderBB && "There is always a loop preheader.");
2392
2393 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2394 assert(SuccessorBB && "There is only a single successor.");
2395
2396 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2397 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2398
2399 Intrinsic::ID IntrID = Intrinsic::ctlz;
2400 Type *Ty = X->getType();
2401 unsigned Bitwidth = Ty->getScalarSizeInBits();
2402
2403 TargetTransformInfo::TargetCostKind CostKind =
2404 TargetTransformInfo::TCK_SizeAndLatency;
2405
2406 // The rewrite is considered to be unprofitable if and only if the
2407 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2408 // making the loop countable, even if nothing else changes.
2409 IntrinsicCostAttributes Attrs(
2410 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2411 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2412 if (Cost > TargetTransformInfo::TCC_Basic) {
2414 " Intrinsic is too costly, not beneficial\n");
2415 return MadeChange;
2416 }
2417 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2418 TargetTransformInfo::TCC_Basic) {
2419 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2420 return MadeChange;
2421 }
2422
2423 // Ok, transform appears worthwhile.
2424 MadeChange = true;
2425
2426 // Step 1: Compute the loop trip count.
2427
2428 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2429 BitPos->getName() + ".lowbitmask");
2430 Value *Mask =
2431 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2432 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2433 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2434 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2435 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2436 Value *XMaskedNumActiveBits = Builder.CreateSub(
2437 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2438 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2439 /*HasNSW=*/Bitwidth != 2);
2440 Value *XMaskedLeadingOnePos =
2441 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2442 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2443 /*HasNSW=*/Bitwidth > 2);
2444
2445 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2446 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2447 /*HasNUW=*/true, /*HasNSW=*/true);
2448 // We know the loop's backedge-taken count, but what's the loop's trip count?
2449 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2450 Value *LoopTripCount =
2451 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2452 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2453 /*HasNSW=*/Bitwidth != 2);
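// Worked example (illustrative): for i32 %x = 1 and %bitpos = 5, mask is
// 0b111111, so x.masked = 1, numactivebits = 1 and leadingonepos = 0; thus
// backedgetakencount = 5 and tripcount = 6, matching the six iterations
// (x.curr = 1, 2, 4, 8, 16, 32) of the original loop.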
2454
2455 // Step 2: Compute the recurrence's final value without a loop.
2456
2457 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2458 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2459 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2460 NewX->takeName(XCurr);
2461 if (auto *I = dyn_cast<Instruction>(NewX))
2462 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2463
2464 Value *NewXNext;
2465 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2466 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2467 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2468 // that isn't the case, we'll need to emit an alternative, safe IR.
2469 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2470 PatternMatch::match(
2471 BitPos, PatternMatch::m_SpecificInt_ICMP(
2472 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2473 Ty->getScalarSizeInBits() - 1))))
2474 NewXNext = Builder.CreateShl(X, LoopTripCount);
2475 else {
2476 // Otherwise, just additionally shift by one. It's the smallest solution,
2477 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2478 // bitwidth(x) - 1) and select 0 instead.
2479 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2480 }
2481
2482 NewXNext->takeName(XNext);
2483 if (auto *I = dyn_cast<Instruction>(NewXNext))
2484 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2485
2486 // Step 3: Adjust the successor basic block to receive the computed
2487 // recurrence's final value instead of the recurrence itself.
2488
2489 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2490 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2491
2492 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2493
2494 // The new canonical induction variable.
2495 Builder.SetInsertPoint(&LoopHeaderBB->front());
2496 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2497
2498 // The induction itself.
2499 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2500 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2501 auto *IVNext =
2502 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2503 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2504
2505 // The loop trip count check.
2506 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2507 CurLoop->getName() + ".ivcheck");
2508 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2509 LoopHeaderBB->getTerminator()->eraseFromParent();
2510
2511 // Populate the IV PHI.
2512 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2513 IV->addIncoming(IVNext, LoopHeaderBB);
2514
2515 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2516 // loop. The loop would otherwise not be deleted even if it becomes empty.
2517
2518 SE->forgetLoop(CurLoop);
2519
2520 // Other passes will take care of actually deleting the loop if possible.
2521
2522 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2523
2524 ++NumShiftUntilBitTest;
2525 return MadeChange;
2526}
2527
2528/// Return true if the idiom is detected in the loop.
2529///
2530/// The core idiom we are trying to detect is:
2531/// \code
2532/// entry:
2533/// <...>
2534/// %start = <...>
2535/// %extraoffset = <...>
2536/// <...>
2537/// br label %for.cond
2538///
2539/// loop:
2540/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2541/// %nbits = add nsw i8 %iv, %extraoffset
2542/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2543/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2544/// %iv.next = add i8 %iv, 1
2545/// <...>
2546/// br i1 %val.shifted.iszero, label %end, label %loop
2547///
2548/// end:
2549/// %iv.res = phi i8 [ %iv, %loop ] <...>
2550/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2551/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2552/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2553/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2554/// <...>
2555/// \endcode
2556 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2557 Instruction *&ValShiftedIsZero,
2558 Intrinsic::ID &IntrinID, Instruction *&IV,
2559 Value *&Start, Value *&Val,
2560 const SCEV *&ExtraOffsetExpr,
2561 bool &InvertedCond) {
2563 " Performing shift-until-zero idiom detection.\n");
2564
2565 // Give up if the loop has multiple blocks or multiple backedges.
2566 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2567 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2568 return false;
2569 }
2570
2571 Instruction *ValShifted, *NBits, *IVNext;
2572 Value *ExtraOffset;
2573
2574 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2575 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2576 assert(LoopPreheaderBB && "There is always a loop preheader.");
2577
2578 using namespace PatternMatch;
2579
2580 // Step 1: Check if the loop backedge and its condition are in desirable form.
2581
2582 ICmpInst::Predicate Pred;
2583 BasicBlock *TrueBB, *FalseBB;
2584 if (!match(LoopHeaderBB->getTerminator(),
2585 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2586 m_BasicBlock(FalseBB))) ||
2587 !match(ValShiftedIsZero,
2588 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2589 !ICmpInst::isEquality(Pred)) {
2590 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2591 return false;
2592 }
2593
2594 // Step 2: Check if the comparison's operand is in desirable form.
2595 // FIXME: Val could be a one-input PHI node, which we should look past.
2596 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2597 m_Instruction(NBits)))) {
2598 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
2599 return false;
2600 }
2601 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2602 : Intrinsic::ctlz;
2603
2604 // Step 3: Check if the shift amount is in desirable form.
2605
2606 if (match(NBits, m_c_Add(m_Instruction(IV),
2607 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2608 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2609 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2610 else if (match(NBits,
2611 m_Sub(m_Instruction(IV),
2612 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2613 NBits->hasNoSignedWrap())
2614 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2615 else {
2616 IV = NBits;
2617 ExtraOffsetExpr = SE->getZero(NBits->getType());
2618 }
2619
2620 // Step 4: Check if the recurrence is in desirable form.
2621 auto *IVPN = dyn_cast<PHINode>(IV);
2622 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2623 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2624 return false;
2625 }
2626
2627 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2628 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2629
2630 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2631 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2632 return false;
2633 }
2634
2635 // Step 5: Check if the backedge's destinations are in desirable form.
2636
2637 assert(ICmpInst::isEquality(Pred) &&
2638 "Should only get equality predicates here.");
2639
2640 // cmp-br is commutative, so canonicalize to a single variant.
2641 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2642 if (InvertedCond) {
2643 Pred = ICmpInst::getInversePredicate(Pred);
2644 std::swap(TrueBB, FalseBB);
2645 }
2646
2647 // We expect to exit the loop when the comparison yields true,
2648 // so when it yields false we should branch back to the loop header.
2649 if (FalseBB != LoopHeaderBB) {
2650 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2651 return false;
2652 }
2653
2654 // The new, countable, loop will certainly only run a known number of
2655 // iterations; it won't be infinite. But the old loop might be infinite
2656 // under certain conditions. For logical shifts, the value will become zero
2657 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2658 // right-shift, iff the sign bit was set, the value will never become zero,
2659 // and the loop may never finish.
2660 if (ValShifted->getOpcode() == Instruction::AShr &&
2661 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2662 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2663 return false;
2664 }
2665
2666 // Okay, idiom checks out.
2667 return true;
2668}
2669
2670/// Look for the following loop:
2671/// \code
2672/// entry:
2673/// <...>
2674/// %start = <...>
2675/// %extraoffset = <...>
2676/// <...>
2677/// br label %for.cond
2678///
2679/// loop:
2680/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2681/// %nbits = add nsw i8 %iv, %extraoffset
2682/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2683/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2684/// %iv.next = add i8 %iv, 1
2685/// <...>
2686/// br i1 %val.shifted.iszero, label %end, label %loop
2687///
2688/// end:
2689/// %iv.res = phi i8 [ %iv, %loop ] <...>
2690/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2691/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2692/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2693/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2694/// <...>
2695/// \endcode
2696///
2697/// And transform it into:
2698/// \code
2699/// entry:
2700/// <...>
2701/// %start = <...>
2702/// %extraoffset = <...>
2703/// <...>
2704/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2705/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2706/// %extraoffset.neg = sub i8 0, %extraoffset
2707/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2708/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2709/// %loop.tripcount = sub i8 %iv.final, %start
2710/// br label %loop
2711///
2712/// loop:
2713/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2714/// %loop.iv.next = add i8 %loop.iv, 1
2715/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2716/// %iv = add i8 %loop.iv, %start
2717/// <...>
2718/// br i1 %loop.ivcheck, label %end, label %loop
2719///
2720/// end:
2721/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2722/// <...>
2723/// \endcode
2724bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2725 bool MadeChange = false;
2726
2727 Instruction *ValShiftedIsZero;
2728 Intrinsic::ID IntrID;
2729 Instruction *IV;
2730 Value *Start, *Val;
2731 const SCEV *ExtraOffsetExpr;
2732 bool InvertedCond;
2733 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2734 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2736 " shift-until-zero idiom detection failed.\n");
2737 return MadeChange;
2738 }
2739 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2740
2741 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2742 // but is it profitable to transform?
2743
2744 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2745 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2746 assert(LoopPreheaderBB && "There is always a loop preheader.");
2747
2748 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2749 assert(SuccessorBB && "There is only a single successor.");
2750
2751 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2752 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2753
2754 Type *Ty = Val->getType();
2755 unsigned Bitwidth = Ty->getScalarSizeInBits();
2756
2757 TargetTransformInfo::TargetCostKind CostKind =
2758 TargetTransformInfo::TCK_SizeAndLatency;
2759
2760 // The rewrite is considered to be unprofitable if and only if the
2761 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2762 // making the loop countable, even if nothing else changes.
2763 IntrinsicCostAttributes Attrs(
2764 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2765 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2766 if (Cost > TargetTransformInfo::TCC_Basic) {
2768 " Intrinsic is too costly, not beneficial\n");
2769 return MadeChange;
2770 }
2771
2772 // Ok, transform appears worthwhile.
2773 MadeChange = true;
2774
2775 bool OffsetIsZero = false;
2776 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2777 OffsetIsZero = ExtraOffsetExprC->isZero();
2778
2779 // Step 1: Compute the loop's final IV value / trip count.
2780
2781 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2782 IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2783 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2784 Value *ValNumActiveBits = Builder.CreateSub(
2785 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2786 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2787 /*HasNSW=*/Bitwidth != 2);
2788
2789 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2790 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2791 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2792
2793 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2794 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2795 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2796 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2797 {ValNumActiveBitsOffset, Start},
2798 /*FMFSource=*/nullptr, "iv.final");
2799
2800 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2801 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2802 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2803 // FIXME: or when the offset was `add nuw`
2804
2805 // We know the loop's backedge-taken count, but what's the loop's trip count?
2806 Value *LoopTripCount =
2807 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2808 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2809 /*HasNSW=*/Bitwidth != 2);
2810
2811 // Step 2: Adjust the successor basic block to receive the original
2812 // induction variable's final value instead of the orig. IV itself.
2813
2814 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2815
2816 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2817
2818 // The new canonical induction variable.
2819 Builder.SetInsertPoint(&LoopHeaderBB->front());
2820 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2821
2822 // The induction itself.
2823 Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2824 auto *CIVNext =
2825 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2826 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2827
2828 // The loop trip count check.
2829 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2830 CurLoop->getName() + ".ivcheck");
2831 auto *NewIVCheck = CIVCheck;
2832 if (InvertedCond) {
2833 NewIVCheck = Builder.CreateNot(CIVCheck);
2834 NewIVCheck->takeName(ValShiftedIsZero);
2835 }
2836
2837 // The original IV, but rebased to be an offset to the CIV.
2838 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2839 /*HasNSW=*/true); // FIXME: what about NUW?
2840 IVDePHId->takeName(IV);
2841
2842 // The loop terminator.
2843 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2844 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2845 LoopHeaderBB->getTerminator()->eraseFromParent();
2846
2847 // Populate the IV PHI.
2848 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2849 CIV->addIncoming(CIVNext, LoopHeaderBB);
2850
2851 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2852 // loop. The loop would otherwise not be deleted even if it becomes empty.
2853
2854 SE->forgetLoop(CurLoop);
2855
2856 // Step 5: Try to cleanup the loop's body somewhat.
2857 IV->replaceAllUsesWith(IVDePHId);
2858 IV->eraseFromParent();
2859
2860 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2861 ValShiftedIsZero->eraseFromParent();
2862
2863 // Other passes will take care of actually deleting the loop if possible.
2864
2865 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2866
2867 ++NumShiftUntilZero;
2868 return MadeChange;
2869}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
assume Assume Builder
static const Function * getParent(const Value *V)
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
#define DEBUG_TYPE
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
This header defines various interfaces for pass management in LLVM.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition: blake3_impl.h:77
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:75
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1516
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:658
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator end()
Definition: BasicBlock.h:325
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:323
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:217
const Instruction & front() const
Definition: BasicBlock.h:335
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:293
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
size_t size() const
Definition: BasicBlock.h:333
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:127
BinaryOps getOpcode() const
Definition: InstrTypes.h:391
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:811
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:748
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:741
@ ICMP_EQ
equal
Definition: InstrTypes.h:739
@ ICMP_NE
not equal
Definition: InstrTypes.h:740
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, OGT -> OLE, OLT -> UGE, etc.
Definition: InstrTypes.h:832
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:808
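As a hedged sketch of how the predicate accessors combine, assuming Cmp is an ICmpInst* whose users expect an equality test:

  if (Cmp->getPredicate() == ICmpInst::ICMP_NE) {
    // Flip NE to EQ; a caller doing this must also swap the branch
    // successors that consume the comparison.
    Cmp->setPredicate(Cmp->getInversePredicate());
  }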
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1242
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2220
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fixed width vector obtained from logBase2 of C.
Definition: Constants.cpp:2664
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:203
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:197
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
Definition: Constants.h:145
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:847
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:403
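A brief sketch of creating and inspecting integer constants with the interfaces above; Int8Ty is an assumed i8 Type*:

  Constant *Byte = ConstantInt::get(Int8Ty, 0xAA); // splats for vector types
  if (auto *CI = dyn_cast<ConstantInt>(Byte)) {
    uint64_t V = CI->getZExtValue(); // 0xAA
    bool AllOnes = CI->isMinusOne(); // false for 0xAA
  }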
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:165
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:130
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:227
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:447
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2307
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2564
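A sketch of emitting the kind of llvm.memset call this pass forms, assuming Builder is positioned in the loop preheader and Dest, ByteVal and Len have already been computed:

  Module *M = Builder.GetInsertBlock()->getModule();
  Function *MemsetFn = Intrinsic::getDeclaration(
      M, Intrinsic::memset, {Dest->getType(), Len->getType()});
  CallInst *NewCall = Builder.CreateCall(
      MemsetFn, {Dest, ByteVal, Len, Builder.getInt1(false)}); // not volatile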
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:358
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:70
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
const BasicBlock * getParent() const
Definition: Instruction.h:90
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:74
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1499
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:168
bool isShift() const
Definition: Instruction.h:175
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:82
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:355
This class provides an interface for updating the loop pass manager based on mutations to the loop nest.
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:214
bool isUnordered() const
Definition: Instructions.h:258
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:220
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:139
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:185
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:267
BlockT * getHeader() const
Definition: LoopInfo.h:105
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:107
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:142
block_iterator block_begin() const
Definition: LoopInfo.h:193
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Definition: LoopInfo.h:992
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
StringRef getName() const
Definition: LoopInfo.h:891
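A hedged sketch of the structural pre-checks a loop-idiom style pass performs through the Loop interface above; the block-count cap is purely illustrative, not the pass's actual rule:

  static bool hasRecognizableShape(const Loop *L) {
    // Need a preheader to host the emitted call, and a canonical
    // single-backedge loop.
    if (!L->getLoopPreheader() || L->getNumBackEdges() != 1)
      return false;
    return L->getNumBlocks() <= 8; // illustrative cap
  }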
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that feed it, giving the original input.
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it, giving the original input.
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:700
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have (use 0 if you really have no idea).
Value * getIncomingValueForBlock(const BasicBlock *BB) const
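For illustration, a minimal recurrence PHI built with the interface above; Header, Preheader, Latch, Start and Next are assumed to exist:

  PHINode *IV = PHINode::Create(Start->getType(), /*NumReservedValues=*/2,
                                "iv", &Header->front());
  IV->addIncoming(Start, Preheader);
  IV->addIncoming(Next, Latch);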
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1750
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect ScalarEvolution's ability to compute a trip count.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
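A sketch that mirrors the shape of the pass's trip-count math without being a verbatim excerpt; SE, L, IntPtrTy and StoreSizeSCEV are assumed to be in scope:

  const SCEV *BECount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BECount))
    return false; // loop is not countable
  // Bytes written = (BECount + 1) * StoreSize, computed in pointer width.
  const SCEV *TripCount = SE.getAddExpr(
      SE.getTruncateOrZeroExtend(BECount, IntPtrTy), SE.getOne(IntPtrTy));
  const SCEV *NumBytes = SE.getMulExpr(TripCount, StoreSizeSCEV);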
A vector that has set insertion semantics.
Definition: SetVector.h:51
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:219
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:152
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its queries in order to avoid complicated analysis.
Definition: MustExecute.h:110
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body and header for instructions that may throw an exception.
Definition: MustExecute.cpp:51
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw or otherwise exit abnormally.
Definition: MustExecute.cpp:47
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instantiations.
Definition: SmallPtrSet.h:344
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
iterator end() const
Definition: SmallPtrSet.h:408
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
iterator begin() const
Definition: SmallPtrSet.h:403
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the 'N' template parameter.
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
Align getAlign() const
Definition: Instructions.h:345
Value * getValueOperand()
Definition: Instructions.h:390
Value * getPointerOperand()
Definition: Instructions.h:393
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Basic
The cost of a typical 'add' instruction.
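A sketch of the gating query a popcount-idiom recognizer would make through this interface; the 32-bit width is illustrative:

  if (TTI.getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false; // don't form ctpop the target will lower poorly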
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:350
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1731
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V" instead of "this" when the use is outside the block.
Definition: Value.cpp:586
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:996
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
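A sketch of the hand-off these Value methods enable once a replacement has been materialized; NewCall and OldResult are assumed:

  NewCall->takeName(OldResult); // preserve the original name
  OldResult->replaceAllUsesWith(NewCall);
  // The now-dead loop computation can then be cleaned up, e.g. via
  // RecursivelyDeleteTriviallyDeadInstructions.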
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition: BTF.h:60
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:119
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1506
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:979
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:716
specificval_ty m_Specific(const Value *V)
Match only if we have the specific, given value.
Definition: PatternMatch.h:772
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:224
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:168
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:537
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:991
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:598
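As a hedged composite example, the matchers above can recognize the branch-on-bit-test shape the 'shift until bit test' recognizer looks for; Header is an assumed loop header BasicBlock*:

  using namespace llvm::PatternMatch;
  ICmpInst::Predicate Pred;
  Value *X, *Mask;
  BasicBlock *TrueBB, *FalseBB;
  bool Matched =
      match(Header->getTerminator(),
            m_Br(m_ICmp(Pred, m_c_And(m_Value(X), m_Value(Mask)), m_Zero()),
                 TrueBB, FalseBB)) &&
      ICmpInst::isEquality(Pred);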
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:537
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on the target, and that in the current Module it is a Function with the right type.
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1118
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
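Sketch of how these helpers combine when emitting a library call such as memset; FTy is an assumed, already-built FunctionType*, and M/TLI are the current Module and TargetLibraryInfo:

  if (!isLibFuncEmittable(M, &TLI, LibFunc_memset))
    return false; // target has no usable memset
  FunctionCallee Fn =
      getOrInsertLibFunc(M, TLI, LibFunc_memset, FTy, AttributeList());
  inferNonMandatoryLibFuncAttrs(M, "memset", TLI);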
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:89
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it is represented with.
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
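For illustration, the two value-tracking queries above in the roles they play here; SI, DL and Cmp are assumed to be a candidate StoreInst*, the DataLayout, and an exit-test ICmpInst*:

  // Every iteration stores the same byte => a memset is possible.
  if (Value *ByteVal = isBytewiseValue(SI->getValueOperand(), DL)) {
    // use ByteVal as the memset fill value
  }
  CmpInst::Predicate Pred;
  Value *X;
  APInt MaskBits;
  if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
                           Pred, X, MaskBits)) {
    // Pred is now EQ or NE on (X & MaskBits)
  }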
constexpr std::nullopt_t None
Definition: None.h:28
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition: Metadata.h:651
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:671
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:674
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:668
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes that apply to both.
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:677
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes of length Len (a length of -1 denotes an unknown size).
Definition: Metadata.h:718
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to the loop passes "below" it in the pipeline.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)