//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases that this kicks in, it can be a significant
// performance win.
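//
// For example (illustrative, not from this file): a loop such as
//
//   for (unsigned i = 0; i != n; ++i)
//     p[i] = 0;
//
// is recognized and replaced with a single call to memset(p, 0, n) placed in
// the loop preheader, after which the now-dead loop can usually be deleted.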
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bit test' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;

    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
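
// For example (illustrative): a loop storing the i32 constant 0x01020304 every
// iteration is not a bytewise splat, but getMemSetPatternValue wraps it into a
// [4 x i32] array of four copies, i.e. a 16-byte pattern that can be handed to
// memset_pattern16 on targets that provide it.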

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which store
  // integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
      (SizeInBits.getFixedSize() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if this store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance of finding a memset
    // opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
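
// For example (illustrative): a hand-unrolled loop such as
//
//   for (unsigned i = 0; i < n; i += 2) {
//     p[i] = 0;
//     p[i + 1] = 0;
//   }
//
// contains two stores, neither of which covers the full stride on its own, but
// as a consecutive pair they touch every byte of the stride, so together they
// can still become a single memset.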

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}
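
// For example (illustrative): a loop such as
//
//   for (unsigned i = 0; i != n; ++i)
//     memcpy(&dst[i], &src[i], sizeof(*dst));
//
// where each iteration copies exactly one stride's worth of bytes can be
// promoted into a single memcpy of (n * sizeof(*dst)) bytes in the preheader.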

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}
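
// For example (illustrative): a loop such as
//
//   for (unsigned i = 0; i != n; ++i)
//     memset(p + i * s, 0, s);
//
// with loop-invariant s has a pointer stride equal to the memset length even
// though s is not a compile-time constant, so the whole loop can be flattened
// into one memset of (n * s) bytes.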

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] in a loop of 100 iterations will
  // always return MayAlias with a store to &A[100]; we need StoreLoc to be
  // "A" with a size of 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index (the index already includes the store size)
  return SE->getMinusSCEV(Start, Index);
}
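
// For example (illustrative): a loop that stores 4 bytes at p, p-4, p-8, ...
// with a backedge-taken count of n-1 writes the region [p - 4*(n-1), p + 4),
// so the equivalent memset must start at p - (n-1)*4 rather than at p.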

/// Compute trip count from the backedge taken count.
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The trip count is (BECount+1). Expand it out to pointer size if it isn't
  // already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}
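
// For example (illustrative): with an i32 backedge-taken count that could be
// UINT32_MAX and a 64-bit index type, zext(BECount) + 1 is 2^32 while
// zext(BECount + 1) wraps to 0, so the +1 is only folded inside the zero
// extend above when the loop guard proves BECount != -1.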

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
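
// For example (illustrative): a countable loop with backedge-taken count n-1
// storing a 4-byte value each iteration yields
// NumBytes = (BECount + 1) * StoreSize = n * 4.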

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);

    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in default address space
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
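
// For example (illustrative): in
//
//   for (unsigned i = 0; i != n; ++i)
//     p[i] = p[i + 1];
//
// the load and store resolve to the same base object with LoadOff == 4 and
// StoreOff == 0 for 4-byte elements. With a positive stride the load stays
// ahead of the store, so the loop can safely become a memmove.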
1318 
1319 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1320  Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
1321  MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1322  Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
1323  const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
1324 
1325  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1326  // conservatively bail here, since otherwise we may have to transform
1327  // llvm.memcpy.inline into llvm.memcpy which is illegal.
1328  if (isa<MemCpyInlineInst>(TheStore))
1329  return false;
1330 
1331  // The trip count of the loop and the base pointer of the addrec SCEV is
1332  // guaranteed to be loop invariant, which means that it should dominate the
1333  // header. This allows us to insert code for it in the preheader.
1334  BasicBlock *Preheader = CurLoop->getLoopPreheader();
1335  IRBuilder<> Builder(Preheader->getTerminator());
1336  SCEVExpander Expander(*SE, *DL, "loop-idiom");
1337 
1338  SCEVExpanderCleaner ExpCleaner(Expander);
1339 
1340  bool Changed = false;
1341  const SCEV *StrStart = StoreEv->getStart();
1342  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1343  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1344 
1345  APInt Stride = getStoreStride(StoreEv);
1346  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1347 
1348  // TODO: Deal with non-constant size; Currently expect constant store size
1349  assert(ConstStoreSize && "store size is expected to be a constant");
1350 
1351  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1352  bool IsNegStride = StoreSize == -Stride;
1353 
1354  // Handle negative strided loops.
1355  if (IsNegStride)
1356  StrStart =
1357  getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1358 
1359  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1360  // this into a memcpy in the loop preheader now if we want. However, this
1361  // would be unsafe to do if there is anything else in the loop that may read
1362  // or write the memory region we're storing to. This includes the load that
1363  // feeds the stores. Check for an alias by generating the base address and
1364  // checking everything.
1365  Value *StoreBasePtr = Expander.expandCodeFor(
1366  StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1367 
1368  // From here on out, conservatively report to the pass manager that we've
1369  // changed the IR, even if we later clean up these added instructions. There
1370  // may be structural differences e.g. in the order of use lists not accounted
1371  // for in just a textual dump of the IR. This is written as a variable, even
1372  // though statically all the places this dominates could be replaced with
1373  // 'true', with the hope that anyone trying to be clever / "more precise" with
1374  // the return value will read this comment, and leave them alone.
1375  Changed = true;
1376 
1377  SmallPtrSet<Instruction *, 2> IgnoredInsts;
1378  IgnoredInsts.insert(TheStore);
1379 
1380  bool IsMemCpy = isa<MemCpyInst>(TheStore);
1381  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1382 
1383  bool LoopAccessStore =
1384  mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1385  StoreSizeSCEV, *AA, IgnoredInsts);
1386  if (LoopAccessStore) {
1387  // For memmove case it's not enough to guarantee that loop doesn't access
1388  // TheStore and TheLoad. Additionally we need to make sure that TheStore is
1389  // the only user of TheLoad.
1390  if (!TheLoad->hasOneUse())
1391  return Changed;
1392  IgnoredInsts.insert(TheLoad);
1393  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1394  BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
1395  ORE.emit([&]() {
1396  return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1397  TheStore)
1398  << ore::NV("Inst", InstRemark) << " in "
1399  << ore::NV("Function", TheStore->getFunction())
1400  << " function will not be hoisted: "
1401  << ore::NV("Reason", "The loop may access store location");
1402  });
1403  return Changed;
1404  }
1405  IgnoredInsts.erase(TheLoad);
1406  }
1407 
1408  const SCEV *LdStart = LoadEv->getStart();
1409  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1410 
1411  // Handle negative strided loops.
1412  if (IsNegStride)
1413  LdStart =
1414  getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1415 
1416  // For a memcpy, we have to make sure that the input array is not being
1417  // mutated by the loop.
1418  Value *LoadBasePtr = Expander.expandCodeFor(
1419  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1420 
1421  // If the store is a memcpy instruction, we must check if it will write to
1422  // the load memory locations. So remove it from the ignored stores.
1423  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1424  if (IsMemCpy && !Verifier.IsSameObject)
1425  IgnoredInsts.erase(TheStore);
1426  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1427  StoreSizeSCEV, *AA, IgnoredInsts)) {
1428  ORE.emit([&]() {
1429  return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1430  << ore::NV("Inst", InstRemark) << " in "
1431  << ore::NV("Function", TheStore->getFunction())
1432  << " function will not be hoisted: "
1433  << ore::NV("Reason", "The loop may access load location");
1434  });
1435  return Changed;
1436  }
1437 
1438  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1439  if (UseMemMove)
1440  if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1441  IsMemCpy))
1442  return Changed;
1443 
1444  if (avoidLIRForMultiBlockLoop())
1445  return Changed;
1446 
1447  // Okay, everything is safe, we can transform this!
1448 
1449  const SCEV *NumBytesS =
1450  getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1451 
1452  Value *NumBytes =
1453  Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1454 
1455  AAMDNodes AATags = TheLoad->getAAMetadata();
1456  AAMDNodes StoreAATags = TheStore->getAAMetadata();
1457  AATags = AATags.merge(StoreAATags);
1458  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1459  AATags = AATags.extendTo(CI->getZExtValue());
1460  else
1461  AATags = AATags.extendTo(-1);
1462 
1463  CallInst *NewCall = nullptr;
1464  // Check whether to generate an unordered atomic memcpy:
1465  // If the load or store are atomic, then they must necessarily be unordered
1466  // by previous checks.
1467  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1468  if (UseMemMove)
1469  NewCall = Builder.CreateMemMove(
1470  StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1471  /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
1472  else
1473  NewCall =
1474  Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
1475  NumBytes, /*isVolatile=*/false, AATags.TBAA,
1476  AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
1477  } else {
1478  // For now don't support unordered atomic memmove.
1479  if (UseMemMove)
1480  return Changed;
1481  // We cannot allow unaligned ops for unordered load/store, so reject
1482  // anything where the alignment isn't at least the element size.
1483  assert((StoreAlign && LoadAlign) &&
1484  "Expect unordered load/store to have align.");
1485  if (StoreAlign.value() < StoreSize || LoadAlign.value() < StoreSize)
1486  return Changed;
1487 
1488  // If the element.atomic memcpy is not lowered into explicit
1489  // loads/stores later, then it will be lowered into an element-size
1490  // specific lib call. If the lib call doesn't exist for our store size, then
1491  // we shouldn't generate the memcpy.
1492  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1493  return Changed;
1494 
1495  // Create the call.
1496  // Note that unordered atomic loads/stores are *required* by the spec to
1497  // have an alignment but non-atomic loads/stores may not.
1498  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1499  StoreBasePtr, StoreAlign.value(), LoadBasePtr, LoadAlign.value(),
1500  NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope,
1501  AATags.NoAlias);
1502  }
1503  NewCall->setDebugLoc(TheStore->getDebugLoc());
1504 
1505  if (MSSAU) {
1506  MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1507  NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1508  MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1509  }
1510 
1511  LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1512  << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1513  << "\n"
1514  << " from store ptr=" << *StoreEv << " at: " << *TheStore
1515  << "\n");
1516 
1517  ORE.emit([&]() {
1518  return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1519  NewCall->getDebugLoc(), Preheader)
1520  << "Formed a call to "
1521  << ore::NV("NewFunction", NewCall->getCalledFunction())
1522  << "() intrinsic from " << ore::NV("Inst", InstRemark)
1523  << " instruction in " << ore::NV("Function", TheStore->getFunction())
1524  << " function"
1525  << ore::setExtraArgs()
1526  << ore::NV("FromBlock", TheStore->getParent()->getName())
1527  << ore::NV("ToBlock", Preheader->getName());
1528  });
1529 
1530  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1531  // and anything that feeds into it.
1532  if (MSSAU)
1533  MSSAU->removeMemoryAccess(TheStore, true);
1534  deleteDeadInstruction(TheStore);
1535  if (MSSAU && VerifyMemorySSA)
1536  MSSAU->getMemorySSA()->verifyMemorySSA();
1537  if (UseMemMove)
1538  ++NumMemMove;
1539  else
1540  ++NumMemCpy;
1541  ExpCleaner.markResultUsed();
1542  return true;
1543 }
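
// For illustration, this is the shape of source loop that the transformation
// above replaces (a sketch with hypothetical names, not from the source):
//
// \code
//   void copy_fwd(char *dst, const char *src, size_t n) {
//     for (size_t i = 0; i != n; ++i)
//       dst[i] = src[i]; // collapses to one memcpy(dst, src, n), or to a
//   }                    // memmove when the two ranges may overlap
// \endcode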
1544 
1545 // When compiling for code size we avoid idiom recognition for a multi-block loop
1546 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1547 //
1548 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1549  bool IsLoopMemset) {
1550  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1551  if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1552  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1553  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1554  << " avoided: multi-block top-level loop\n");
1555  return true;
1556  }
1557  }
1558 
1559  return false;
1560 }
1561 
1562 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1563  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1564  << CurLoop->getHeader()->getParent()->getName()
1565  << "] Noncountable Loop %"
1566  << CurLoop->getHeader()->getName() << "\n");
1567 
1568  return recognizePopcount() || recognizeAndInsertFFS() ||
1569  recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1570 }
1571 
1572 /// Check if the given conditional branch is based on a comparison between
1573 /// a variable and zero, and if the variable is non-zero (or zero, when
1574 /// JmpOnZero is true) control branches to the loop entry. If the branch
1575 /// matches this behavior, the variable involved in the comparison is
1576 /// returned. This function is called to check whether the precondition and
1577 /// postcondition of the loop are in a desirable form.
1578 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1579  bool JmpOnZero = false) {
1580  if (!BI || !BI->isConditional())
1581  return nullptr;
1582 
1583  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1584  if (!Cond)
1585  return nullptr;
1586 
1587  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1588  if (!CmpZero || !CmpZero->isZero())
1589  return nullptr;
1590 
1591  BasicBlock *TrueSucc = BI->getSuccessor(0);
1592  BasicBlock *FalseSucc = BI->getSuccessor(1);
1593  if (JmpOnZero)
1594  std::swap(TrueSucc, FalseSucc);
1595 
1596  ICmpInst::Predicate Pred = Cond->getPredicate();
1597  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1598  (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1599  return Cond->getOperand(0);
1600 
1601  return nullptr;
1602 }
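
// For example, with JmpOnZero == false, matchCondition returns %x for a
// loop-back branch of the following shape (illustrative IR, not a real test):
//
// \code
//   %cmp = icmp ne i32 %x, 0
//   br i1 %cmp, label %loop.entry, label %loop.exit
// \endcode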
1603 
1604 // Check if the recurrence variable `VarX` is in the right form to create
1605 // the idiom. Returns the value coerced to a PHINode if so.
1606 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1607  BasicBlock *LoopEntry) {
1608  auto *PhiX = dyn_cast<PHINode>(VarX);
1609  if (PhiX && PhiX->getParent() == LoopEntry &&
1610  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1611  return PhiX;
1612  return nullptr;
1613 }
1614 
1615 /// Return true iff the idiom is detected in the loop.
1616 ///
1617 /// Additionally:
1618 /// 1) \p CntInst is set to the instruction counting the population bit.
1619 /// 2) \p CntPhi is set to the corresponding phi node.
1620 /// 3) \p Var is set to the value whose population bits are being counted.
1621 ///
1622 /// The core idiom we are trying to detect is:
1623 /// \code
1624 /// if (x0 != 0)
1625 /// goto loop-exit // the precondition of the loop
1626 /// cnt0 = init-val;
1627 /// do {
1628 /// x1 = phi (x0, x2);
1629 /// cnt1 = phi(cnt0, cnt2);
1630 ///
1631 /// cnt2 = cnt1 + 1;
1632 /// ...
1633 /// x2 = x1 & (x1 - 1);
1634 /// ...
1635 /// } while(x != 0);
1636 ///
1637 /// loop-exit:
1638 /// \endcode
1639 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1640  Instruction *&CntInst, PHINode *&CntPhi,
1641  Value *&Var) {
1642  // step 1: Check to see if the loop-back branch matches this pattern:
1643  // "if (a != 0) goto loop-entry".
1644  BasicBlock *LoopEntry;
1645  Instruction *DefX2, *CountInst;
1646  Value *VarX1, *VarX0;
1647  PHINode *PhiX, *CountPhi;
1648 
1649  DefX2 = CountInst = nullptr;
1650  VarX1 = VarX0 = nullptr;
1651  PhiX = CountPhi = nullptr;
1652  LoopEntry = *(CurLoop->block_begin());
1653 
1654  // step 1: Check if the loop-back branch is in desirable form.
1655  {
1656  if (Value *T = matchCondition(
1657  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1658  DefX2 = dyn_cast<Instruction>(T);
1659  else
1660  return false;
1661  }
1662 
1663  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1664  {
1665  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1666  return false;
1667 
1668  BinaryOperator *SubOneOp;
1669 
1670  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1671  VarX1 = DefX2->getOperand(1);
1672  else {
1673  VarX1 = DefX2->getOperand(0);
1674  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1675  }
1676  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1677  return false;
1678 
1679  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1680  if (!Dec ||
1681  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1682  (SubOneOp->getOpcode() == Instruction::Add &&
1683  Dec->isMinusOne()))) {
1684  return false;
1685  }
1686  }
1687 
1688  // step 3: Check the recurrence of variable X
1689  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1690  if (!PhiX)
1691  return false;
1692 
1693  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1694  {
1695  CountInst = nullptr;
1696  for (Instruction &Inst : llvm::make_range(
1697  LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1698  if (Inst.getOpcode() != Instruction::Add)
1699  continue;
1700 
1701  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1702  if (!Inc || !Inc->isOne())
1703  continue;
1704 
1705  PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1706  if (!Phi)
1707  continue;
1708 
1709  // Check if the result of the instruction is live out of the loop.
1710  bool LiveOutLoop = false;
1711  for (User *U : Inst.users()) {
1712  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1713  LiveOutLoop = true;
1714  break;
1715  }
1716  }
1717 
1718  if (LiveOutLoop) {
1719  CountInst = &Inst;
1720  CountPhi = Phi;
1721  break;
1722  }
1723  }
1724 
1725  if (!CountInst)
1726  return false;
1727  }
1728 
1729  // step 5: check if the precondition is in this form:
1730  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1731  {
1732  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1733  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1734  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1735  return false;
1736 
1737  CntInst = CountInst;
1738  CntPhi = CountPhi;
1739  Var = T;
1740  }
1741 
1742  return true;
1743 }
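
// In C terms, the idiom detected above is Kernighan's bit-counting loop
// (an illustrative sketch; the names are made up):
//
// \code
//   int cnt = 0;
//   while (x) {
//     x &= x - 1; // clear the lowest set bit
//     cnt++;
//   }
// \endcode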
1744 
1745 /// Return true if the idiom is detected in the loop.
1746 ///
1747 /// Additionally:
1748 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1749 /// or nullptr if there is no such.
1750 /// 2) \p CntPhi is set to the corresponding phi node
1751 /// or nullptr if there is no such.
1752 /// 3) \p Var is set to the value whose CTLZ could be used.
1753 /// 4) \p DefX is set to the instruction calculating the loop exit condition.
1754 ///
1755 /// The core idiom we are trying to detect is:
1756 /// \code
1757 /// if (x0 == 0)
1758 /// goto loop-exit // the precondition of the loop
1759 /// cnt0 = init-val;
1760 /// do {
1761 /// x = phi (x0, x.next); //PhiX
1762 /// cnt = phi(cnt0, cnt.next);
1763 ///
1764 /// cnt.next = cnt + 1;
1765 /// ...
1766 /// x.next = x >> 1; // DefX
1767 /// ...
1768 /// } while(x.next != 0);
1769 ///
1770 /// loop-exit:
1771 /// \endcode
1772 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1773  Intrinsic::ID &IntrinID, Value *&InitX,
1774  Instruction *&CntInst, PHINode *&CntPhi,
1775  Instruction *&DefX) {
1776  BasicBlock *LoopEntry;
1777  Value *VarX = nullptr;
1778 
1779  DefX = nullptr;
1780  CntInst = nullptr;
1781  CntPhi = nullptr;
1782  LoopEntry = *(CurLoop->block_begin());
1783 
1784  // step 1: Check if the loop-back branch is in desirable form.
1785  if (Value *T = matchCondition(
1786  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1787  DefX = dyn_cast<Instruction>(T);
1788  else
1789  return false;
1790 
1791  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1792  if (!DefX || !DefX->isShift())
1793  return false;
1794  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1795  Intrinsic::ctlz;
1796  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1797  if (!Shft || !Shft->isOne())
1798  return false;
1799  VarX = DefX->getOperand(0);
1800 
1801  // step 3: Check the recurrence of variable X
1802  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1803  if (!PhiX)
1804  return false;
1805 
1806  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1807 
1808  // Make sure the initial value can't be negative otherwise the ashr in the
1809  // loop might never reach zero which would make the loop infinite.
1810  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1811  return false;
1812 
1813  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1814  // or cnt.next = cnt + -1.
1815  // TODO: We can skip the step. If loop trip count is known (CTLZ),
1816  // then all uses of "cnt.next" could be optimized to the trip count
1817  // plus "cnt0". Currently it is not optimized.
1818  // This step could be used to detect POPCNT instruction:
1819  // cnt.next = cnt + (x.next & 1)
1820  for (Instruction &Inst : llvm::make_range(
1821  LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1822  if (Inst.getOpcode() != Instruction::Add)
1823  continue;
1824 
1825  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1826  if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1827  continue;
1828 
1829  PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1830  if (!Phi)
1831  continue;
1832 
1833  CntInst = &Inst;
1834  CntPhi = Phi;
1835  break;
1836  }
1837  if (!CntInst)
1838  return false;
1839 
1840  return true;
1841 }
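
// In C terms, the idiom detected above looks roughly like this sketch
// (hypothetical names; for x != 0 the loop runs bitwidth - clz(x) times):
//
// \code
//   int cnt = cnt0;
//   do {
//     cnt++;
//     x >>= 1;
//   } while (x != 0);
// \endcode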
1842 
1843 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1844 /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1845 /// trip count returns true; otherwise, returns false.
1846 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1847  // Give up if the loop has multiple blocks or multiple backedges.
1848  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1849  return false;
1850 
1851  Intrinsic::ID IntrinID;
1852  Value *InitX;
1853  Instruction *DefX = nullptr;
1854  PHINode *CntPhi = nullptr;
1855  Instruction *CntInst = nullptr;
1856  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1857  // this is always 6.
1858  size_t IdiomCanonicalSize = 6;
1859 
1860  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1861  CntInst, CntPhi, DefX))
1862  return false;
1863 
1864  bool IsCntPhiUsedOutsideLoop = false;
1865  for (User *U : CntPhi->users())
1866  if (!CurLoop->contains(cast<Instruction>(U))) {
1867  IsCntPhiUsedOutsideLoop = true;
1868  break;
1869  }
1870  bool IsCntInstUsedOutsideLoop = false;
1871  for (User *U : CntInst->users())
1872  if (!CurLoop->contains(cast<Instruction>(U))) {
1873  IsCntInstUsedOutsideLoop = true;
1874  break;
1875  }
1876  // If both CntInst and CntPhi are used outside the loop the profitability
1877  // is questionable.
1878  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1879  return false;
1880 
1881  // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X
1882  // is 0. If we cannot guarantee X != 0, we need to check for this when
1883  // expanding the intrinsic.
1884  bool ZeroCheck = false;
1885  // It is safe to assume the preheader exists, as it was checked in the
1886  // parent function runOnLoop.
1887  BasicBlock *PH = CurLoop->getLoopPreheader();
1888 
1889  // If we are using the count instruction outside the loop, make sure we
1890  // have a zero check as a precondition. Without the check the loop would run
1891  // one iteration before any check of the input value. This means 0 and 1
1892  // would have identical behavior in the original loop, while the ctlz-based
1893  // count distinguishes them, so the transform would be wrong without it.
1893  if (!IsCntPhiUsedOutsideLoop) {
1894  auto *PreCondBB = PH->getSinglePredecessor();
1895  if (!PreCondBB)
1896  return false;
1897  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1898  if (!PreCondBI)
1899  return false;
1900  if (matchCondition(PreCondBI, PH) != InitX)
1901  return false;
1902  ZeroCheck = true;
1903  }
1904 
1905  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1906  // profitable if we delete the loop.
1907 
1908  // the loop has only 6 instructions:
1909  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1910  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1911  // %shr = ashr %n.addr.0, 1
1912  // %tobool = icmp eq %shr, 0
1913  // %inc = add nsw %i.0, 1
1914  // br i1 %tobool
1915 
1916  const Value *Args[] = {InitX,
1917  ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1918 
1919  // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1920  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1921  uint32_t HeaderSize =
1922  std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1923 
1924  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1925  InstructionCost Cost =
1926  TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1927  if (HeaderSize != IdiomCanonicalSize &&
1928  Cost > TargetTransformInfo::TCC_Basic)
1929  return false;
1930 
1931  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1932  DefX->getDebugLoc(), ZeroCheck,
1933  IsCntPhiUsedOutsideLoop);
1934  return true;
1935 }
1936 
1937 /// Recognizes a population count idiom in a non-countable loop.
1938 ///
1939 /// If detected, transforms the relevant code to issue the popcount intrinsic
1940 /// function call, and returns true; otherwise, returns false.
1941 bool LoopIdiomRecognize::recognizePopcount() {
1942  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1943  return false;
1944 
1945  // Counting population is usually done with a few arithmetic instructions.
1946  // Such instructions can be easily "absorbed" by vacant slots in a
1947  // non-compact loop. Therefore, recognizing the popcount idiom only makes
1948  // sense in a compact loop.
1949 
1950  // Give up if the loop has multiple blocks or multiple backedges.
1951  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1952  return false;
1953 
1954  BasicBlock *LoopBody = *(CurLoop->block_begin());
1955  if (LoopBody->size() >= 20) {
1956  // The loop is too big, bail out.
1957  return false;
1958  }
1959 
1960  // It should have a preheader containing nothing but an unconditional branch.
1961  BasicBlock *PH = CurLoop->getLoopPreheader();
1962  if (!PH || &PH->front() != PH->getTerminator())
1963  return false;
1964  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1965  if (!EntryBI || EntryBI->isConditional())
1966  return false;
1967 
1968  // It should have a precondition block where the generated popcount intrinsic
1969  // function can be inserted.
1970  auto *PreCondBB = PH->getSinglePredecessor();
1971  if (!PreCondBB)
1972  return false;
1973  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1974  if (!PreCondBI || PreCondBI->isUnconditional())
1975  return false;
1976 
1977  Instruction *CntInst;
1978  PHINode *CntPhi;
1979  Value *Val;
1980  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1981  return false;
1982 
1983  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1984  return true;
1985 }
1986 
1987 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1988  const DebugLoc &DL) {
1989  Value *Ops[] = {Val};
1990  Type *Tys[] = {Val->getType()};
1991 
1992  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1993  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1994  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1995  CI->setDebugLoc(DL);
1996 
1997  return CI;
1998 }
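
// For a 32-bit Val this emits a call of the following shape (illustrative
// IR):
//
// \code
//   %ctpop = call i32 @llvm.ctpop.i32(i32 %val)
// \endcode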
1999 
2000 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2001  const DebugLoc &DL, bool ZeroCheck,
2002  Intrinsic::ID IID) {
2003  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2004  Type *Tys[] = {Val->getType()};
2005 
2006  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
2007  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
2008  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
2009  CI->setDebugLoc(DL);
2010 
2011  return CI;
2012 }
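
// For IID == Intrinsic::ctlz on a 32-bit Val with ZeroCheck set, this emits
// a call of the following shape (illustrative IR):
//
// \code
//   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
// \endcode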
2013 
2014 /// Transform the following loop (Using CTLZ, CTTZ is similar):
2015 /// loop:
2016 /// CntPhi = PHI [Cnt0, CntInst]
2017 /// PhiX = PHI [InitX, DefX]
2018 /// CntInst = CntPhi + 1
2019 /// DefX = PhiX >> 1
2020 /// LOOP_BODY
2021 /// Br: loop if (DefX != 0)
2022 /// Use(CntPhi) or Use(CntInst)
2023 ///
2024 /// Into:
2025 /// If CntPhi used outside the loop:
2026 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2027 /// Count = CountPrev + 1
2028 /// else
2029 /// Count = BitWidth(InitX) - CTLZ(InitX)
2030 /// loop:
2031 /// CntPhi = PHI [Cnt0, CntInst]
2032 /// PhiX = PHI [InitX, DefX]
2033 /// PhiCount = PHI [Count, Dec]
2034 /// CntInst = CntPhi + 1
2035 /// DefX = PhiX >> 1
2036 /// Dec = PhiCount - 1
2037 /// LOOP_BODY
2038 /// Br: loop if (Dec != 0)
2039 /// Use(CountPrev + Cnt0) // Use(CntPhi)
2040 /// or
2041 /// Use(Count + Cnt0) // Use(CntInst)
2042 ///
2043 /// If LOOP_BODY is empty the loop will be deleted.
2044 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2045 void LoopIdiomRecognize::transformLoopToCountable(
2046  Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2047  PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2048  bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
2049  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2050 
2051  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2052  IRBuilder<> Builder(PreheaderBr);
2053  Builder.SetCurrentDebugLocation(DL);
2054 
2055  // If there are no uses of CntPhi, create:
2056  // Count = BitWidth - CTLZ(InitX);
2057  // NewCount = Count;
2058  // If there are uses of CntPhi, create:
2059  // NewCount = BitWidth - CTLZ(InitX >> 1);
2060  // Count = NewCount + 1;
2061  Value *InitXNext;
2062  if (IsCntPhiUsedOutsideLoop) {
2063  if (DefX->getOpcode() == Instruction::AShr)
2064  InitXNext = Builder.CreateAShr(InitX, 1);
2065  else if (DefX->getOpcode() == Instruction::LShr)
2066  InitXNext = Builder.CreateLShr(InitX, 1);
2067  else if (DefX->getOpcode() == Instruction::Shl) // cttz
2068  InitXNext = Builder.CreateShl(InitX, 1);
2069  else
2070  llvm_unreachable("Unexpected opcode!");
2071  } else
2072  InitXNext = InitX;
2073  Value *Count =
2074  createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2075  Type *CountTy = Count->getType();
2076  Count = Builder.CreateSub(
2077  ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2078  Value *NewCount = Count;
2079  if (IsCntPhiUsedOutsideLoop)
2080  Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2081 
2082  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2083 
2084  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2085  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2086  // If the counter was being incremented in the loop, add NewCount to the
2087  // counter's initial value, but only if the initial value is not zero.
2088  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2089  if (!InitConst || !InitConst->isZero())
2090  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2091  } else {
2092  // If the count was being decremented in the loop, subtract NewCount from
2093  // the counter's initial value.
2094  NewCount = Builder.CreateSub(CntInitVal, NewCount);
2095  }
2096 
2097  // Step 2: Insert new IV and loop condition:
2098  // loop:
2099  // ...
2100  // PhiCount = PHI [Count, Dec]
2101  // ...
2102  // Dec = PhiCount - 1
2103  // ...
2104  // Br: loop if (Dec != 0)
2105  BasicBlock *Body = *(CurLoop->block_begin());
2106  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2107  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2108 
2109  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2110 
2111  Builder.SetInsertPoint(LbCond);
2112  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2113  TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2114 
2115  TcPhi->addIncoming(Count, Preheader);
2116  TcPhi->addIncoming(TcDec, Body);
2117 
2118  CmpInst::Predicate Pred =
2119  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2120  LbCond->setPredicate(Pred);
2121  LbCond->setOperand(0, TcDec);
2122  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2123 
2124  // Step 3: All the references to the original counter outside
2125  // the loop are replaced with the NewCount
2126  if (IsCntPhiUsedOutsideLoop)
2127  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2128  else
2129  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2130 
2131  // step 4: Forget the "non-computable" trip-count SCEV associated with the
2132  // loop. The loop would otherwise not be deleted even if it becomes empty.
2133  SE->forgetLoop(CurLoop);
2134 }
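
// A worked example of the preheader computation above, for the
// !IsCntPhiUsedOutsideLoop path with an lshr recurrence (not from the
// source):
//
// \code
//   InitX = 0x00F0            // highest set bit at position 7, so ctlz = 24
//   Count = 32 - ctlz(InitX)  // = 32 - 24 = 8, the number of lshr steps
//                             // needed until the value reaches zero
// \endcode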
2135 
2136 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2137  Instruction *CntInst,
2138  PHINode *CntPhi, Value *Var) {
2139  BasicBlock *PreHead = CurLoop->getLoopPreheader();
2140  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2141  const DebugLoc &DL = CntInst->getDebugLoc();
2142 
2143  // Assuming before transformation, the loop is following:
2144  // if (x) // the precondition
2145  // do { cnt++; x &= x - 1; } while(x);
2146 
2147  // Step 1: Insert the ctpop instruction at the end of the precondition block
2148  IRBuilder<> Builder(PreCondBr);
2149  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2150  {
2151  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2152  NewCount = PopCntZext =
2153  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2154 
2155  if (NewCount != PopCnt)
2156  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2157 
2158  // TripCnt is exactly the number of iterations the loop has
2159  TripCnt = NewCount;
2160 
2161  // If the population counter's initial value is not zero, insert Add Inst.
2162  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2163  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2164  if (!InitConst || !InitConst->isZero()) {
2165  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2166  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2167  }
2168  }
2169 
2170  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2171  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2172  // function would be partial dead code, and downstream passes will drag
2173  // it back from the precondition block to the preheader.
2174  {
2175  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2176 
2177  Value *Opnd0 = PopCntZext;
2178  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2179  if (PreCond->getOperand(0) != Var)
2180  std::swap(Opnd0, Opnd1);
2181 
2182  ICmpInst *NewPreCond = cast<ICmpInst>(
2183  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2184  PreCondBr->setCondition(NewPreCond);
2185 
2185 
2186  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2187  }
2188 
2189  // Step 3: Note that the population count is exactly the trip count of the
2190  // loop in question, which enables us to convert the loop from a
2191  // noncountable loop into a countable one. The benefit is twofold:
2192  //
2193  // - If the loop only counts population, the entire loop becomes dead after
2194  // the transformation. It is a lot easier to prove a countable loop dead
2195  // than to prove a noncountable one. (In some C dialects, an infinite loop
2196  // isn't dead even if it computes nothing useful. In general, DCE needs
2197  // to prove a noncountable loop finite before safely deleting it.)
2198  //
2199  // - If the loop also performs something else, it remains alive.
2200  // Since it is transformed to countable form, it can be aggressively
2201  // optimized by some optimizations which are in general not applicable
2202  // to a noncountable loop.
2203  //
2204  // After this step, this loop (conceptually) would look like following:
2205  // newcnt = __builtin_ctpop(x);
2206  // t = newcnt;
2207  // if (x)
2208  // do { cnt++; x &= x-1; t--; } while (t > 0);
2209  BasicBlock *Body = *(CurLoop->block_begin());
2210  {
2211  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2212  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2213  Type *Ty = TripCnt->getType();
2214 
2215  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2216 
2217  Builder.SetInsertPoint(LbCond);
2218  Instruction *TcDec = cast<Instruction>(
2219  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2220  "tcdec", false, true));
2221 
2222  TcPhi->addIncoming(TripCnt, PreHead);
2223  TcPhi->addIncoming(TcDec, Body);
2224 
2225  CmpInst::Predicate Pred =
2226  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2227  LbCond->setPredicate(Pred);
2228  LbCond->setOperand(0, TcDec);
2229  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2230  }
2231 
2232  // Step 4: All the references to the original population counter outside
2233  // the loop are replaced with the NewCount -- the value returned from
2234  // __builtin_ctpop().
2235  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2236 
2237  // step 5: Forget the "non-computable" trip-count SCEV associated with the
2238  // loop. The loop would otherwise not be deleted even if it becomes empty.
2239  SE->forgetLoop(CurLoop);
2240 }
2241 
2242 /// Match loop-invariant value.
2243 template <typename SubPattern_t> struct match_LoopInvariant {
2244  SubPattern_t SubPattern;
2245  const Loop *L;
2246 
2247  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2248  : SubPattern(SP), L(L) {}
2249 
2250  template <typename ITy> bool match(ITy *V) {
2251  return L->isLoopInvariant(V) && SubPattern.match(V);
2252  }
2253 };
2254 
2255 /// Matches if the value is loop-invariant.
2256 template <typename Ty>
2257 inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2258  return match_LoopInvariant<Ty>(M, L);
2259 }
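
// Example use (mirroring detectShiftUntilBitTestIdiom below): match a
// one-bit mask whose computation is invariant in CurLoop. V is a
// hypothetical operand under inspection:
//
// \code
//   Value *V = ...;
//   Value *BitPos;
//   bool IsInvariantMask =
//       match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop));
// \endcode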
2260 
2261 /// Return true if the idiom is detected in the loop.
2262 ///
2263 /// The core idiom we are trying to detect is:
2264 /// \code
2265 /// entry:
2266 /// <...>
2267 /// %bitmask = shl i32 1, %bitpos
2268 /// br label %loop
2269 ///
2270 /// loop:
2271 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2272 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2273 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2274 /// %x.next = shl i32 %x.curr, 1
2275 /// <...>
2276 /// br i1 %x.curr.isbitunset, label %loop, label %end
2277 ///
2278 /// end:
2279 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2280 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2281 /// <...>
2282 /// \endcode
2283 static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2284  Value *&BitMask, Value *&BitPos,
2285  Value *&CurrX, Instruction *&NextX) {
2287  " Performing shift-until-bittest idiom detection.\n");
2288 
2289  // Give up if the loop has multiple blocks or multiple backedges.
2290  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2291  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2292  return false;
2293  }
2294 
2295  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2296  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2297  assert(LoopPreheaderBB && "There is always a loop preheader.");
2298 
2299  using namespace PatternMatch;
2300 
2301  // Step 1: Check if the loop backedge is in desirable form.
2302 
2303  ICmpInst::Predicate Pred;
2304  Value *CmpLHS, *CmpRHS;
2305  BasicBlock *TrueBB, *FalseBB;
2306  if (!match(LoopHeaderBB->getTerminator(),
2307  m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2308  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2309  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2310  return false;
2311  }
2312 
2313  // Step 2: Check if the backedge's condition is in desirable form.
2314 
2315  auto MatchVariableBitMask = [&]() {
2316  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2317  match(CmpLHS,
2318  m_c_And(m_Value(CurrX),
2319  m_CombineAnd(
2320  m_Value(BitMask),
2321  m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2322  CurLoop))));
2323  };
2324  auto MatchConstantBitMask = [&]() {
2325  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2326  match(CmpLHS, m_And(m_Value(CurrX),
2327  m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2328  (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2329  };
2330  auto MatchDecomposableConstantBitMask = [&]() {
2331  APInt Mask;
2332  return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2333  ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2334  (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2335  (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2336  };
2337 
2338  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2339  !MatchDecomposableConstantBitMask()) {
2340  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2341  return false;
2342  }
2343 
2344  // Step 3: Check if the recurrence is in desirable form.
2345  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2346  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2347  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2348  return false;
2349  }
2350 
2351  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2352  NextX =
2353  dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2354 
2355  assert(CurLoop->isLoopInvariant(BaseX) &&
2356  "Expected BaseX to be avaliable in the preheader!");
2357 
2358  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2359  // FIXME: support right-shift?
2360  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2361  return false;
2362  }
2363 
2364  // Step 4: Check if the backedge's destinations are in desirable form.
2365 
2366  assert(ICmpInst::isEquality(Pred) &&
2367  "Should only get equality predicates here.");
2368 
2369  // cmp-br is commutative, so canonicalize to a single variant.
2370  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2371  Pred = ICmpInst::getInversePredicate(Pred);
2372  std::swap(TrueBB, FalseBB);
2373  }
2374 
2375  // We expect to exit loop when comparison yields false,
2376  // so when it yields true we should branch back to loop header.
2377  if (TrueBB != LoopHeaderBB) {
2378  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2379  return false;
2380  }
2381 
2382  // Okay, idiom checks out.
2383  return true;
2384 }
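
// In C terms, the idiom detected above is roughly (a sketch, hypothetical
// names):
//
// \code
//   while ((x & (1u << bitpos)) == 0)
//     x <<= 1;
// \endcode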
2385 
2386 /// Look for the following loop:
2387 /// \code
2388 /// entry:
2389 /// <...>
2390 /// %bitmask = shl i32 1, %bitpos
2391 /// br label %loop
2392 ///
2393 /// loop:
2394 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2395 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2396 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2397 /// %x.next = shl i32 %x.curr, 1
2398 /// <...>
2399 /// br i1 %x.curr.isbitunset, label %loop, label %end
2400 ///
2401 /// end:
2402 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2403 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2404 /// <...>
2405 /// \endcode
2406 ///
2407 /// And transform it into:
2408 /// \code
2409 /// entry:
2410 /// %bitmask = shl i32 1, %bitpos
2411 /// %lowbitmask = add i32 %bitmask, -1
2412 /// %mask = or i32 %lowbitmask, %bitmask
2413 /// %x.masked = and i32 %x, %mask
2414 /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2415 /// i1 true)
2416 /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2417 /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2418 /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2419 /// %tripcount = add i32 %backedgetakencount, 1
2420 /// %x.curr = shl i32 %x, %backedgetakencount
2421 /// %x.next = shl i32 %x, %tripcount
2422 /// br label %loop
2423 ///
2424 /// loop:
2425 /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2426 /// %loop.iv.next = add nuw i32 %loop.iv, 1
2427 /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2428 /// <...>
2429 /// br i1 %loop.ivcheck, label %end, label %loop
2430 ///
2431 /// end:
2432 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2433 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2434 /// <...>
2435 /// \endcode
2436 bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2437  bool MadeChange = false;
2438 
2439  Value *X, *BitMask, *BitPos, *XCurr;
2440  Instruction *XNext;
2441  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2442  XNext)) {
2444  " shift-until-bittest idiom detection failed.\n");
2445  return MadeChange;
2446  }
2447  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2448 
2449  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2450  // but is it profitable to transform?
2451 
2452  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2453  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2454  assert(LoopPreheaderBB && "There is always a loop preheader.");
2455 
2456  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2457  assert(SuccessorBB && "There is only a single successor.");
2458 
2459  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2460  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2461 
2462  Intrinsic::ID IntrID = Intrinsic::ctlz;
2463  Type *Ty = X->getType();
2464  unsigned Bitwidth = Ty->getScalarSizeInBits();
2465 
2465 
2466  TargetTransformInfo::TargetCostKind CostKind =
2467  TargetTransformInfo::TCK_SizeAndLatency;
2468 
2469  // The rewrite is considered to be unprofitable if and only if the
2470  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2471  // making the loop countable, even if nothing else changes.
2472  IntrinsicCostAttributes Attrs(
2473  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2474  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2475  if (Cost > TargetTransformInfo::TCC_Basic) {
2477  " Intrinsic is too costly, not beneficial\n");
2478  return MadeChange;
2479  }
2480  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2482  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2483  return MadeChange;
2484  }
2485 
2486  // Ok, transform appears worthwhile.
2487  MadeChange = true;
2488 
2489  // Step 1: Compute the loop trip count.
2490 
2491  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2492  BitPos->getName() + ".lowbitmask");
2493  Value *Mask =
2494  Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2495  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2496  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2497  IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2498  /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2499  Value *XMaskedNumActiveBits = Builder.CreateSub(
2500  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2501  XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2502  /*HasNSW=*/Bitwidth != 2);
2503  Value *XMaskedLeadingOnePos =
2504  Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2505  XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2506  /*HasNSW=*/Bitwidth > 2);
2507 
2508  Value *LoopBackedgeTakenCount = Builder.CreateSub(
2509  BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2510  /*HasNUW=*/true, /*HasNSW=*/true);
2511  // We know loop's backedge-taken count, but what's loop's trip count?
2512  // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2513  Value *LoopTripCount =
2514  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2515  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2516  /*HasNSW=*/Bitwidth != 2);
2517 
2518  // Step 2: Compute the recurrence's final value without a loop.
2519 
2520  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2521  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2522  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2523  NewX->takeName(XCurr);
2524  if (auto *I = dyn_cast<Instruction>(NewX))
2525  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2526 
2527  Value *NewXNext;
2528  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2529  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2530  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2531  // that isn't the case, we'll need to emit an alternative, safe IR.
2532  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2533  match(BitPos,
2534  m_SpecificInt_ICMP(ICmpInst::ICMP_NE,
2535  APInt(Ty->getScalarSizeInBits(),
2536  Ty->getScalarSizeInBits() - 1))))
2537  NewXNext = Builder.CreateShl(X, LoopTripCount);
2538  else {
2539  // Otherwise, just additionally shift by one. It's the smallest solution;
2540  // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2541  // bitwidth(x) - 1) and select 0 instead.
2542  NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2543  }
2544 
2545  NewXNext->takeName(XNext);
2546  if (auto *I = dyn_cast<Instruction>(NewXNext))
2547  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2548 
2549  // Step 3: Adjust the successor basic block to receive the computed
2550  // recurrence's final value instead of the recurrence itself.
2551 
2552  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2553  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2554 
2555  // Step 4: Rewrite the loop into a countable form, with canonical IV.
2556 
2557  // The new canonical induction variable.
2558  Builder.SetInsertPoint(&LoopHeaderBB->front());
2559  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2560 
2561  // The induction itself.
2562  // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2563  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2564  auto *IVNext =
2565  Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2566  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2567 
2568  // The loop trip count check.
2569  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2570  CurLoop->getName() + ".ivcheck");
2571  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2572  LoopHeaderBB->getTerminator()->eraseFromParent();
2573 
2574  // Populate the IV PHI.
2575  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2576  IV->addIncoming(IVNext, LoopHeaderBB);
2577 
2578  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2579  // loop. The loop would otherwise not be deleted even if it becomes empty.
2580 
2581  SE->forgetLoop(CurLoop);
2582 
2583  // Other passes will take care of actually deleting the loop if possible.
2584 
2585  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2586 
2587  ++NumShiftUntilBitTest;
2588  return MadeChange;
2589 }
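
// A worked example of the trip-count computation above (not from the
// source): for i32 %x = 0b0100 and %bitpos = 6, mask = 0b1111111, so
// x.masked = 0b0100, numactivebits = 32 - 29 = 3, leadingonepos = 2,
// backedgetakencount = 6 - 2 = 4, and tripcount = 5: the set bit reaches
// position 6 after exactly 4 shifts.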
2590 
2591 /// Return true if the idiom is detected in the loop.
2592 ///
2593 /// The core idiom we are trying to detect is:
2594 /// \code
2595 /// entry:
2596 /// <...>
2597 /// %start = <...>
2598 /// %extraoffset = <...>
2599 /// <...>
2600 /// br label %for.cond
2601 ///
2602 /// loop:
2603 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2604 /// %nbits = add nsw i8 %iv, %extraoffset
2605 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2606 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2607 /// %iv.next = add i8 %iv, 1
2608 /// <...>
2609 /// br i1 %val.shifted.iszero, label %end, label %loop
2610 ///
2611 /// end:
2612 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2613 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2614 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2615 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2616 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2617 /// <...>
2618 /// \endcode
2619 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2620  Instruction *&ValShiftedIsZero,
2621  Intrinsic::ID &IntrinID, Instruction *&IV,
2622  Value *&Start, Value *&Val,
2623  const SCEV *&ExtraOffsetExpr,
2624  bool &InvertedCond) {
2626  " Performing shift-until-zero idiom detection.\n");
2627 
2628  // Give up if the loop has multiple blocks or multiple backedges.
2629  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2630  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2631  return false;
2632  }
2633 
2634  Instruction *ValShifted, *NBits, *IVNext;
2635  Value *ExtraOffset;
2636 
2637  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2638  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2639  assert(LoopPreheaderBB && "There is always a loop preheader.");
2640 
2641  using namespace PatternMatch;
2642 
2643  // Step 1: Check if the loop backedge and its condition are in desirable form.
2644 
2645  ICmpInst::Predicate Pred;
2646  BasicBlock *TrueBB, *FalseBB;
2647  if (!match(LoopHeaderBB->getTerminator(),
2648  m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2649  m_BasicBlock(FalseBB))) ||
2650  !match(ValShiftedIsZero,
2651  m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2652  !ICmpInst::isEquality(Pred)) {
2653  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2654  return false;
2655  }
2656 
2657  // Step 2: Check if the comparison's operand is in desirable form.
2658  // FIXME: Val could be a one-input PHI node, which we should look past.
2659  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2660  m_Instruction(NBits)))) {
2661  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2662  return false;
2663  }
2664  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2665  : Intrinsic::ctlz;
2666 
2667  // Step 3: Check if the shift amount is in desirable form.
2668 
2669  if (match(NBits, m_c_Add(m_Instruction(IV),
2670  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2671  (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2672  ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2673  else if (match(NBits,
2674  m_Sub(m_Instruction(IV),
2675  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2676  NBits->hasNoSignedWrap())
2677  ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2678  else {
2679  IV = NBits;
2680  ExtraOffsetExpr = SE->getZero(NBits->getType());
2681  }
2682 
2683  // Step 4: Check if the recurrence is in desirable form.
2684  auto *IVPN = dyn_cast<PHINode>(IV);
2685  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2686  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2687  return false;
2688  }
2689 
2690  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2691  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2692 
2693  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2694  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2695  return false;
2696  }
2697 
2698  // Step 5: Check if the backedge's destinations are in desirable form.
2699 
2700  assert(ICmpInst::isEquality(Pred) &&
2701  "Should only get equality predicates here.");
2702 
2703  // cmp-br is commutative, so canonicalize to a single variant.
2704  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2705  if (InvertedCond) {
2706  Pred = ICmpInst::getInversePredicate(Pred);
2707  std::swap(TrueBB, FalseBB);
2708  }
2709 
2710  // We expect to exit loop when comparison yields true,
2711  // so when it yields false we should branch back to loop header.
2712  if (FalseBB != LoopHeaderBB) {
2713  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2714  return false;
2715  }
2716 
2717  // The new, countable, loop will certainly only run a known number of
2718  // iterations; it won't be infinite. But the old loop might be infinite
2719  // under certain conditions. For logical shifts, the value will become zero
2720  // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2721  // right-shift, if the sign bit was set, the value will never become zero,
2722  // and the loop may never finish.
2723  if (ValShifted->getOpcode() == Instruction::AShr &&
2724  !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2725  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2726  return false;
2727  }
2728 
2729  // Okay, idiom checks out.
2730  return true;
2731 }
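
// In C terms, the idiom detected above is roughly (a sketch, hypothetical
// names):
//
// \code
//   int iv = start;
//   while ((val >> (iv + extraoffset)) != 0)
//     ++iv;
// \endcode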
2732 
2733 /// Look for the following loop:
2734 /// \code
2735 /// entry:
2736 /// <...>
2737 /// %start = <...>
2738 /// %extraoffset = <...>
2739 /// <...>
2740 /// br label %for.cond
2741 ///
2742 /// loop:
2743 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2744 /// %nbits = add nsw i8 %iv, %extraoffset
2745 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2746 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2747 /// %iv.next = add i8 %iv, 1
2748 /// <...>
2749 /// br i1 %val.shifted.iszero, label %end, label %loop
2750 ///
2751 /// end:
2752 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2753 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2754 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2755 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2756 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2757 /// <...>
2758 /// \endcode
2759 ///
2760 /// And transform it into:
2761 /// \code
2762 /// entry:
2763 /// <...>
2764 /// %start = <...>
2765 /// %extraoffset = <...>
2766 /// <...>
2767 /// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2768 /// %val.numactivebits = sub i8 8, %val.numleadingzeros
2769 /// %extraoffset.neg = sub i8 0, %extraoffset
2770 /// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2771 /// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2772 /// %loop.tripcount = sub i8 %iv.final, %start
2773 /// br label %loop
2774 ///
2775 /// loop:
2776 /// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2777 /// %loop.iv.next = add i8 %loop.iv, 1
2778 /// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2779 /// %iv = add i8 %loop.iv, %start
2780 /// <...>
2781 /// br i1 %loop.ivcheck, label %end, label %loop
2782 ///
2783 /// end:
2784 /// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2785 /// <...>
2786 /// \endcode
2787 bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2788  bool MadeChange = false;
2789 
2790  Instruction *ValShiftedIsZero;
2791  Intrinsic::ID IntrID;
2792  Instruction *IV;
2793  Value *Start, *Val;
2794  const SCEV *ExtraOffsetExpr;
2795  bool InvertedCond;
2796  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2797  Start, Val, ExtraOffsetExpr, InvertedCond)) {
2799  " shift-until-zero idiom detection failed.\n");
2800  return MadeChange;
2801  }
2802  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2803 
2804  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2805  // but is it profitable to transform?
2806 
2807  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2808  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2809  assert(LoopPreheaderBB && "There is always a loop preheader.");
2810 
2811  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2812  assert(SuccessorBB && "There is only a single successor.");
2813 
2814  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2815  Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2816 
2817  Type *Ty = Val->getType();
2818  unsigned Bitwidth = Ty->getScalarSizeInBits();
2819 
2819 
2820  TargetTransformInfo::TargetCostKind CostKind =
2821  TargetTransformInfo::TCK_SizeAndLatency;
2822 
2823  // The rewrite is considered to be unprofitable if and only if the
2824  // intrinsic we'll use is not cheap. Note that we are okay with *just*
2825  // making the loop countable, even if nothing else changes.
2826  IntrinsicCostAttributes Attrs(
2827  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2828  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2829  if (Cost > TargetTransformInfo::TCC_Basic) {
2831  " Intrinsic is too costly, not beneficial\n");
2832  return MadeChange;
2833  }
2834 
2835  // Ok, transform appears worthwhile.
2836  MadeChange = true;
2837 
2838  bool OffsetIsZero = false;
2839  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2840  OffsetIsZero = ExtraOffsetExprC->isZero();
2841 
2842  // Step 1: Compute the loop's final IV value / trip count.
2843 
2844  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2845  IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2846  /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2847  Value *ValNumActiveBits = Builder.CreateSub(
2848  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2849  Val->getName() + ".numactivebits", /*HasNUW=*/true,
2850  /*HasNSW=*/Bitwidth != 2);
2851 
2852  SCEVExpander Expander(*SE, *DL, "loop-idiom");
2853  Expander.setInsertPoint(&*Builder.GetInsertPoint());
2854  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2855 
2856  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2857  ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2858  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2859  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2860  {ValNumActiveBitsOffset, Start},
2861  /*FMFSource=*/nullptr, "iv.final");
2862 
2863  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2864  IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2865  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2866  // FIXME: or when the offset was `add nuw`
2867 
2868  // We know loop's backedge-taken count, but what's loop's trip count?
2869  Value *LoopTripCount =
2870  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2871  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2872  /*HasNSW=*/Bitwidth != 2);
2873 
2874  // Step 2: Adjust the successor basic block to receive the original
2875  // induction variable's final value instead of the orig. IV itself.
2876 
2877  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2878 
2879  // Step 3: Rewrite the loop into a countable form, with canonical IV.
2880 
2881  // The new canonical induction variable.
2882  Builder.SetInsertPoint(&LoopHeaderBB->front());
2883  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2884 
2885  // The induction itself.
2886  Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2887  auto *CIVNext =
2888  Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2889  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2890 
2891  // The loop trip count check.
2892  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2893  CurLoop->getName() + ".ivcheck");
2894  auto *NewIVCheck = CIVCheck;
2895  if (InvertedCond) {
2896  NewIVCheck = Builder.CreateNot(CIVCheck);
2897  NewIVCheck->takeName(ValShiftedIsZero);
2898  }
2899 
2900  // The original IV, but rebased to be an offset to the CIV.
2901  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2902  /*HasNSW=*/true); // FIXME: what about NUW?
2903  IVDePHId->takeName(IV);
2904 
2905  // The loop terminator.
2906  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2907  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2908  LoopHeaderBB->getTerminator()->eraseFromParent();
2909 
2910  // Populate the IV PHI.
2911  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2912  CIV->addIncoming(CIVNext, LoopHeaderBB);
2913 
2914  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2915  // loop. The loop would otherwise not be deleted even if it becomes empty.
2916 
2917  SE->forgetLoop(CurLoop);
2918 
2919  // Step 5: Try to cleanup the loop's body somewhat.
2920  IV->replaceAllUsesWith(IVDePHId);
2921  IV->eraseFromParent();
2922 
2923  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2924  ValShiftedIsZero->eraseFromParent();
2925 
2926  // Other passes will take care of actually deleting the loop if possible.
2927 
2928  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2929 
2930  ++NumShiftUntilZero;
2931  return MadeChange;
2932 }
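
// A worked example of the final-IV computation above (not from the source):
// for i8 %val = 0b00011000 shifted with lshr, %extraoffset = 0 and
// %start = 0: numactivebits = 8 - 3 = 5, iv.final = smax(5, 0) = 5, and
// indeed val >> 5 == 0 while val >> 4 != 0.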
llvm::MemIntrinsicBase::getDestAlign
MaybeAlign getDestAlign() const
Definition: IntrinsicInst.h:837
Scalar.h
llvm::SCEVAddRecExpr::getStart
const SCEV * getStart() const
Definition: ScalarEvolutionExpressions.h:353
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:444
MemorySSAUpdater.h
llvm::Function
Definition: Function.h:60
llvm::cl::location
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:467
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
StringRef.h
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:139
llvm::SCEVExpander
This class uses information about analyze scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:50
llvm::BasicBlock::instructionsWithoutDebug
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:328
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
Statistic.h
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:979
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1498
detectPopcountIdiom
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1639
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:172
LoopAccessAnalysis.h
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:729
llvm::IRBuilder<>
MapVector.h
llvm::GlobalVariable
Definition: GlobalVariable.h:39
llvm::SmallDenseMap
Definition: DenseMap.h:880
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:968
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:741
llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:833
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2202
llvm::initializeLoopIdiomRecognizeLegacyPassPass
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
ValueTracking.h
Local.h
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::getLoopAnalysisUsage
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:142
llvm::SCEVExpander::expandCodeFor
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
Definition: ScalarEvolutionExpander.h:278
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::PatternMatch::m_Br
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
Definition: PatternMatch.h:1731
APInt.h
llvm::AAMDNodes
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:652
ScalarEvolution.h
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1431
llvm::ICmpInst::isEquality
bool isEquality() const
Return true if this predicate is either EQ or NE.
Definition: Instructions.h:1280
Module.h
llvm::AAMDNodes::TBAAStruct
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:672
llvm::createLoopIdiomPass
Pass * createLoopIdiomPass()
Definition: LoopIdiomRecognize.cpp:345
getTripCount
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1054
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:171
llvm::isBytewiseValue
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
Definition: ValueTracking.cpp:3931
llvm::LoopStandardAnalysisResults
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Definition: LoopAnalysisManager.h:51
match_LoopInvariant::match
bool match(ITy *V)
Definition: LoopIdiomRecognize.cpp:2250
llvm::isModOrRefSet
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
llvm::SCEVExpanderCleaner::markResultUsed
void markResultUsed()
Indicate that the result of the expansion is used.
Definition: ScalarEvolutionExpander.h:523
llvm::GlobalValue::UnnamedAddr::Global
@ Global
llvm::BTF::HeaderSize
@ HeaderSize
Definition: BTF.h:60
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1590
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ScalarEvolution::applyLoopGuards
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
Definition: ScalarEvolution.cpp:14801
llvm::MapVector< Value *, StoreList >
llvm::SmallPtrSet< Value *, 16 >
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::DisableLIRP::All
static bool All
When true, the entire pass is disabled.
Definition: LoopIdiomRecognize.h:30
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:749
llvm::ScalarEvolution::isKnownNonNegative
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
Definition: ScalarEvolution.cpp:10670
llvm::ConstantExpr::getExactLogBase2
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2685
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:285
llvm::GlobalValue::setUnnamedAddr
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:225
llvm::ArrayType
Class to represent array types.
Definition: DerivedTypes.h:357
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:260
llvm::ScalarEvolution::getTruncateOrZeroExtend
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
Definition: ScalarEvolution.cpp:4599
llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition: PatternMatch.h:2251
llvm::AAMDNodes::Scope
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:675
llvm::SCEVConstant::getValue
ConstantInt * getValue() const
Definition: ScalarEvolutionExpressions.h:69
llvm::LoopStandardAnalysisResults::DT
DominatorTree & DT
Definition: LoopAnalysisManager.h:54
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:216
llvm::SimpleLoopSafetyInfo
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
llvm::codeview::PointerMode::Pointer
@ Pointer
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:141
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:298
llvm::IRBuilderBase::getInt1
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:444
MustExecute.h
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:265
llvm::MemSetBase::getValue
Value * getValue() const
Definition: IntrinsicInst.h:933
loop
Analysis the ScalarEvolution expression for r is< loop > Outside the loop
Definition: README.txt:8
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::ScalarEvolution::getMulExpr
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:3089
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
Instruction.h
CommandLine.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:164
llvm::StoreInst::getValueOperand
Value * getValueOperand()
Definition: Instructions.h:386
GlobalValue.h
llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition: TargetTransformInfo.cpp:790
llvm::InlinerFunctionImportStatsOpts::No
@ No
llvm::MemorySSAWrapperPass
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:984
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:24
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:649
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::AAResults
Definition: AliasAnalysis.h:294
LoopIdiomRecognize.h
llvm::TargetTransformInfo::getPopcntSupport
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
Definition: TargetTransformInfo.cpp:572
InlinePriorityMode::Cost
@ Cost
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::LocationSize
Definition: MemoryLocation.h:66
llvm::User
Definition: User.h:44
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::Instruction::isAtomic
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
Definition: Instruction.cpp:646
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
SI
@ SI
Definition: SIInstrInfo.cpp:7882
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
TargetLibraryInfo.h
llvm::PatternMatch::m_Instruction
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:716
false
Definition: StackSlotColoring.cpp:141
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2237
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2883
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:395
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:392
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1713
llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition: MemoryLocation.h:101
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:879
llvm::isMustProgress
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1118
LoopUtils.h
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoopIdiomRecognize.cpp:107
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4498
DebugLoc.h
llvm::LPPassManager
Definition: LoopPass.h:76
SmallPtrSet.h
llvm::Instruction::getAAMetadata
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1500
m_LoopInvariant
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
Definition: LoopIdiomRecognize.cpp:2257
llvm::BasicBlock::getModule
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:147
MemmoveVerifier::MemmoveVerifier
MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, const DataLayout &DL)
Definition: LoopIdiomRecognize.cpp:1277
Align
uint64_t Align
Definition: ELFObjHandler.cpp:81
PatternMatch.h
llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:209
llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:341
llvm::MemTransferBase::getSourceAlign
MaybeAlign getSourceAlign() const
Definition: IntrinsicInst.h:898
llvm::PatternMatch::m_Shift
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
Definition: PatternMatch.h:1296
llvm::Loop::getName
StringRef getName() const
Definition: LoopInfo.h:891
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:119
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::MemSetInst
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
Definition: IntrinsicInst.h:1073
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
llvm::BranchInst::getCondition
Value * getCondition() const
Definition: Instructions.h:3214
llvm::SCEV::isNonConstantNegative
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Definition: ScalarEvolution.cpp:452
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4441
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:193
llvm::PatternMatch::m_Power2
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
getStoreStride
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
Definition: LoopIdiomRecognize.cpp:425
UseLIRCodeSizeHeurs
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
llvm::getLoopPassPreservedAnalyses
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Definition: LoopAnalysisManager.cpp:137
Verifier
verify safepoint Safepoint IR Verifier
Definition: SafepointIRVerifier.cpp:251
BasicBlock.h
llvm::cl::opt
Definition: CommandLine.h:1412
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:537
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:297
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:169
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:81
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1185
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:474
uint64_t
llvm::LoopPass
Definition: LoopPass.h:28
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2641
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:165
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:650
llvm::LPMUpdater
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Definition: LoopPassManager.h:262
llvm::SmallPtrSetImpl::end
iterator end() const
Definition: SmallPtrSet.h:408
llvm::LoopBase::getUniqueExitBlocks
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:142
createPopcntIntrinsic
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
Definition: LoopIdiomRecognize.cpp:1987
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2848
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:53
MemoryLocation.h
llvm::MemorySSA
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:700
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:992
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const Instruction *I, const Optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Definition: AliasAnalysis.h:488
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1093
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:403
ArrayRef.h
llvm::Instruction::setDebugLoc
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:351
llvm::SCEVConstant
This class represents a constant integer value.
Definition: ScalarEvolutionExpressions.h:60
llvm::getOrInsertLibFunc
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
Definition: BuildLibCalls.cpp:1277
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:991
llvm::Value::replaceUsesOutsideBlock
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:584
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
match_LoopInvariant::SubPattern
SubPattern_t SubPattern
Definition: LoopIdiomRecognize.cpp:2244
Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:60
detectShiftUntilZeroIdiom
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1772
llvm::TargetLibraryInfo::has
bool has(LibFunc F) const
Tests whether a library function is available.
Definition: TargetLibraryInfo.h:331
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:167
llvm::SCEVExpanderCleaner
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
Definition: ScalarEvolutionExpander.h:509
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:638
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::MemorySSAAnalysis
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:934
llvm::SimpleLoopSafetyInfo::computeLoopSafetyInfo
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:51
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
InstructionCost.h
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
MemmoveVerifier
Definition: LoopIdiomRecognize.cpp:1275
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
getType
static M68kRelType getType(unsigned Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel)
Definition: M68kELFObjectWriter.cpp:48
DisableLIRPMemcpy
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
Definition: MachineInstrBundle.cpp:110
llvm::PatternMatch::m_CombineAnd
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:224
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::MemIntrinsicBase::getDest
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
Definition: IntrinsicInst.h:824
llvm::LoopInfo
Definition: LoopInfo.h:1108
llvm::AAMDNodes::merge
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
Definition: TypeBasedAliasAnalysis.cpp:524
llvm::BinaryOperator
Definition: InstrTypes.h:188
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
DataLayout.h
llvm::PatternMatch::m_SpecificInt_ICMP
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eg/ne/...) to Threshold.
Definition: PatternMatch.h:598
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:137
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:221
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:471
LoopPass.h
llvm::SCEVExpander::setInsertPoint
void setInsertPoint(Instruction *IP)
Set the current insertion point.
Definition: ScalarEvolutionExpander.h:348
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::ConstantInt::isZero
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:194
llvm::LoopBase::isOutermost
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:185
llvm::Instruction::getFunction
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:73
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:804
uint32_t
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::SCEV::FlagNUW
@ FlagNUW
Definition: ScalarEvolution.h:133
llvm::ConstantInt::isMinusOne
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:206
llvm::LoopStandardAnalysisResults::LI
LoopInfo & LI
Definition: LoopAnalysisManager.h:55
llvm::MemoryAccess
Definition: MemorySSA.h:142
idiom
loop idiom
Definition: LoopIdiomRecognize.cpp:342
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:173
ValueHandle.h
matchCondition
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
Definition: LoopIdiomRecognize.cpp:1578
llvm::SimpleLoopSafetyInfo::anyBlockMayThrow
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:47
mayLoopAccessLocation
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, const SCEV *StoreSizeSCEV, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &IgnoredInsts)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
Definition: LoopIdiomRecognize.cpp:1005
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:645
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:13625
detectShiftUntilBitTestIdiom
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:2283
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:142
llvm::BasicBlock::front
const Instruction & front() const
Definition: BasicBlock.h:318
llvm::LoopIdiomRecognizePass::run
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Definition: LoopIdiomRecognize.cpp:313
llvm::DisableLIRP::Memcpy
static bool Memcpy
When true, Memcpy is disabled.
Definition: LoopIdiomRecognize.h:36
j
return j(j<< 16)
llvm::isLibFuncEmittable
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
Definition: BuildLibCalls.cpp:1353
getNumBytes
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1084
Constant.h
llvm::ScalarEvolution::forgetLoop
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
Definition: ScalarEvolution.cpp:8371
llvm::SCEVAddRecExpr::getLoop
const Loop * getLoop() const
Definition: ScalarEvolutionExpressions.h:354
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:50
llvm::decomposeBitTestICmp
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
Definition: CmpInstAnalysis.cpp:76
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition: ScalarEvolution.cpp:4549
llvm::None
constexpr std::nullopt_t None
Definition: None.h:27
MemmoveVerifier::loadAndStoreMayFormMemmove
bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride, const Instruction &TheLoad, bool IsMemCpy) const
Definition: LoopIdiomRecognize.cpp:1285
llvm::AAMDNodes::NoAlias
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:678
llvm::MemIntrinsicBase::getLength
Value * getLength() const
Definition: IntrinsicInst.h:815
llvm::LoopStandardAnalysisResults::TTI
TargetTransformInfo & TTI
Definition: LoopAnalysisManager.h:58
getMemSetPatternValue
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
Definition: LoopIdiomRecognize.cpp:436
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::PHINode::Create
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Definition: Instructions.h:2740
llvm::ore::setExtraArgs
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
Definition: OptimizationRemarkEmitter.h:138
llvm::ConstantInt::getBool
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:841
GlobalVariable.h
llvm::IRBuilderBase::GetInsertBlock
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:173
llvm::MemorySSA::BeforeTerminator
@ BeforeTerminator
Definition: MemorySSA.h:788
llvm::TypeSize
Definition: TypeSize.h:435
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:342
Casting.h
idioms
loop Recognize loop idioms
Definition: LoopIdiomRecognize.cpp:343
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:105
PassManager.h
llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:215
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:225
llvm::SCEV::isOne
bool isOne() const
Return true if the expression is a constant one.
Definition: ScalarEvolution.cpp:440
llvm::ConstantArray::get
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1241
llvm::GetPointerBaseWithConstantOffset
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
Definition: ValueTracking.h:272
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:614
llvm::PrevailingType::Yes
@ Yes
llvm::LoopBase::getNumBackEdges
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:267
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::CmpInst::setPredicate
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:812
isSimple
static bool isSimple(Instruction *I)
Definition: SLPVectorizer.cpp:786
llvm::LoopStandardAnalysisResults::SE
ScalarEvolution & SE
Definition: LoopAnalysisManager.h:56
llvm::LoopStandardAnalysisResults::AA
AAResults & AA
Definition: LoopAnalysisManager.h:52
llvm::AAMDNodes::TBAA
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:669
llvm::SCEVConstant::getAPInt
const APInt & getAPInt() const
Definition: ScalarEvolutionExpressions.h:70
MemmoveVerifier::IsSameObject
const bool IsSameObject
Definition: LoopIdiomRecognize.cpp:1316
ScalarEvolutionExpressions.h
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:689
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
MemorySSA.h
llvm::ScalarEvolution::getZeroExtendExpr
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
Definition: ScalarEvolution.cpp:1604
llvm::GlobalValue::PrivateLinkage
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
Instructions.h
llvm::MemCpyInst
This class wraps the llvm.memcpy intrinsic.
Definition: IntrinsicInst.h:1125
llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:210
llvm::MemIntrinsic::isVolatile
bool isVolatile() const
Definition: IntrinsicInst.h:1050
SmallVector.h
match_LoopInvariant::match_LoopInvariant
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
Definition: LoopIdiomRecognize.cpp:2247
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:772
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:354
User.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:389
llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1394
llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:742
Dominators.h
llvm::BasicBlock::size
size_t size() const
Definition: BasicBlock.h:316
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::CmpInst::getPredicate
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:809
llvm::ScalarEvolution::getZero
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
Definition: ScalarEvolution.h:646
getStartForNegStride
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
Definition: LoopIdiomRecognize.cpp:1039
TargetTransformInfo.h
llvm::LoopStandardAnalysisResults::MSSA
MemorySSA * MSSA
Definition: LoopAnalysisManager.h:61
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
llvm::PHINode
Definition: Instructions.h:2698
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:119
llvm::Optional::value
constexpr const T & value() const &
Definition: Optional.h:281
IV
static const uint32_t IV[8]
Definition: blake3_impl.h:85
llvm::SmallVectorImpl< BasicBlock * >
llvm::TargetTransformInfo::getAtomicMemIntrinsicMaxElementSize
unsigned getAtomicMemIntrinsicMaxElementSize() const
Definition: TargetTransformInfo.cpp:1043
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
DerivedTypes.h
llvm::SmallPtrSetImpl< Instruction * >
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:402
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2492
createFFSIntrinsic
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
Definition: LoopIdiomRecognize.cpp:2000
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1473
llvm::ConstantInt::isOne
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:200
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::LoadInst::isUnordered
bool isUnordered() const
Definition: Instructions.h:254
DisableLIRPMemset
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
deleteDeadInstruction
static void deleteDeadInstruction(Instruction *I)
Definition: LoopIdiomRecognize.cpp:347
llvm::inferNonMandatoryLibFuncAttrs
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
Definition: BuildLibCalls.cpp:252
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::VerifyMemorySSA
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:91
LLVMContext.h
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:244
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:381
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::PatternMatch::m_BasicBlock
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:168
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:413
llvm::GlobalObject::setAlignment
void setAlignment(MaybeAlign Align)
Definition: Globals.cpp:126
llvm::SCEVNAryExpr::getOperand
const SCEV * getOperand(unsigned i) const
Definition: ScalarEvolutionExpressions.h:201
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3133
raw_ostream.h
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:8241
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
Value.h
getRecurrenceVar
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
Definition: LoopIdiomRecognize.cpp:1606
llvm::LoopStandardAnalysisResults::TLI
TargetLibraryInfo & TLI
Definition: LoopAnalysisManager.h:57
InitializePasses.h
BuildLibCalls.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1111
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2152
llvm::BranchInst::isConditional
bool isConditional() const
Definition: Instructions.h:3212
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2269
llvm::BranchInst::getSuccessor
BasicBlock * getSuccessor(unsigned i) const
Definition: Instructions.h:3226
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:210
llvm::DisableLIRP::Memset
static bool Memset
When true, Memset is disabled.
Definition: LoopIdiomRecognize.h:33
SetVector.h
llvm::AAMDNodes::extendTo
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition: Metadata.h:719
llvm::MemTransferBase::getSource
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Definition: IntrinsicInst.h:884
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1732