//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
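//
// For example, a simple zero-initialization loop such as
//
//   for (unsigned i = 0; i != n; ++i)
//     p[i] = 0;
//
// may become a single call to memset(p, 0, n), and a copy loop such as
// "for (i) A[i] = B[i];" may become a call to memcpy (an illustrative
// sketch; the exact transforms are implemented below).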
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
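//
// A sketch of the kind of ctpop (population count) idiom meant here (added
// for illustration, not part of the original comment):
//
//   while (x) {   // clears the lowest set bit each iteration
//     x &= x - 1;
//     ++cnt;
//   }
//
// which can be replaced by a single llvm.ctpop intrinsic call.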
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bit test' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };
197 
198  /// \name Countable Loop Idiom Handling
199  /// @{
200 
201  bool runOnCountableLoop();
202  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
203  SmallVectorImpl<BasicBlock *> &ExitBlocks);
204 
205  void collectStores(BasicBlock *BB);
206  LegalStoreKind isLegalStore(StoreInst *SI);
207  enum class ForMemset { No, Yes };
208  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
209  ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;

    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}
351 
352 //===----------------------------------------------------------------------===//
353 //
354 // Implementation of LoopIdiomRecognize
355 //
356 //===----------------------------------------------------------------------===//
357 
bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
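///
/// For example (an illustrative note, not from the original comment): a
/// strided store of the i32 constant 0x01020304 yields a ConstantArray of
/// four i32 elements, i.e. the 4-byte value replicated to fill 16 bytes,
/// which is then handed to memset_pattern16.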
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;
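  // Note (added for clarity): (Size & 7) rejects sizes that are not a
  // multiple of 8 bits, and (Size & (Size - 1)) is zero exactly when Size is
  // a power of two, e.g. 8, 16, 32, or 64 bits.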

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
      (SizeInBits.getFixedSize() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic are not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance of finding a memset
    // opportunity.
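    // For example (an illustrative note, not from the original comment):
    // with i == 2 and e == 6, the visit order is 3, 4, 5, then 1, 0.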
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass would not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] of 100 will always return may alias
  // with a store to &A[100]; we need StoreLoc to be "A" with size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
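//
// A worked example (added for illustration, not from the original comment):
// for a loop storing single bytes to A[n-1], A[n-2], ..., A[0], Start is
// &A[n-1] and BECount is n-1, so the recomputed base pointer is
// &A[n-1] - (n-1) = &A[0], which is what memset expects.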
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index (the index already includes the store size)
  return SE->getMinusSCEV(Start, Index);
}

/// Compute trip count from the backedge taken count.
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The # stored bytes is (BECount+1). Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
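  //
  // For instance (an illustrative note, not from the original comment): if
  // BECount has type i32, IntPtr is i64, and the loop guard proves
  // BECount != -1, then zext(BECount + 1) is computed in i32 without
  // overflow and usually folds better than zext(BECount) + 1 in i64.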
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(Start, *SE))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(NumBytesS, *SE))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);

    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
                    StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }
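
  // An illustrative example (not part of the original source): for a
  // positive-stride copy such as "A[i] = A[i + 1]", LoadOff equals
  // StoreOff + StoreSize, so every byte is read before it is overwritten
  // and a memmove is still correct; "A[i + 1] = A[i]" has
  // LoadOff < StoreOff + StoreSize and is rejected above.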

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy, which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

1408 
1409  const SCEV *LdStart = LoadEv->getStart();
1410  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1411 
1412  // Handle negative strided loops.
1413  if (IsNegStride)
1414  LdStart =
1415  getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1416 
1417  // For a memcpy, we have to make sure that the input array is not being
1418  // mutated by the loop.
1419  Value *LoadBasePtr = Expander.expandCodeFor(
1420  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1421 
1422  // If the store is a memcpy instruction, we must check if it will write to
1423  // the load memory locations. So remove it from the ignored stores.
1424  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1425  if (IsMemCpy && !Verifier.IsSameObject)
1426  IgnoredInsts.erase(TheStore);
1427  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1428  StoreSizeSCEV, *AA, IgnoredInsts)) {
1429  ORE.emit([&]() {
1430  return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1431  << ore::NV("Inst", InstRemark) << " in "
1432  << ore::NV("Function", TheStore->getFunction())
1433  << " function will not be hoisted: "
1434  << ore::NV("Reason", "The loop may access load location");
1435  });
1436  return Changed;
1437  }
1438 
1439  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1440  if (UseMemMove)
1441  if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1442  IsMemCpy))
1443  return Changed;
1444 
1445  if (avoidLIRForMultiBlockLoop())
1446  return Changed;
1447 
1448  // Okay, everything is safe, we can transform this!
1449 
1450  const SCEV *NumBytesS =
1451  getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1452 
1453  Value *NumBytes =
1454  Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1455 
1456  AAMDNodes AATags = TheLoad->getAAMetadata();
1457  AAMDNodes StoreAATags = TheStore->getAAMetadata();
1458  AATags = AATags.merge(StoreAATags);
1459  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1460  AATags = AATags.extendTo(CI->getZExtValue());
1461  else
1462  AATags = AATags.extendTo(-1);
1463 
1464  CallInst *NewCall = nullptr;
1465  // Check whether to generate an unordered atomic memcpy:
1466  // If the load or store are atomic, then they must necessarily be unordered
1467  // by previous checks.
1468  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1469  if (UseMemMove)
1470  NewCall = Builder.CreateMemMove(
1471  StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1472  /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
1473  else
1474  NewCall =
1475  Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
1476  NumBytes, /*isVolatile=*/false, AATags.TBAA,
1477  AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
1478  } else {
1479  // For now don't support unordered atomic memmove.
1480  if (UseMemMove)
1481  return Changed;
1482  // We cannot allow unaligned ops for unordered load/store, so reject
1483  // anything where the alignment isn't at least the element size.
1484  assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
1485  "Expect unordered load/store to have align.");
1486  if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
1487  return Changed;
1488 
1489  // If the element.atomic memcpy is not lowered into explicit
1490  // loads/stores later, then it will be lowered into an element-size
1491  // specific lib call. If the lib call doesn't exist for our store size, then
1492  // we shouldn't generate the memcpy.
1493  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1494  return Changed;
1495 
1496  // Create the call.
1497  // Note that unordered atomic loads/stores are *required* by the spec to
1498  // have an alignment but non-atomic loads/stores may not.
1499  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1500  StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1501  NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope,
1502  AATags.NoAlias);
1503  }
1504  NewCall->setDebugLoc(TheStore->getDebugLoc());
1505 
1506  if (MSSAU) {
1507  MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1508  NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1509  MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1510  }
1511 
1512  LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1513  << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1514  << "\n"
1515  << " from store ptr=" << *StoreEv << " at: " << *TheStore
1516  << "\n");
1517 
1518  ORE.emit([&]() {
1519  return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1520  NewCall->getDebugLoc(), Preheader)
1521  << "Formed a call to "
1522  << ore::NV("NewFunction", NewCall->getCalledFunction())
1523  << "() intrinsic from " << ore::NV("Inst", InstRemark)
1524  << " instruction in " << ore::NV("Function", TheStore->getFunction())
1525  << " function"
1526  << ore::setExtraArgs()
1527  << ore::NV("FromBlock", TheStore->getParent()->getName())
1528  << ore::NV("ToBlock", Preheader->getName());
1529  });
1530 
1531  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1532  // and anything that feeds into it.
1533  if (MSSAU)
1534  MSSAU->removeMemoryAccess(TheStore, true);
1535  deleteDeadInstruction(TheStore);
1536  if (MSSAU && VerifyMemorySSA)
1537  MSSAU->getMemorySSA()->verifyMemorySSA();
1538  if (UseMemMove)
1539  ++NumMemMove;
1540  else
1541  ++NumMemCpy;
1542  ExpCleaner.markResultUsed();
1543  return true;
1544 }
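// For orientation, the canonical source-level loop this path rewrites is
// roughly (an illustrative sketch, not taken from an actual test case):
//
//   for (unsigned i = 0; i != n; ++i)
//     dst[i] = src[i];
//
// which becomes a single memcpy of n elements, or a memmove when dst and
// src turn out to be parts of the same underlying object.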
1545 
1546 // When compiling for code size we avoid idiom recognition for a multi-block loop
1547 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1548 //
1549 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1550  bool IsLoopMemset) {
1551  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1552  if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1553  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1554  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1555  << " avoided: multi-block top-level loop\n");
1556  return true;
1557  }
1558  }
1559 
1560  return false;
1561 }
1562 
1563 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1564  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1565  << CurLoop->getHeader()->getParent()->getName()
1566  << "] Noncountable Loop %"
1567  << CurLoop->getHeader()->getName() << "\n");
1568 
1569  return recognizePopcount() || recognizeAndInsertFFS() ||
1570  recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1571 }
1572 
1573 /// Check if the given conditional branch is based on a comparison between
1574 /// a variable and zero, and whether, when the variable is non-zero (or zero, if
1575 /// JmpOnZero is true), the control yields to the loop entry. If the branch matches this
1576 /// behavior, the variable involved in the comparison is returned. This function
1577 /// will be called to see if the precondition and postcondition of the loop are
1578 /// in desirable form.
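/// For example, with \p JmpOnZero == false this matches
/// \code
///   %cond = icmp ne i32 %x, 0
///   br i1 %cond, label %loop-entry, label %other
/// \endcode
/// and returns %x (the value names here are illustrative).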
1579 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1580  bool JmpOnZero = false) {
1581  if (!BI || !BI->isConditional())
1582  return nullptr;
1583 
1584  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1585  if (!Cond)
1586  return nullptr;
1587 
1588  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1589  if (!CmpZero || !CmpZero->isZero())
1590  return nullptr;
1591 
1592  BasicBlock *TrueSucc = BI->getSuccessor(0);
1593  BasicBlock *FalseSucc = BI->getSuccessor(1);
1594  if (JmpOnZero)
1595  std::swap(TrueSucc, FalseSucc);
1596 
1597  ICmpInst::Predicate Pred = Cond->getPredicate();
1598  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1599  (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1600  return Cond->getOperand(0);
1601 
1602  return nullptr;
1603 }
1604 
1605 // Check if the recurrence variable `VarX` is in the right form to create
1606 // the idiom. Returns the value coerced to a PHINode if so.
1607 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1608  BasicBlock *LoopEntry) {
1609  auto *PhiX = dyn_cast<PHINode>(VarX);
1610  if (PhiX && PhiX->getParent() == LoopEntry &&
1611  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1612  return PhiX;
1613  return nullptr;
1614 }
1615 
1616 /// Return true iff the idiom is detected in the loop.
1617 ///
1618 /// Additionally:
1619 /// 1) \p CntInst is set to the instruction counting the population.
1620 /// 2) \p CntPhi is set to the corresponding phi node.
1621 /// 3) \p Var is set to the value whose population bits are being counted.
1622 ///
1623 /// The core idiom we are trying to detect is:
1624 /// \code
1625 /// if (x0 == 0)
1626 /// goto loop-exit // the precondition of the loop
1627 /// cnt0 = init-val;
1628 /// do {
1629 /// x1 = phi (x0, x2);
1630 /// cnt1 = phi(cnt0, cnt2);
1631 ///
1632 /// cnt2 = cnt1 + 1;
1633 /// ...
1634 /// x2 = x1 & (x1 - 1);
1635 /// ...
1636 /// } while (x2 != 0);
1637 ///
1638 /// loop-exit:
1639 /// \endcode
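/// At the source level this is Kernighan's bit-counting loop, roughly
/// (an illustrative sketch; the guard mirrors the precondition above):
/// \code
/// int cnt = init_val;
/// if (x != 0) {
///   do {
///     ++cnt;
///     x &= x - 1; // clear the lowest set bit
///   } while (x != 0);
/// }
/// \endcode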
1640 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1641  Instruction *&CntInst, PHINode *&CntPhi,
1642  Value *&Var) {
1643  // step 1: Check to see if the loop-back branch matches this pattern:
1644  // "if (a != 0) goto loop-entry".
1645  BasicBlock *LoopEntry;
1646  Instruction *DefX2, *CountInst;
1647  Value *VarX1, *VarX0;
1648  PHINode *PhiX, *CountPhi;
1649 
1650  DefX2 = CountInst = nullptr;
1651  VarX1 = VarX0 = nullptr;
1652  PhiX = CountPhi = nullptr;
1653  LoopEntry = *(CurLoop->block_begin());
1654 
1655  // step 1: Check if the loop-back branch is in desirable form.
1656  {
1657  if (Value *T = matchCondition(
1658  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1659  DefX2 = dyn_cast<Instruction>(T);
1660  else
1661  return false;
1662  }
1663 
1664  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1665  {
1666  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1667  return false;
1668 
1669  BinaryOperator *SubOneOp;
1670 
1671  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1672  VarX1 = DefX2->getOperand(1);
1673  else {
1674  VarX1 = DefX2->getOperand(0);
1675  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1676  }
1677  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1678  return false;
1679 
1680  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1681  if (!Dec ||
1682  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1683  (SubOneOp->getOpcode() == Instruction::Add &&
1684  Dec->isMinusOne()))) {
1685  return false;
1686  }
1687  }
1688 
1689  // step 3: Check the recurrence of variable X
1690  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1691  if (!PhiX)
1692  return false;
1693 
1694  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1695  {
1696  CountInst = nullptr;
1697  for (Instruction &Inst : llvm::make_range(
1698  LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1699  if (Inst.getOpcode() != Instruction::Add)
1700  continue;
1701 
1702  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1703  if (!Inc || !Inc->isOne())
1704  continue;
1705 
1706  PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1707  if (!Phi)
1708  continue;
1709 
1710  // Check if the result of the instruction is live out of the loop.
1711  bool LiveOutLoop = false;
1712  for (User *U : Inst.users()) {
1713  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1714  LiveOutLoop = true;
1715  break;
1716  }
1717  }
1718 
1719  if (LiveOutLoop) {
1720  CountInst = &Inst;
1721  CountPhi = Phi;
1722  break;
1723  }
1724  }
1725 
1726  if (!CountInst)
1727  return false;
1728  }
1729 
1730  // step 5: Check if the precondition is in this form:
1731  // "if (x != 0) goto loop-head; else goto somewhere-we-don't-care;"
1732  {
1733  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1734  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1735  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1736  return false;
1737 
1738  CntInst = CountInst;
1739  CntPhi = CountPhi;
1740  Var = T;
1741  }
1742 
1743  return true;
1744 }
1745 
1746 /// Return true if the idiom is detected in the loop.
1747 ///
1748 /// Additionally:
1749 /// 1) \p CntInst is set to the instruction counting the leading zeros (CTLZ),
1750 /// or nullptr if there is none.
1751 /// 2) \p CntPhi is set to the corresponding phi node,
1752 /// or nullptr if there is none.
1753 /// 3) \p Var is set to the value whose CTLZ could be used.
1754 /// 4) \p DefX is set to the instruction calculating the loop exit condition.
1755 ///
1756 /// The core idiom we are trying to detect is:
1757 /// \code
1758 /// if (x0 == 0)
1759 /// goto loop-exit // the precondition of the loop
1760 /// cnt0 = init-val;
1761 /// do {
1762 /// x = phi (x0, x.next); //PhiX
1763 /// cnt = phi(cnt0, cnt.next);
1764 ///
1765 /// cnt.next = cnt + 1;
1766 /// ...
1767 /// x.next = x >> 1; // DefX
1768 /// ...
1769 /// } while(x.next != 0);
1770 ///
1771 /// loop-exit:
1772 /// \endcode
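/// At the source level this corresponds roughly to the following sketch
/// (illustrative; the guard mirrors the precondition above):
/// \code
/// unsigned cnt = init_val;
/// if (x != 0) {
///   do {
///     ++cnt;
///     x >>= 1;
///   } while (x != 0);
/// }
/// \endcode
/// For nonzero x the body runs bitwidth(x) - ctlz(x) times, which is what
/// the rewrite below computes without a loop.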
1773 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1774  Intrinsic::ID &IntrinID, Value *&InitX,
1775  Instruction *&CntInst, PHINode *&CntPhi,
1776  Instruction *&DefX) {
1777  BasicBlock *LoopEntry;
1778  Value *VarX = nullptr;
1779 
1780  DefX = nullptr;
1781  CntInst = nullptr;
1782  CntPhi = nullptr;
1783  LoopEntry = *(CurLoop->block_begin());
1784 
1785  // step 1: Check if the loop-back branch is in desirable form.
1786  if (Value *T = matchCondition(
1787  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1788  DefX = dyn_cast<Instruction>(T);
1789  else
1790  return false;
1791 
1792  // step 2: detect instructions corresponding to "x.next = x >> 1" or "x.next = x << 1"
1793  if (!DefX || !DefX->isShift())
1794  return false;
1795  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1796  Intrinsic::ctlz;
1797  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1798  if (!Shft || !Shft->isOne())
1799  return false;
1800  VarX = DefX->getOperand(0);
1801 
1802  // step 3: Check the recurrence of variable X
1803  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1804  if (!PhiX)
1805  return false;
1806 
1807  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1808 
1809  // Make sure the initial value can't be negative otherwise the ashr in the
1810  // loop might never reach zero which would make the loop infinite.
1811  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1812  return false;
1813 
1814  // step 4: Find the instruction that counts the CTLZ: cnt.next = cnt + 1
1815  // or cnt.next = cnt + -1.
1816  // TODO: We can skip this step. If the loop trip count is known (CTLZ),
1817  // then all uses of "cnt.next" could be optimized to the trip count
1818  // plus "cnt0". Currently this is not done.
1819  // This step could be used to detect POPCNT instruction:
1820  // cnt.next = cnt + (x.next & 1)
1821  for (Instruction &Inst : llvm::make_range(
1822  LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1823  if (Inst.getOpcode() != Instruction::Add)
1824  continue;
1825 
1826  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1827  if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1828  continue;
1829 
1830  PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1831  if (!Phi)
1832  continue;
1833 
1834  CntInst = &Inst;
1835  CntPhi = Phi;
1836  break;
1837  }
1838  if (!CntInst)
1839  return false;
1840 
1841  return true;
1842 }
1843 
1844 /// Recognize a CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1845 /// to a countable one (with a CTLZ / CTTZ based trip count). Returns true if
1846 /// CTLZ / CTTZ was inserted as the new trip count; otherwise, returns false.
1847 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1848  // Give up if the loop has multiple blocks or multiple backedges.
1849  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1850  return false;
1851 
1852  Intrinsic::ID IntrinID;
1853  Value *InitX;
1854  Instruction *DefX = nullptr;
1855  PHINode *CntPhi = nullptr;
1856  Instruction *CntInst = nullptr;
1857  // Help decide if the transformation is profitable. For the ShiftUntilZero
1858  // idiom, this is always 6.
1859  size_t IdiomCanonicalSize = 6;
1860 
1861  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1862  CntInst, CntPhi, DefX))
1863  return false;
1864 
1865  bool IsCntPhiUsedOutsideLoop = false;
1866  for (User *U : CntPhi->users())
1867  if (!CurLoop->contains(cast<Instruction>(U))) {
1868  IsCntPhiUsedOutsideLoop = true;
1869  break;
1870  }
1871  bool IsCntInstUsedOutsideLoop = false;
1872  for (User *U : CntInst->users())
1873  if (!CurLoop->contains(cast<Instruction>(U))) {
1874  IsCntInstUsedOutsideLoop = true;
1875  break;
1876  }
1877  // If both CntInst and CntPhi are used outside the loop, the profitability
1878  // is questionable.
1879  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1880  return false;
1881 
1882  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1883  // when X is 0. If we cannot guarantee X != 0, we need to check this
1884  // when expanding.
1885  bool ZeroCheck = false;
1886  // It is safe to assume the preheader exists, as that was checked in the
1887  // parent function runOnLoop.
1888  BasicBlock *PH = CurLoop->getLoopPreheader();
1889 
1890  // If we are using the count instruction outside the loop, make sure we
1891  // have a zero check as a precondition. Without the check, the loop would run
1892  // one iteration before any check of the input value. This means 0 and 1
1893  // would have identical behavior in the original loop, and thus the rewrite would be wrong for a zero input unless the precondition rules it out.
1894  if (!IsCntPhiUsedOutsideLoop) {
1895  auto *PreCondBB = PH->getSinglePredecessor();
1896  if (!PreCondBB)
1897  return false;
1898  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1899  if (!PreCondBI)
1900  return false;
1901  if (matchCondition(PreCondBI, PH) != InitX)
1902  return false;
1903  ZeroCheck = true;
1904  }
1905 
1906  // Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
1907  // profitable if we can delete the loop.
1908 
1909  // The loop has only 6 instructions:
1910  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1911  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1912  // %shr = ashr %n.addr.0, 1
1913  // %tobool = icmp eq %shr, 0
1914  // %inc = add nsw %i.0, 1
1915  // br i1 %tobool
1916 
1917  const Value *Args[] = {InitX,
1918  ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1919 
1920  // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1921  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1922  uint32_t HeaderSize =
1923  std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1924 
1925  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1926  InstructionCost Cost =
1927  TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1928  if (HeaderSize != IdiomCanonicalSize &&
1929  Cost > TargetTransformInfo::TCC_Basic)
1930  return false;
1931 
1932  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1933  DefX->getDebugLoc(), ZeroCheck,
1934  IsCntPhiUsedOutsideLoop);
1935  return true;
1936 }
1937 
1938 /// Recognizes a population count idiom in a non-countable loop.
1939 ///
1940 /// If detected, transforms the relevant code to issue the popcount intrinsic
1941 /// function call, and returns true; otherwise, returns false.
1942 bool LoopIdiomRecognize::recognizePopcount() {
1943  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1944  return false;
1945 
1946  // Counting population is usually done with a few arithmetic instructions.
1947  // Such instructions can be easily "absorbed" by vacant slots in a
1948  // non-compact loop. Therefore, recognizing the popcount idiom only makes
1949  // sense in a compact loop.
1950 
1951  // Give up if the loop has multiple blocks or multiple backedges.
1952  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1953  return false;
1954 
1955  BasicBlock *LoopBody = *(CurLoop->block_begin());
1956  if (LoopBody->size() >= 20) {
1957  // The loop is too big, bail out.
1958  return false;
1959  }
1960 
1961  // It should have a preheader containing nothing but an unconditional branch.
1962  BasicBlock *PH = CurLoop->getLoopPreheader();
1963  if (!PH || &PH->front() != PH->getTerminator())
1964  return false;
1965  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1966  if (!EntryBI || EntryBI->isConditional())
1967  return false;
1968 
1969  // It should have a precondition block where the generated popcount intrinsic
1970  // function can be inserted.
1971  auto *PreCondBB = PH->getSinglePredecessor();
1972  if (!PreCondBB)
1973  return false;
1974  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1975  if (!PreCondBI || PreCondBI->isUnconditional())
1976  return false;
1977 
1978  Instruction *CntInst;
1979  PHINode *CntPhi;
1980  Value *Val;
1981  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1982  return false;
1983 
1984  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1985  return true;
1986 }
1987 
1988 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1989  const DebugLoc &DL) {
1990  Value *Ops[] = {Val};
1991  Type *Tys[] = {Val->getType()};
1992 
1993  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1994  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1995  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1996  CI->setDebugLoc(DL);
1997 
1998  return CI;
1999 }
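// For an i32 value %x, the call created above is simply:
//   %ctpop = call i32 @llvm.ctpop.i32(i32 %x)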
2000 
2001 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2002  const DebugLoc &DL, bool ZeroCheck,
2003  Intrinsic::ID IID) {
2004  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2005  Type *Tys[] = {Val->getType()};
2006 
2007  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
2008  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
2009  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
2010  CI->setDebugLoc(DL);
2011 
2012  return CI;
2013 }
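// For example, with IID == Intrinsic::ctlz, ZeroCheck == true, and an i32
// value %x, the call created above is:
//   %ctlz = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
// The i1 'true' asserts the input is known nonzero; the result would be
// poison for a zero input.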
2014 
2015 /// Transform the following loop (Using CTLZ, CTTZ is similar):
2016 /// loop:
2017 /// CntPhi = PHI [Cnt0, CntInst]
2018 /// PhiX = PHI [InitX, DefX]
2019 /// CntInst = CntPhi + 1
2020 /// DefX = PhiX >> 1
2021 /// LOOP_BODY
2022 /// Br: loop if (DefX != 0)
2023 /// Use(CntPhi) or Use(CntInst)
2024 ///
2025 /// Into:
2026 /// If CntPhi used outside the loop:
2027 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2028 /// Count = CountPrev + 1
2029 /// else
2030 /// Count = BitWidth(InitX) - CTLZ(InitX)
2031 /// loop:
2032 /// CntPhi = PHI [Cnt0, CntInst]
2033 /// PhiX = PHI [InitX, DefX]
2034 /// PhiCount = PHI [Count, Dec]
2035 /// CntInst = CntPhi + 1
2036 /// DefX = PhiX >> 1
2037 /// Dec = PhiCount - 1
2038 /// LOOP_BODY
2039 /// Br: loop if (Dec != 0)
2040 /// Use(CountPrev + Cnt0) // Use(CntPhi)
2041 /// or
2042 /// Use(Count + Cnt0) // Use(CntInst)
2043 ///
2044 /// If LOOP_BODY is empty the loop will be deleted.
2045 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2046 void LoopIdiomRecognize::transformLoopToCountable(
2047  Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2048  PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2049  bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
2050  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2051 
2052  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2053  IRBuilder<> Builder(PreheaderBr);
2054  Builder.SetCurrentDebugLocation(DL);
2055 
2056  // If there are no uses of CntPhi, create:
2057  // Count = BitWidth - CTLZ(InitX);
2058  // NewCount = Count;
2059  // If there are uses of CntPhi create:
2060  // NewCount = BitWidth - CTLZ(InitX >> 1);
2061  // Count = NewCount + 1;
2062  Value *InitXNext;
2063  if (IsCntPhiUsedOutsideLoop) {
2064  if (DefX->getOpcode() == Instruction::AShr)
2065  InitXNext = Builder.CreateAShr(InitX, 1);
2066  else if (DefX->getOpcode() == Instruction::LShr)
2067  InitXNext = Builder.CreateLShr(InitX, 1);
2068  else if (DefX->getOpcode() == Instruction::Shl) // cttz
2069  InitXNext = Builder.CreateShl(InitX, 1);
2070  else
2071  llvm_unreachable("Unexpected opcode!");
2072  } else
2073  InitXNext = InitX;
2074  Value *Count =
2075  createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2076  Type *CountTy = Count->getType();
2077  Count = Builder.CreateSub(
2078  ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2079  Value *NewCount = Count;
2080  if (IsCntPhiUsedOutsideLoop)
2081  Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2082 
2083  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2084 
2085  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2086  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2087  // If the counter was being incremented in the loop, add NewCount to the
2088  // counter's initial value, but only if the initial value is not zero.
2089  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2090  if (!InitConst || !InitConst->isZero())
2091  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2092  } else {
2093  // If the count was being decremented in the loop, subtract NewCount from
2094  // the counter's initial value.
2095  NewCount = Builder.CreateSub(CntInitVal, NewCount);
2096  }
2097 
2098  // Step 2: Insert new IV and loop condition:
2099  // loop:
2100  // ...
2101  // PhiCount = PHI [Count, Dec]
2102  // ...
2103  // Dec = PhiCount - 1
2104  // ...
2105  // Br: loop if (Dec != 0)
2106  BasicBlock *Body = *(CurLoop->block_begin());
2107  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2108  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2109 
2110  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2111 
2112  Builder.SetInsertPoint(LbCond);
2113  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2114  TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2115 
2116  TcPhi->addIncoming(Count, Preheader);
2117  TcPhi->addIncoming(TcDec, Body);
2118 
2119  CmpInst::Predicate Pred =
2120  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2121  LbCond->setPredicate(Pred);
2122  LbCond->setOperand(0, TcDec);
2123  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2124 
2125  // Step 3: All the references to the original counter outside
2126  // the loop are replaced with the NewCount
2127  if (IsCntPhiUsedOutsideLoop)
2128  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2129  else
2130  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2131 
2132  // step 4: Forget the "non-computable" trip-count SCEV associated with the
2133  // loop. The loop would otherwise not be deleted even if it becomes empty.
2134  SE->forgetLoop(CurLoop);
2135 }
2136 
2137 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2138  Instruction *CntInst,
2139  PHINode *CntPhi, Value *Var) {
2140  BasicBlock *PreHead = CurLoop->getLoopPreheader();
2141  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2142  const DebugLoc &DL = CntInst->getDebugLoc();
2143 
2144  // Assuming that before the transformation the loop looks like the following:
2145  // if (x) // the precondition
2146  // do { cnt++; x &= x - 1; } while(x);
2147 
2148  // Step 1: Insert the ctpop instruction at the end of the precondition block
2149  IRBuilder<> Builder(PreCondBr);
2150  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2151  {
2152  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2153  NewCount = PopCntZext =
2154  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2155 
2156  if (NewCount != PopCnt)
2157  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2158 
2159  // TripCnt is exactly the number of iterations the loop has
2160  TripCnt = NewCount;
2161 
2162  // If the population counter's initial value is not zero, insert Add Inst.
2163  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2164  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2165  if (!InitConst || !InitConst->isZero()) {
2166  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2167  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2168  }
2169  }
2170 
2171  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2172  // "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
2173  // call would be partially dead code, and downstream passes would drag
2174  // it back from the precondition block to the preheader.
2175  {
2176  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2177 
2178  Value *Opnd0 = PopCntZext;
2179  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2180  if (PreCond->getOperand(0) != Var)
2181  std::swap(Opnd0, Opnd1);
2182 
2183  ICmpInst *NewPreCond = cast<ICmpInst>(
2184  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2185  PreCondBr->setCondition(NewPreCond);
2186 
2187  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2188  }
2189 
2190  // Step 3: Note that the population count is exactly the trip count of the
2191  // loop in question, which enables us to convert the loop from a
2192  // noncountable loop into a countable one. The benefit is twofold:
2193  //
2194  // - If the loop only counts population, the entire loop becomes dead after
2195  // the transformation. It is a lot easier to prove a countable loop dead
2196  // than to prove a noncountable one. (In some C dialects, an infinite loop
2197  // isn't dead even if it computes nothing useful. In general, DCE needs
2198  // to prove a noncountable loop finite before it can safely delete it.)
2199  //
2200  // - If the loop also performs something else, it remains alive.
2201  // Since it is transformed to countable form, it can be aggressively
2202  // optimized by some optimizations which are in general not applicable
2203  // to a noncountable loop.
2204  //
2205  // After this step, this loop (conceptually) would look like following:
2206  // newcnt = __builtin_ctpop(x);
2207  // t = newcnt;
2208  // if (x)
2209  // do { cnt++; x &= x-1; t--; } while (t > 0);
2210  BasicBlock *Body = *(CurLoop->block_begin());
2211  {
2212  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2213  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2214  Type *Ty = TripCnt->getType();
2215 
2216  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2217 
2218  Builder.SetInsertPoint(LbCond);
2219  Instruction *TcDec = cast<Instruction>(
2220  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2221  "tcdec", false, true));
2222 
2223  TcPhi->addIncoming(TripCnt, PreHead);
2224  TcPhi->addIncoming(TcDec, Body);
2225 
2226  CmpInst::Predicate Pred =
2227  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2228  LbCond->setPredicate(Pred);
2229  LbCond->setOperand(0, TcDec);
2230  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2231  }
2232 
2233  // Step 4: All the references to the original population counter outside
2234  // the loop are replaced with the NewCount -- the value returned from
2235  // __builtin_ctpop().
2236  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2237 
2238  // step 5: Forget the "non-computable" trip-count SCEV associated with the
2239  // loop. The loop would otherwise not be deleted even if it becomes empty.
2240  SE->forgetLoop(CurLoop);
2241 }
2242 
2243 /// Match loop-invariant value.
2244 template <typename SubPattern_t> struct match_LoopInvariant {
2245  SubPattern_t SubPattern;
2246  const Loop *L;
2247 
2248  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2249  : SubPattern(SP), L(L) {}
2250 
2251  template <typename ITy> bool match(ITy *V) {
2252  return L->isLoopInvariant(V) && SubPattern.match(V);
2253  }
2254 };
2255 
2256 /// Matches if the value is loop-invariant.
2257 template <typename Ty>
2258 inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2259  return match_LoopInvariant<Ty>(M, L);
2260 }
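// Example use (a sketch mirroring detectShiftUntilBitTestIdiom below):
// require that a "1 << bitpos" mask be computed outside the loop under scan:
//   match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop))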
2261 
2262 /// Return true if the idiom is detected in the loop.
2263 ///
2264 /// The core idiom we are trying to detect is:
2265 /// \code
2266 /// entry:
2267 /// <...>
2268 /// %bitmask = shl i32 1, %bitpos
2269 /// br label %loop
2270 ///
2271 /// loop:
2272 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2273 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2274 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2275 /// %x.next = shl i32 %x.curr, 1
2276 /// <...>
2277 /// br i1 %x.curr.isbitunset, label %loop, label %end
2278 ///
2279 /// end:
2280 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2281 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2282 /// <...>
2283 /// \endcode
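/// At the source level this is roughly (an illustrative sketch):
/// \code
/// while ((x & (1u << bitpos)) == 0) // loop until the tested bit is set
///   x <<= 1;
/// \endcode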
2284 static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2285  Value *&BitMask, Value *&BitPos,
2286  Value *&CurrX, Instruction *&NextX) {
2287  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2288  " Performing shift-until-bittest idiom detection.\n");
2289 
2290  // Give up if the loop has multiple blocks or multiple backedges.
2291  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2292  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2293  return false;
2294  }
2295 
2296  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2297  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2298  assert(LoopPreheaderBB && "There is always a loop preheader.");
2299 
2300  using namespace PatternMatch;
2301 
2302  // Step 1: Check if the loop backedge is in desirable form.
2303 
2304  ICmpInst::Predicate Pred;
2305  Value *CmpLHS, *CmpRHS;
2306  BasicBlock *TrueBB, *FalseBB;
2307  if (!match(LoopHeaderBB->getTerminator(),
2308  m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2309  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2310  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2311  return false;
2312  }
2313 
2314  // Step 2: Check if the backedge's condition is in desirable form.
2315 
2316  auto MatchVariableBitMask = [&]() {
2317  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2318  match(CmpLHS,
2319  m_c_And(m_Value(CurrX),
2320  m_CombineAnd(
2321  m_Value(BitMask),
2322  m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2323  CurLoop))));
2324  };
2325  auto MatchConstantBitMask = [&]() {
2326  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2327  match(CmpLHS, m_And(m_Value(CurrX),
2328  m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2329  (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2330  };
2331  auto MatchDecomposableConstantBitMask = [&]() {
2332  APInt Mask;
2333  return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2334  ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2335  (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2336  (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2337  };
2338 
2339  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2340  !MatchDecomposableConstantBitMask()) {
2341  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2342  return false;
2343  }
2344 
2345  // Step 3: Check if the recurrence is in desirable form.
2346  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2347  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2348  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2349  return false;
2350  }
2351 
2352  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2353  NextX =
2354  dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2355 
2356  assert(CurLoop->isLoopInvariant(BaseX) &&
2357  "Expected BaseX to be available in the preheader!");
2358 
2359  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2360  // FIXME: support right-shift?
2361  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2362  return false;
2363  }
2364 
2365  // Step 4: Check if the backedge's destinations are in desirable form.
2366 
2367  assert(ICmpInst::isEquality(Pred) &&
2368  "Should only get equality predicates here.");
2369 
2370  // cmp-br is commutative, so canonicalize to a single variant.
2371  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2372  Pred = ICmpInst::getInversePredicate(Pred);
2373  std::swap(TrueBB, FalseBB);
2374  }
2375 
2376  // We expect to exit loop when comparison yields false,
2377  // so when it yields true we should branch back to loop header.
2378  if (TrueBB != LoopHeaderBB) {
2379  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2380  return false;
2381  }
2382 
2383  // Okay, idiom checks out.
2384  return true;
2385 }
2386 
2387 /// Look for the following loop:
2388 /// \code
2389 /// entry:
2390 /// <...>
2391 /// %bitmask = shl i32 1, %bitpos
2392 /// br label %loop
2393 ///
2394 /// loop:
2395 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2396 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2397 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2398 /// %x.next = shl i32 %x.curr, 1
2399 /// <...>
2400 /// br i1 %x.curr.isbitunset, label %loop, label %end
2401 ///
2402 /// end:
2403 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2404 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2405 /// <...>
2406 /// \endcode
2407 ///
2408 /// And transform it into:
2409 /// \code
2410 /// entry:
2411 /// %bitmask = shl i32 1, %bitpos
2412 /// %lowbitmask = add i32 %bitmask, -1
2413 /// %mask = or i32 %lowbitmask, %bitmask
2414 /// %x.masked = and i32 %x, %mask
2415 /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2416 /// i1 true)
2417 /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2418 /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2419 /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2420 /// %tripcount = add i32 %backedgetakencount, 1
2421 /// %x.curr = shl i32 %x, %backedgetakencount
2422 /// %x.next = shl i32 %x, %tripcount
2423 /// br label %loop
2424 ///
2425 /// loop:
2426 /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2427 /// %loop.iv.next = add nuw i32 %loop.iv, 1
2428 /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2429 /// <...>
2430 /// br i1 %loop.ivcheck, label %end, label %loop
2431 ///
2432 /// end:
2433 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2434 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2435 /// <...>
2436 /// \endcode
2437 bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2438  bool MadeChange = false;
2439 
2440  Value *X, *BitMask, *BitPos, *XCurr;
2441  Instruction *XNext;
2442  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2443  XNext)) {
2444  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2445  " shift-until-bittest idiom detection failed.\n");
2446  return MadeChange;
2447  }
2448  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2449 
2450  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2451  // but is it profitable to transform?
2452 
2453  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2454  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2455  assert(LoopPreheaderBB && "There is always a loop preheader.");
2456 
2457  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2458  assert(SuccessorBB && "There is only a single successor.");
2459 
2460  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2461  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2462 
2463  Intrinsic::ID IntrID = Intrinsic::ctlz;
2464  Type *Ty = X->getType();
2465  unsigned Bitwidth = Ty->getScalarSizeInBits();
2466 
2467  TargetTransformInfo::TargetCostKind CostKind =
2468  TargetTransformInfo::TCK_SizeAndLatency;
2469 
2470  // The rewrite is considered to be unprofitable if and only if the
2471  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2472  // making the loop countable, even if nothing else changes.
2473  IntrinsicCostAttributes Attrs(
2474  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2475  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2476  if (Cost > TargetTransformInfo::TCC_Basic) {
2477  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2478  " Intrinsic is too costly, not beneficial\n");
2479  return MadeChange;
2480  }
2481  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2482  TargetTransformInfo::TCC_Basic) {
2483  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2484  return MadeChange;
2485  }
2486 
2487  // Ok, transform appears worthwhile.
2488  MadeChange = true;
2489 
2490  // Step 1: Compute the loop trip count.
2491 
2492  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2493  BitPos->getName() + ".lowbitmask");
2494  Value *Mask =
2495  Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2496  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2497  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2498  IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2499  /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2500  Value *XMaskedNumActiveBits = Builder.CreateSub(
2501  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2502  XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2503  /*HasNSW=*/Bitwidth != 2);
2504  Value *XMaskedLeadingOnePos =
2505  Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2506  XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2507  /*HasNSW=*/Bitwidth > 2);
2508 
2509  Value *LoopBackedgeTakenCount = Builder.CreateSub(
2510  BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2511  /*HasNUW=*/true, /*HasNSW=*/true);
2512  // We know the loop's backedge-taken count, but what's the loop's trip count?
2513  // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2514  Value *LoopTripCount =
2515  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2516  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2517  /*HasNSW=*/Bitwidth != 2);
2518 
2519  // Step 2: Compute the recurrence's final value without a loop.
2520 
2521  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2522  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2523  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2524  NewX->takeName(XCurr);
2525  if (auto *I = dyn_cast<Instruction>(NewX))
2526  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2527 
2528  Value *NewXNext;
2529  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2530  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2531  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2532  // that isn't the case, we'll need to emit an alternative, safe IR.
2533  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2537  Ty->getScalarSizeInBits() - 1))))
2538  NewXNext = Builder.CreateShl(X, LoopTripCount);
2539  else {
2540  // Otherwise, just additionally shift by one. It's the smallest solution;
2541  // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2542  // bitwidth(x) - 1) and select 0 instead.
2543  NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2544  }
2545 
2546  NewXNext->takeName(XNext);
2547  if (auto *I = dyn_cast<Instruction>(NewXNext))
2548  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2549 
2550  // Step 3: Adjust the successor basic block to receive the computed
2551  // recurrence's final value instead of the recurrence itself.
2552 
2553  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2554  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2555 
2556  // Step 4: Rewrite the loop into a countable form, with canonical IV.
2557 
2558  // The new canonical induction variable.
2559  Builder.SetInsertPoint(&LoopHeaderBB->front());
2560  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2561 
2562  // The induction itself.
2563  // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
2564  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2565  auto *IVNext =
2566  Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2567  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2568 
2569  // The loop trip count check.
2570  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2571  CurLoop->getName() + ".ivcheck");
2572  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2573  LoopHeaderBB->getTerminator()->eraseFromParent();
2574 
2575  // Populate the IV PHI.
2576  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2577  IV->addIncoming(IVNext, LoopHeaderBB);
2578 
2579  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2580  // loop. The loop would otherwise not be deleted even if it becomes empty.
2581 
2582  SE->forgetLoop(CurLoop);
2583 
2584  // Other passes will take care of actually deleting the loop if possible.
2585 
2586  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2587 
2588  ++NumShiftUntilBitTest;
2589  return MadeChange;
2590 }
2591 
2592 /// Return true if the idiom is detected in the loop.
2593 ///
2594 /// The core idiom we are trying to detect is:
2595 /// \code
2596 /// entry:
2597 /// <...>
2598 /// %start = <...>
2599 /// %extraoffset = <...>
2600 /// <...>
2601 /// br label %for.cond
2602 ///
2603 /// loop:
2604 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2605 /// %nbits = add nsw i8 %iv, %extraoffset
2606 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2607 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2608 /// %iv.next = add i8 %iv, 1
2609 /// <...>
2610 /// br i1 %val.shifted.iszero, label %end, label %loop
2611 ///
2612 /// end:
2613 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2614 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2615 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2616 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2617 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2618 /// <...>
2619 /// \endcode
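/// At the source level this is roughly (an illustrative sketch):
/// \code
/// for (iv = start; (val >> (iv + extraoffset)) != 0; ++iv)
///   ; // shl instead of shr for the cttz flavor
/// \endcode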
2620 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2621  Instruction *&ValShiftedIsZero,
2622  Intrinsic::ID &IntrinID, Instruction *&IV,
2623  Value *&Start, Value *&Val,
2624  const SCEV *&ExtraOffsetExpr,
2625  bool &InvertedCond) {
2626  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2627  " Performing shift-until-zero idiom detection.\n");
2628 
2629  // Give up if the loop has multiple blocks or multiple backedges.
2630  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2631  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2632  return false;
2633  }
2634 
2635  Instruction *ValShifted, *NBits, *IVNext;
2636  Value *ExtraOffset;
2637 
2638  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2639  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2640  assert(LoopPreheaderBB && "There is always a loop preheader.");
2641 
2642  using namespace PatternMatch;
2643 
2644  // Step 1: Check if the loop backedge's condition is in desirable form.
2645 
2646  ICmpInst::Predicate Pred;
2647  BasicBlock *TrueBB, *FalseBB;
2648  if (!match(LoopHeaderBB->getTerminator(),
2649  m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2650  m_BasicBlock(FalseBB))) ||
2651  !match(ValShiftedIsZero,
2652  m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2653  !ICmpInst::isEquality(Pred)) {
2654  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2655  return false;
2656  }
2657 
2658  // Step 2: Check if the comparison's operand is in desirable form.
2659  // FIXME: Val could be a one-input PHI node, which we should look past.
2660  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2661  m_Instruction(NBits)))) {
2662  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2663  return false;
2664  }
2665  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2666  : Intrinsic::ctlz;
2667 
2668  // Step 3: Check if the shift amount is in desirable form.
2669 
2670  if (match(NBits, m_c_Add(m_Instruction(IV),
2671  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2672  (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2673  ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2674  else if (match(NBits,
2675  m_Sub(m_Instruction(IV),
2676  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2677  NBits->hasNoSignedWrap())
2678  ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2679  else {
2680  IV = NBits;
2681  ExtraOffsetExpr = SE->getZero(NBits->getType());
2682  }
2683 
2684  // Step 4: Check if the recurrence is in desirable form.
2685  auto *IVPN = dyn_cast<PHINode>(IV);
2686  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2687  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2688  return false;
2689  }
2690 
2691  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2692  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2693 
2694  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2695  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2696  return false;
2697  }
2698 
2699  // Step 5: Check if the backedge's destinations are in desirable form.
2700 
2701  assert(ICmpInst::isEquality(Pred) &&
2702  "Should only get equality predicates here.");
2703 
2704  // cmp-br is commutative, so canonicalize to a single variant.
2705  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2706  if (InvertedCond) {
2707  Pred = ICmpInst::getInversePredicate(Pred);
2708  std::swap(TrueBB, FalseBB);
2709  }
2710 
2711  // We expect to exit loop when comparison yields true,
2712  // so when it yields false we should branch back to loop header.
2713  if (FalseBB != LoopHeaderBB) {
2714  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2715  return false;
2716  }
2717 
2718  // The new, countable loop will certainly only run a known number of
2719  // iterations; it won't be infinite. But the old loop might be infinite
2720  // under certain conditions. For logical shifts, the value will become zero
2721  // after at most bitwidth(%Val) loop iterations. However, for an arithmetic
2722  // right-shift, if the sign bit was set, the value will never become zero,
2723  // and the loop may never finish.
2724  if (ValShifted->getOpcode() == Instruction::AShr &&
2725  !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2726  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2727  return false;
2728  }
2729 
2730  // Okay, idiom checks out.
2731  return true;
2732 }
2733 
2734 /// Look for the following loop:
2735 /// \code
2736 /// entry:
2737 /// <...>
2738 /// %start = <...>
2739 /// %extraoffset = <...>
2740 /// <...>
2741 /// br label %for.cond
2742 ///
2743 /// loop:
2744 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2745 /// %nbits = add nsw i8 %iv, %extraoffset
2746 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2747 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2748 /// %iv.next = add i8 %iv, 1
2749 /// <...>
2750 /// br i1 %val.shifted.iszero, label %end, label %loop
2751 ///
2752 /// end:
2753 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2754 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2755 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2756 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2757 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2758 /// <...>
2759 /// \endcode
2760 ///
2761 /// And transform it into:
2762 /// \code
2763 /// entry:
2764 /// <...>
2765 /// %start = <...>
2766 /// %extraoffset = <...>
2767 /// <...>
2768 /// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2769 /// %val.numactivebits = sub i8 8, %val.numleadingzeros
2770 /// %extraoffset.neg = sub i8 0, %extraoffset
2771 /// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2772 /// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2773 /// %loop.tripcount = sub i8 %iv.final, %start
2774 /// br label %loop
2775 ///
2776 /// loop:
2777 /// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2778 /// %loop.iv.next = add i8 %loop.iv, 1
2779 /// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2780 /// %iv = add i8 %loop.iv, %start
2781 /// <...>
2782 /// br i1 %loop.ivcheck, label %end, label %loop
2783 ///
2784 /// end:
2785 /// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2786 /// <...>
2787 /// \endcode
2788 bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2789  bool MadeChange = false;
2790 
2791  Instruction *ValShiftedIsZero;
2792  Intrinsic::ID IntrID;
2793  Instruction *IV;
2794  Value *Start, *Val;
2795  const SCEV *ExtraOffsetExpr;
2796  bool InvertedCond;
2797  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2798  Start, Val, ExtraOffsetExpr, InvertedCond)) {
2799  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2800  " shift-until-zero idiom detection failed.\n");
2801  return MadeChange;
2802  }
2803  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2804 
2805  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2806  // but is it profitable to transform?
2807 
2808  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2809  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2810  assert(LoopPreheaderBB && "There is always a loop preheader.");
2811 
2812  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2813  assert(SuccessorBB && "There is only a single successor.");
2814 
2815  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2816  Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2817 
2818  Type *Ty = Val->getType();
2819  unsigned Bitwidth = Ty->getScalarSizeInBits();
2820 
2821  TargetTransformInfo::TargetCostKind CostKind =
2822  TargetTransformInfo::TCK_SizeAndLatency;
2823 
2824  // The rewrite is considered to be unprofitable if and only if the
2825  // intrinsic we'll use is not cheap. Note that we are okay with *just*
2826  // making the loop countable, even if nothing else changes.
2827  IntrinsicCostAttributes Attrs(
2828  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2829  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2830  if (Cost > TargetTransformInfo::TCC_Basic) {
2831  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2832  " Intrinsic is too costly, not beneficial\n");
2833  return MadeChange;
2834  }
2835 
2836  // Ok, transform appears worthwhile.
2837  MadeChange = true;
2838 
2839  bool OffsetIsZero = false;
2840  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2841  OffsetIsZero = ExtraOffsetExprC->isZero();
2842 
2843  // Step 1: Compute the loop's final IV value / trip count.
2844 
2845  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2846  IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2847  /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2848  Value *ValNumActiveBits = Builder.CreateSub(
2849  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2850  Val->getName() + ".numactivebits", /*HasNUW=*/true,
2851  /*HasNSW=*/Bitwidth != 2);
2852 
2853  SCEVExpander Expander(*SE, *DL, "loop-idiom");
2854  Expander.setInsertPoint(&*Builder.GetInsertPoint());
2855  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2856 
2857  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2858  ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2859  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2860  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2861  {ValNumActiveBitsOffset, Start},
2862  /*FMFSource=*/nullptr, "iv.final");
2863 
2864  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2865  IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2866  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2867  // FIXME: or when the offset was `add nuw`
2868 
2869  // We know loop's backedge-taken count, but what's loop's trip count?
2870  Value *LoopTripCount =
2871  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2872  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2873  /*HasNSW=*/Bitwidth != 2);
2874 
2875  // Step 2: Adjust the successor basic block to receive the original
2876  // induction variable's final value instead of the orig. IV itself.
2877 
2878  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2879 
2880  // Step 3: Rewrite the loop into a countable form, with canonical IV.
2881 
2882  // The new canonical induction variable.
2883  Builder.SetInsertPoint(&LoopHeaderBB->front());
2884  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2885 
2886  // The induction itself.
2887  Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2888  auto *CIVNext =
2889  Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2890  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2891 
2892  // The loop trip count check.
2893  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2894  CurLoop->getName() + ".ivcheck");
2895  auto *NewIVCheck = CIVCheck;
2896  if (InvertedCond) {
2897  NewIVCheck = Builder.CreateNot(CIVCheck);
2898  NewIVCheck->takeName(ValShiftedIsZero);
2899  }
2900 
2901  // The original IV, but rebased to be an offset to the CIV.
2902  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2903  /*HasNSW=*/true); // FIXME: what about NUW?
2904  IVDePHId->takeName(IV);
2905 
2906  // The loop terminator.
2907  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2908  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2909  LoopHeaderBB->getTerminator()->eraseFromParent();
2910 
2911  // Populate the IV PHI.
2912  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2913  CIV->addIncoming(CIVNext, LoopHeaderBB);
2914 
2915  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2916  // loop. The loop would otherwise not be deleted even if it becomes empty.
2917 
2918  SE->forgetLoop(CurLoop);
2919 
2920  // Step 5: Try to cleanup the loop's body somewhat.
2921  IV->replaceAllUsesWith(IVDePHId);
2922  IV->eraseFromParent();
2923 
2924  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2925  ValShiftedIsZero->eraseFromParent();
2926 
2927  // Other passes will take care of actually deleting the loop if possible.
2928 
2929  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2930 
2931  ++NumShiftUntilZero;
2932  return MadeChange;
2933 }
Definition: ScalarEvolutionExpressions.h:370
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:189
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
ScalarEvolutionExpander.h
llvm::MemIntrinsicBase::getDestAlign
MaybeAlign getDestAlign() const
Definition: IntrinsicInst.h:753
llvm::BTF::HeaderSize
@ HeaderSize
Definition: BTF.h:58
Scalar.h
llvm::SCEVAddRecExpr::getStart
const SCEV * getStart() const
Definition: ScalarEvolutionExpressions.h:353
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:444
MemorySSAUpdater.h
llvm::Function
Definition: Function.h:60
llvm::cl::location
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:447
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:546
StringRef.h
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:138
llvm::SCEVExpander
This class uses information about analyze scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:63
llvm::BasicBlock::instructionsWithoutDebug
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
Statistic.h
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:973
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1478
detectPopcountIdiom
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1640
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:168
LoopAccessAnalysis.h
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:729
llvm::IRBuilder<>
MapVector.h
llvm::GlobalVariable
Definition: GlobalVariable.h:39
llvm::SmallDenseMap
Definition: DenseMap.h:882
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:917
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:741
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:135
llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:833
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2266
llvm::initializeLoopIdiomRecognizeLegacyPassPass
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
ValueTracking.h
Local.h
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::getLoopAnalysisUsage
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:144
llvm::SCEVExpander::expandCodeFor
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
Definition: ScalarEvolutionExpander.h:275
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::PatternMatch::m_Br
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
Definition: PatternMatch.h:1725
APInt.h
llvm::AAMDNodes
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:652
ScalarEvolution.h
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1411
llvm::ICmpInst::isEquality
bool isEquality() const
Return true if this predicate is either EQ or NE.
Definition: Instructions.h:1268
Module.h
llvm::AAMDNodes::TBAAStruct
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:672
llvm::createLoopIdiomPass
Pass * createLoopIdiomPass()
Definition: LoopIdiomRecognize.cpp:345
getTripCount
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1055
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:164
llvm::isBytewiseValue
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
Definition: ValueTracking.cpp:3892
llvm::LoopStandardAnalysisResults
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Definition: LoopAnalysisManager.h:51
match_LoopInvariant::match
bool match(ITy *V)
Definition: LoopIdiomRecognize.cpp:2251
llvm::SCEVExpanderCleaner::markResultUsed
void markResultUsed()
Indicate that the result of the expansion is used.
Definition: ScalarEvolutionExpander.h:521
llvm::GlobalValue::UnnamedAddr::Global
@ Global
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1409
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ScalarEvolution::applyLoopGuards
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
Definition: ScalarEvolution.cpp:14376
llvm::MapVector< Value *, StoreList >
llvm::SmallPtrSet< Value *, 16 >
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::DisableLIRP::All
static bool All
When true, the entire pass is disabled.
Definition: LoopIdiomRecognize.h:30
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:749
llvm::ScalarEvolution::isKnownNonNegative
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
Definition: ScalarEvolution.cpp:10329
llvm::ConstantExpr::getExactLogBase2
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2816
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:261
llvm::GlobalValue::setUnnamedAddr
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:217
llvm::ArrayType
Class to represent array types.
Definition: DerivedTypes.h:357
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:260
llvm::ScalarEvolution::getTruncateOrZeroExtend
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
Definition: ScalarEvolution.cpp:4573
llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition: PatternMatch.h:2245
llvm::AAMDNodes::Scope
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:675
llvm::SCEVConstant::getValue
ConstantInt * getValue() const
Definition: ScalarEvolutionExpressions.h:69
llvm::LoopStandardAnalysisResults::DT
DominatorTree & DT
Definition: LoopAnalysisManager.h:54
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:216
llvm::SimpleLoopSafetyInfo
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
llvm::codeview::PointerMode::Pointer
@ Pointer
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:140
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:298
llvm::IRBuilderBase::getInt1
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:436
MustExecute.h
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:201
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:240
llvm::MemSetBase::getValue
Value * getValue() const
Definition: IntrinsicInst.h:849
loop
Analysis the ScalarEvolution expression for r is< loop > Outside the loop
Definition: README.txt:8
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
AliasAnalysis.h
llvm::Optional::hasValue
constexpr bool hasValue() const
Definition: Optional.h:312
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::ScalarEvolution::getMulExpr
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:3050
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
Instruction.h
CommandLine.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1374
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:157
llvm::StoreInst::getValueOperand
Value * getValueOperand()
Definition: Instructions.h:386
GlobalValue.h
llvm::InlinerFunctionImportStatsOpts::No
@ No
llvm::MemorySSAWrapperPass
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:998
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:645
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::AAResults
Definition: AliasAnalysis.h:511
LoopIdiomRecognize.h
llvm::TargetTransformInfo::getPopcntSupport
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
Definition: TargetTransformInfo.cpp:557
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::LocationSize
Definition: MemoryLocation.h:66
llvm::User
Definition: User.h:44
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:194
TargetLibraryInfo.h
llvm::PatternMatch::m_Instruction
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:710
false
Definition: StackSlotColoring.cpp:141
llvm::dwarf::Index
Index
Definition: Dwarf.h:472
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2231
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2836
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:187
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:395
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:392
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1777
llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition: MemoryLocation.h:101
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:928
llvm::isMustProgress
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1117
LoopUtils.h
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoopIdiomRecognize.cpp:107
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4455
DebugLoc.h
llvm::LPPassManager
Definition: LoopPass.h:76
SmallPtrSet.h
llvm::Instruction::getAAMetadata
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1400
m_LoopInvariant
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
Definition: LoopIdiomRecognize.cpp:2258
llvm::BasicBlock::getModule
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:147
MemmoveVerifier::MemmoveVerifier
MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, const DataLayout &DL)
Definition: LoopIdiomRecognize.cpp:1278
Align
uint64_t Align
Definition: ELFObjHandler.cpp:81
PatternMatch.h
llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:209
llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:341
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::MemTransferBase::getSourceAlign
MaybeAlign getSourceAlign() const
Definition: IntrinsicInst.h:814
llvm::PatternMatch::m_Shift
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
Definition: PatternMatch.h:1290
llvm::Loop::getName
StringRef getName() const
Definition: LoopInfo.h:885
llvm::None
const NoneType None
Definition: None.h:24
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::MemSetInst
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
Definition: IntrinsicInst.h:989
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:511
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
llvm::BranchInst::getCondition
Value * getCondition() const
Definition: Instructions.h:3167
llvm::SCEV::isNonConstantNegative
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Definition: ScalarEvolution.cpp:442
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4406
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:192
llvm::PatternMatch::m_Power2
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:538
getStoreStride
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
Definition: LoopIdiomRecognize.cpp:425
UseLIRCodeSizeHeurs
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
llvm::getLoopPassPreservedAnalyses
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Definition: LoopAnalysisManager.cpp:137
Verifier
verify safepoint Safepoint IR Verifier
Definition: SafepointIRVerifier.cpp:251
BasicBlock.h
llvm::cl::opt
Definition: CommandLine.h:1392
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:75
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:531
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:297
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:77
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1173
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:468
uint64_t
llvm::LoopPass
Definition: LoopPass.h:28
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2535
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:620
llvm::LPMUpdater
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Definition: LoopPassManager.h:262
llvm::SmallPtrSetImpl::end
iterator end() const
Definition: SmallPtrSet.h:408
llvm::LoopBase::getUniqueExitBlocks
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:121
createPopcntIntrinsic
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
Definition: LoopIdiomRecognize.cpp:1988
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2801
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
MemoryLocation.h
llvm::MemorySSA
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:714
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:986
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:166
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1087
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:403
ArrayRef.h
llvm::Instruction::setDebugLoc
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:364
llvm::SCEVConstant
This class represents a constant integer value.
Definition: ScalarEvolutionExpressions.h:60
llvm::getOrInsertLibFunc
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
Definition: BuildLibCalls.cpp:1280
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:148
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:985
llvm::Optional::getValue
constexpr const T & getValue() const &
Definition: Optional.h:306
llvm::Value::replaceUsesOutsideBlock
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:581
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
match_LoopInvariant::SubPattern
SubPattern_t SubPattern
Definition: LoopIdiomRecognize.cpp:2245
detectShiftUntilZeroIdiom
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1773
llvm::TargetLibraryInfo::has
bool has(LibFunc F) const
Tests whether a library function is available.
Definition: TargetLibraryInfo.h:328
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:163
llvm::SCEVExpanderCleaner
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
Definition: ScalarEvolutionExpander.h:507
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:638
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::MemorySSAAnalysis
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:948
llvm::SimpleLoopSafetyInfo::computeLoopSafetyInfo
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:51
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
InstructionCost.h
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
MemmoveVerifier
Definition: LoopIdiomRecognize.cpp:1276
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
DisableLIRPMemcpy
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
Definition: MachineInstrBundle.cpp:110
llvm::PatternMatch::m_CombineAnd
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:218
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::MemIntrinsicBase::getDest
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
Definition: IntrinsicInst.h:740
llvm::LoopInfo
Definition: LoopInfo.h:1102
llvm::AAMDNodes::merge
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
Definition: TypeBasedAliasAnalysis.cpp:523
llvm::BinaryOperator
Definition: InstrTypes.h:188
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
DataLayout.h
llvm::PatternMatch::m_SpecificInt_ICMP
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eg/ne/...) to Threshold.
Definition: PatternMatch.h:592
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:137
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:461
LoopPass.h
llvm::SCEVExpander::setInsertPoint
void setInsertPoint(Instruction *IP)
Set the current insertion point.
Definition: ScalarEvolutionExpander.h:345
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::ConstantInt::isZero
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:194
llvm::LoopBase::isOutermost
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:184
llvm::Instruction::getFunction
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:69
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:529
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:845
uint32_t
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::SCEV::FlagNUW
@ FlagNUW
Definition: ScalarEvolution.h:133
llvm::ConstantInt::isMinusOne
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:206
llvm::LoopStandardAnalysisResults::LI
LoopInfo & LI
Definition: LoopAnalysisManager.h:55
llvm::MemoryAccess
Definition: MemorySSA.h:142
idiom
loop idiom
Definition: LoopIdiomRecognize.cpp:342
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:305
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:173
ValueHandle.h
matchCondition
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
Definition: LoopIdiomRecognize.cpp:1579
llvm::SimpleLoopSafetyInfo::anyBlockMayThrow
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:47
mayLoopAccessLocation
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, const SCEV *StoreSizeSCEV, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &IgnoredInsts)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
Definition: LoopIdiomRecognize.cpp:1005
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:664
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:13240
detectShiftUntilBitTestIdiom
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:2284
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:142
llvm::BasicBlock::front
const Instruction & front() const
Definition: BasicBlock.h:309
llvm::LoopIdiomRecognizePass::run
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Definition: LoopIdiomRecognize.cpp:313
llvm::DisableLIRP::Memcpy
static bool Memcpy
When true, Memcpy is disabled.
Definition: LoopIdiomRecognize.h:36
j
return j(j<< 16)
llvm::isLibFuncEmittable
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
Definition: BuildLibCalls.cpp:1355
getNumBytes
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1085
llvm::Instruction::isAtomic
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Definition: Instruction.cpp:616
Constant.h
llvm::ScalarEvolution::forgetLoop
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
Definition: ScalarEvolution.cpp:8031
llvm::SCEVAddRecExpr::getLoop
const Loop * getLoop() const
Definition: ScalarEvolutionExpressions.h:354
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:50
llvm::decomposeBitTestICmp
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
Definition: CmpInstAnalysis.cpp:76
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition: ScalarEvolution.cpp:4523
MemmoveVerifier::loadAndStoreMayFormMemmove
bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride, const Instruction &TheLoad, bool IsMemCpy) const
Definition: LoopIdiomRecognize.cpp:1286
llvm::AAMDNodes::NoAlias
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:678
llvm::MemIntrinsicBase::getLength
Value * getLength() const
Definition: IntrinsicInst.h:731
llvm::LoopStandardAnalysisResults::TTI
TargetTransformInfo & TTI
Definition: LoopAnalysisManager.h:58
getMemSetPatternValue
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
Definition: LoopIdiomRecognize.cpp:436
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::PHINode::Create
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Definition: Instructions.h:2693
llvm::ore::setExtraArgs
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
Definition: OptimizationRemarkEmitter.h:138
llvm::ConstantInt::getBool
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:890
GlobalVariable.h
llvm::IRBuilderBase::GetInsertBlock
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:173
llvm::MemorySSA::BeforeTerminator
@ BeforeTerminator
Definition: MemorySSA.h:802
llvm::TypeSize
Definition: TypeSize.h:435
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:342
Casting.h
idioms
loop Recognize loop idioms
Definition: LoopIdiomRecognize.cpp:343
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
PassManager.h
llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:215
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:222
llvm::SCEV::isOne
bool isOne() const
Return true if the expression is a constant one.
Definition: ScalarEvolution.cpp:430
llvm::ConstantArray::get
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1290
llvm::GetPointerBaseWithConstantOffset
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
Definition: ValueTracking.h:278
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:591
llvm::PrevailingType::Yes
@ Yes
llvm::LoopBase::getNumBackEdges
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:266
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::CmpInst::setPredicate
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:812
isSimple
static bool isSimple(Instruction *I)
Definition: SLPVectorizer.cpp:664
llvm::LoopStandardAnalysisResults::SE
ScalarEvolution & SE
Definition: LoopAnalysisManager.h:56
llvm::LoopStandardAnalysisResults::AA
AAResults & AA
Definition: LoopAnalysisManager.h:52
llvm::AAMDNodes::TBAA
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:669
llvm::SCEVConstant::getAPInt
const APInt & getAPInt() const
Definition: ScalarEvolutionExpressions.h:70
MemmoveVerifier::IsSameObject
const bool IsSameObject
Definition: LoopIdiomRecognize.cpp:1317
AA
ScalarEvolutionExpressions.h
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:690
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
MemorySSA.h
llvm::ScalarEvolution::getZeroExtendExpr
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
Definition: ScalarEvolution.cpp:1594
llvm::GlobalValue::PrivateLinkage
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
Instructions.h
llvm::MemCpyInst
This class wraps the llvm.memcpy intrinsic.
Definition: IntrinsicInst.h:1041
llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:210
llvm::MemIntrinsic::isVolatile
bool isVolatile() const
Definition: IntrinsicInst.h:966
SmallVector.h
match_LoopInvariant::match_LoopInvariant
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
Definition: LoopIdiomRecognize.cpp:2248
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:766
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:367
User.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:389
llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1388
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:139
llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:742
Dominators.h
llvm::BasicBlock::size
size_t size() const
Definition: BasicBlock.h:307
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::CmpInst::getPredicate
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:809
llvm::ScalarEvolution::getZero
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
Definition: ScalarEvolution.h:642
getStartForNegStride
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
Definition: LoopIdiomRecognize.cpp:1040
TargetTransformInfo.h
llvm::LoopStandardAnalysisResults::MSSA
MemorySSA * MSSA
Definition: LoopAnalysisManager.h:61
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
llvm::PHINode
Definition: Instructions.h:2651
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:119
IV
static const uint32_t IV[8]
Definition: blake3_impl.h:85
llvm::SmallVectorImpl< BasicBlock * >
llvm::TargetTransformInfo::getAtomicMemIntrinsicMaxElementSize
unsigned getAtomicMemIntrinsicMaxElementSize() const
Definition: TargetTransformInfo.cpp:986
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
DerivedTypes.h
llvm::SmallPtrSetImpl< Instruction * >
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:392
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2453
createFFSIntrinsic
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
Definition: LoopIdiomRecognize.cpp:2001
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1461
llvm::ConstantInt::isOne
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:200
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::LoadInst::isUnordered
bool isUnordered() const
Definition: Instructions.h:254
DisableLIRPMemset
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
deleteDeadInstruction
static void deleteDeadInstruction(Instruction *I)
Definition: LoopIdiomRecognize.cpp:347
llvm::inferNonMandatoryLibFuncAttrs
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
Definition: BuildLibCalls.cpp:257
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::VerifyMemorySSA
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:91
LLVMContext.h
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:378
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::PatternMatch::m_BasicBlock
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:162
llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueKind Opd1Info=OK_AnyValue, OperandValueKind Opd2Info=OK_AnyValue, OperandValueProperties Opd1PropInfo=OP_None, OperandValueProperties Opd2PropInfo=OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition: TargetTransformInfo.cpp:755
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:405
llvm::GlobalObject::setAlignment
void setAlignment(MaybeAlign Align)
Definition: Globals.cpp:126
llvm::SCEVNAryExpr::getOperand
const SCEV * getOperand(unsigned i) const
Definition: ScalarEvolutionExpressions.h:201
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3086
raw_ostream.h
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7901
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:236
Value.h
getRecurrenceVar
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
Definition: LoopIdiomRecognize.cpp:1607
llvm::LoopStandardAnalysisResults::TLI
TargetLibraryInfo & TLI
Definition: LoopAnalysisManager.h:57
InitializePasses.h
BuildLibCalls.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1105
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2132
llvm::BranchInst::isConditional
bool isConditional() const
Definition: Instructions.h:3165
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2235
llvm::BranchInst::getSuccessor
BasicBlock * getSuccessor(unsigned i) const
Definition: Instructions.h:3179
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:210
llvm::DisableLIRP::Memset
static bool Memset
When true, Memset is disabled.
Definition: LoopIdiomRecognize.h:33
SetVector.h
llvm::AAMDNodes::extendTo
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition: Metadata.h:719
llvm::MemTransferBase::getSource
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Definition: IntrinsicInst.h:800
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38