//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
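//
// For example (an illustration, not part of the original header): a loop like
//
//   for (i = 0; i < n; ++i)
//     p[i] = 0;
//
// is replaced with a single call to memset(p, 0, n) in the loop preheader.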
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;

    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it; just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}


/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
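///
/// For example, a store of the i32 constant 0x01020304 (4 bytes) yields a
/// [4 x i32] ConstantArray of four copies of that value, covering the full
/// 16 bytes that memset_pattern16 consumes.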
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which store
  // integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
      (SizeInBits.getFixedSize() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic stores are not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
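    // For example, an i32 store (4 bytes) with a stride of +4 or -4 covers
    // memory contiguously, while a stride of 8 would leave 4-byte holes and
    // could not become a single memcpy.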
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
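  // For example, a store guarded by a conditional inside the loop body does
  // not dominate the exits, may execute on only some iterations, and is
  // therefore not promoted here.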
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if these stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // Pairing with the immediately succeeding or preceding candidate usually
    // creates the best chance of finding a memset opportunity.
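    // For example, with stores S0..S3 and i == 1, IndexQueue is visited in
    // the order 2, 3, 0.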
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass would not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MaybeAlign(MSI->getDestAlignment()),
                                 SplatValue, MSI, MSIs, Ev, BECount,
                                 IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
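  // For example, a backedge-taken count of 9 with a 4-byte store size refines
  // the access to exactly (9 + 1) * 4 = 40 bytes.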
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] of 100 will always return may alias
  // with a store of &A[100]; we need StoreLoc to be "A" with size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
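//
// For example, for a loop that stores bytes to A[n-1], A[n-2], ..., A[0],
// Start points at A[n-1]; with a backedge-taken count of n-1 and a store size
// of 1, the recomputed base pointer is A[n-1] - (n-1), i.e. A[0].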
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index (the store size is already folded into index)
  return SE->getMinusSCEV(Start, Index);
}

/// Compute trip count from the backedge taken count.
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The # stored bytes is (BECount+1). Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
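  //
  // For example, an i32 BECount equal to UINT32_MAX would wrap to 0 if 1 were
  // added before the zero extension; the BECount != -1 loop-entry guard checked
  // below rules that case out, making the narrow +1 safe.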
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
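///
/// For example, a backedge-taken count of 99 with a 4-byte store size yields
/// (99 + 1) * 4 = 400 bytes.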
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(Start, *SE))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(NumBytesS, *SE))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
                                   MaybeAlign(StoreAlignment));
  } else {
    // Everything is emitted in default address space
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getModule();
    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                                Int8PtrTy, Int8PtrTy, IntIdxTy);
    inferLibFuncAttributes(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

1225 
1226  // Okay, the memset has been formed. Zap the original store and anything that
1227  // feeds into it.
1228  for (auto *I : Stores) {
1229  if (MSSAU)
1230  MSSAU->removeMemoryAccess(I, true);
1232  }
1233  if (MSSAU && VerifyMemorySSA)
1234  MSSAU->getMemorySSA()->verifyMemorySSA();
1235  ++NumMemSet;
1236  ExpCleaner.markResultUsed();
1237  return true;
1238 }

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), LoadOff(0), StoreOff(0),
        BP1(llvm::GetPointerBaseWithConstantOffset(
            LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
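      // For example, with a positive stride and 4-byte accesses, a load at
      // offset 16 paired with a store at offset 0 is accepted (16 >= 0 + 4),
      // while a load at offset 2 would overlap the store and is rejected.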
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff;
  int64_t StoreOff;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; Currently expect constant store size
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For memmove case it's not enough to guarantee that loop doesn't access
    // TheStore and TheLoad. Additionally we need to make sure that TheStore is
    // the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  if (IsMemCpy)
    IgnoredInsts.erase(TheStore);
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    if (!IsMemCpy) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad",
                                        TheLoad)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access load location");
      });
      return Changed;
    }
    // At this point the loop may access the load location only through the
    // memcpy into the same underlying object; if that's not the case, bail out.
    if (!Verifier.IsSameObject)
      return Changed;
  }
1437 
1438  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1439  if (UseMemMove)
1440  if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1441  IsMemCpy))
1442  return Changed;
1443 
1444  if (avoidLIRForMultiBlockLoop())
1445  return Changed;
1446 
1447  // Okay, everything is safe, we can transform this!
1448 
1449  const SCEV *NumBytesS =
1450  getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1451 
1452  Value *NumBytes =
1453  Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1454 
1455  CallInst *NewCall = nullptr;
1456  // Check whether to generate an unordered atomic memcpy:
1457  // If the load or store are atomic, then they must necessarily be unordered
1458  // by previous checks.
1459  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1460  if (UseMemMove)
1461  NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
1462  LoadAlign, NumBytes);
1463  else
1464  NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
1465  LoadAlign, NumBytes);
1466  } else {
1467  // For now don't support unordered atomic memmove.
1468  if (UseMemMove)
1469  return Changed;
1470  // We cannot allow unaligned ops for unordered load/store, so reject
1471  // anything where the alignment isn't at least the element size.
1472  assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
1473  "Expect unordered load/store to have align.");
1474  if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
1475  return Changed;
1476 
1477  // If the element.atomic memcpy is not lowered into explicit
1478  // loads/stores later, then it will be lowered into an element-size
1479  // specific lib call. If the lib call doesn't exist for our store size, then
1480  // we shouldn't generate the memcpy.
1481  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1482  return Changed;
1483 
1484  // Create the call.
1485  // Note that unordered atomic loads/stores are *required* by the spec to
1486  // have an alignment but non-atomic loads/stores may not.
1487  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1488  StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1489  NumBytes, StoreSize);
1490  }
1491  NewCall->setDebugLoc(TheStore->getDebugLoc());
1492 
1493  if (MSSAU) {
1494  MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1495  NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1496  MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1497  }
1498 
1499  LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1500  << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1501  << "\n"
1502  << " from store ptr=" << *StoreEv << " at: " << *TheStore
1503  << "\n");
1504 
1505  ORE.emit([&]() {
1506  return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1507  NewCall->getDebugLoc(), Preheader)
1508  << "Formed a call to "
1509  << ore::NV("NewFunction", NewCall->getCalledFunction())
1510  << "() intrinsic from " << ore::NV("Inst", InstRemark)
1511  << " instruction in " << ore::NV("Function", TheStore->getFunction())
1512  << " function"
1513  << ore::setExtraArgs()
1514  << ore::NV("FromBlock", TheStore->getParent()->getName())
1515  << ore::NV("ToBlock", Preheader->getName());
1516  });
1517 
1518  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1519  // and anything that feeds into it.
1520  if (MSSAU)
1521  MSSAU->removeMemoryAccess(TheStore, true);
1522  deleteDeadInstruction(TheStore);
1523  if (MSSAU && VerifyMemorySSA)
1524  MSSAU->getMemorySSA()->verifyMemorySSA();
1525  if (UseMemMove)
1526  ++NumMemMove;
1527  else
1528  ++NumMemCpy;
1529  ExpCleaner.markResultUsed();
1530  return true;
1531 }
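
// For intuition, the kind of input this rewrite handles is, roughly
// (illustrative C, not lifted from an actual test):
//
//   for (size_t i = 0; i != n; ++i)
//     dst[i] = src[i];
//
// which collapses into a single call:
//
//   memcpy(dst, src, n * sizeof(*dst));  // or memmove, when UseMemMove
//                                        // was selected above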
1532 
1533 // When compiling for code size we avoid idiom recognition for a multi-block
1534 // loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1535 //
1536 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1537  bool IsLoopMemset) {
1538  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1539  if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1540  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1541  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1542  << " avoided: multi-block top-level loop\n");
1543  return true;
1544  }
1545  }
1546 
1547  return false;
1548 }
1549 
1550 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1551  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1552  << CurLoop->getHeader()->getParent()->getName()
1553  << "] Noncountable Loop %"
1554  << CurLoop->getHeader()->getName() << "\n");
1555 
1556  return recognizePopcount() || recognizeAndInsertFFS() ||
1557  recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1558 }
1559 
1560 /// Check if the given conditional branch is based on a comparison between
1561 /// a variable and zero, where control transfers to the loop entry if the
1562 /// variable is non-zero (or zero, when JmpOnZero is true). If the branch
1563 /// matches this behavior, the variable involved in the comparison is
1564 /// returned. This function is called to check whether the precondition and
1565 /// postcondition of the loop are in the desired form.
1566 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1567  bool JmpOnZero = false) {
1568  if (!BI || !BI->isConditional())
1569  return nullptr;
1570 
1571  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1572  if (!Cond)
1573  return nullptr;
1574 
1575  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1576  if (!CmpZero || !CmpZero->isZero())
1577  return nullptr;
1578 
1579  BasicBlock *TrueSucc = BI->getSuccessor(0);
1580  BasicBlock *FalseSucc = BI->getSuccessor(1);
1581  if (JmpOnZero)
1582  std::swap(TrueSucc, FalseSucc);
1583 
1584  ICmpInst::Predicate Pred = Cond->getPredicate();
1585  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1586  (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1587  return Cond->getOperand(0);
1588 
1589  return nullptr;
1590 }
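
// For example, with the default JmpOnZero == false, matchCondition returns
// %x for a branch of this shape (illustrative IR):
//
//   %cmp = icmp ne i32 %x, 0
//   br i1 %cmp, label %loop.entry, label %exit
//
// For the ICMP_EQ form, the loop entry is instead expected on the false edge.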
1591 
1592 // Check if the recurrence variable `VarX` is in the right form to create
1593 // the idiom. Returns the value coerced to a PHINode if so.
1594 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1595  BasicBlock *LoopEntry) {
1596  auto *PhiX = dyn_cast<PHINode>(VarX);
1597  if (PhiX && PhiX->getParent() == LoopEntry &&
1598  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1599  return PhiX;
1600  return nullptr;
1601 }
1602 
1603 /// Return true iff the idiom is detected in the loop.
1604 ///
1605 /// Additionally:
1606 /// 1) \p CntInst is set to the instruction that counts the population bits.
1607 /// 2) \p CntPhi is set to the corresponding phi node.
1608 /// 3) \p Var is set to the value whose population bits are being counted.
1609 ///
1610 /// The core idiom we are trying to detect is:
1611 /// \code
1612 /// if (x0 != 0)
1613 /// goto loop-exit // the precondition of the loop
1614 /// cnt0 = init-val;
1615 /// do {
1616 /// x1 = phi (x0, x2);
1617 /// cnt1 = phi(cnt0, cnt2);
1618 ///
1619 /// cnt2 = cnt1 + 1;
1620 /// ...
1621 /// x2 = x1 & (x1 - 1);
1622 /// ...
1623 /// } while(x2 != 0);
1624 ///
1625 /// loop-exit:
1626 /// \endcode
1627 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1628  Instruction *&CntInst, PHINode *&CntPhi,
1629  Value *&Var) {
1630  // The loop-back branch must match the pattern
1631  // "if (a != 0) goto loop-entry"; this is checked as step 1 below.
1632  BasicBlock *LoopEntry;
1633  Instruction *DefX2, *CountInst;
1634  Value *VarX1, *VarX0;
1635  PHINode *PhiX, *CountPhi;
1636 
1637  DefX2 = CountInst = nullptr;
1638  VarX1 = VarX0 = nullptr;
1639  PhiX = CountPhi = nullptr;
1640  LoopEntry = *(CurLoop->block_begin());
1641 
1642  // step 1: Check if the loop-back branch is in desirable form.
1643  {
1644  if (Value *T = matchCondition(
1645  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1646  DefX2 = dyn_cast<Instruction>(T);
1647  else
1648  return false;
1649  }
1650 
1651  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1652  {
1653  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1654  return false;
1655 
1656  BinaryOperator *SubOneOp;
1657 
1658  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1659  VarX1 = DefX2->getOperand(1);
1660  else {
1661  VarX1 = DefX2->getOperand(0);
1662  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1663  }
1664  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1665  return false;
1666 
1667  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1668  if (!Dec ||
1669  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1670  (SubOneOp->getOpcode() == Instruction::Add &&
1671  Dec->isMinusOne()))) {
1672  return false;
1673  }
1674  }
1675 
1676  // step 3: Check the recurrence of variable X
1677  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1678  if (!PhiX)
1679  return false;
1680 
1681  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1682  {
1683  CountInst = nullptr;
1684  for (Instruction &Inst : llvm::make_range(
1685  LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1686  if (Inst.getOpcode() != Instruction::Add)
1687  continue;
1688 
1689  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1690  if (!Inc || !Inc->isOne())
1691  continue;
1692 
1693  PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1694  if (!Phi)
1695  continue;
1696 
1697  // Check if the result of the instruction is live out of the loop.
1698  bool LiveOutLoop = false;
1699  for (User *U : Inst.users()) {
1700  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1701  LiveOutLoop = true;
1702  break;
1703  }
1704  }
1705 
1706  if (LiveOutLoop) {
1707  CountInst = &Inst;
1708  CountPhi = Phi;
1709  break;
1710  }
1711  }
1712 
1713  if (!CountInst)
1714  return false;
1715  }
1716 
1717  // step 5: check if the precondition is in this form:
1718  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1719  {
1720  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1721  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1722  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1723  return false;
1724 
1725  CntInst = CountInst;
1726  CntPhi = CountPhi;
1727  Var = T;
1728  }
1729 
1730  return true;
1731 }
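
// In C terms, the guarded Kernighan popcount loop matched above is roughly
// (illustrative):
//
//   int cnt = init;
//   if (x)
//     do { cnt++; x &= x - 1; } while (x);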
1732 
1733 /// Return true if the idiom is detected in the loop.
1734 ///
1735 /// Additionally:
1736 /// 1) \p CntInst is set to the instruction counting leading zeros (CTLZ),
1737 /// or nullptr if there is none.
1738 /// 2) \p CntPhi is set to the corresponding phi node,
1739 /// or nullptr if there is none.
1740 /// 3) \p Var is set to the value whose CTLZ could be used.
1741 /// 4) \p DefX is set to the instruction calculating the loop exit condition.
1742 ///
1743 /// The core idiom we are trying to detect is:
1744 /// \code
1745 /// if (x0 == 0)
1746 /// goto loop-exit // the precondition of the loop
1747 /// cnt0 = init-val;
1748 /// do {
1749 /// x = phi (x0, x.next); //PhiX
1750 /// cnt = phi(cnt0, cnt.next);
1751 ///
1752 /// cnt.next = cnt + 1;
1753 /// ...
1754 /// x.next = x >> 1; // DefX
1755 /// ...
1756 /// } while(x.next != 0);
1757 ///
1758 /// loop-exit:
1759 /// \endcode
1760 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1761  Intrinsic::ID &IntrinID, Value *&InitX,
1762  Instruction *&CntInst, PHINode *&CntPhi,
1763  Instruction *&DefX) {
1764  BasicBlock *LoopEntry;
1765  Value *VarX = nullptr;
1766 
1767  DefX = nullptr;
1768  CntInst = nullptr;
1769  CntPhi = nullptr;
1770  LoopEntry = *(CurLoop->block_begin());
1771 
1772  // step 1: Check if the loop-back branch is in desirable form.
1773  if (Value *T = matchCondition(
1774  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1775  DefX = dyn_cast<Instruction>(T);
1776  else
1777  return false;
1778 
1779  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1780  if (!DefX || !DefX->isShift())
1781  return false;
1782  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1783  Intrinsic::ctlz;
1784  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1785  if (!Shft || !Shft->isOne())
1786  return false;
1787  VarX = DefX->getOperand(0);
1788 
1789  // step 3: Check the recurrence of variable X
1790  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1791  if (!PhiX)
1792  return false;
1793 
1794  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1795 
1796  // Make sure the initial value can't be negative; otherwise the ashr in the
1797  // loop might never reach zero, which would make the loop infinite.
1798  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1799  return false;
1800 
1801  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1802  // or cnt.next = cnt + -1.
1803  // TODO: We can skip this step. If the loop trip count is known (CTLZ),
1804  // then all uses of "cnt.next" could be optimized to the trip count
1805  // plus "cnt0". Currently this is not optimized.
1806  // This step could be used to detect POPCNT instruction:
1807  // cnt.next = cnt + (x.next & 1)
1808  for (Instruction &Inst : llvm::make_range(
1809  LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1810  if (Inst.getOpcode() != Instruction::Add)
1811  continue;
1812 
1813  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1814  if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1815  continue;
1816 
1817  PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1818  if (!Phi)
1819  continue;
1820 
1821  CntInst = &Inst;
1822  CntPhi = Phi;
1823  break;
1824  }
1825  if (!CntInst)
1826  return false;
1827 
1828  return true;
1829 }
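
// In C terms, the counting loop matched above is roughly (illustrative; a
// right shift maps to ctlz, a left shift to cttz):
//
//   int cnt = init;
//   do { cnt++; x >>= 1; } while (x != 0);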
1830 
1831 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1832 /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1833 /// trip count returns true; otherwise, returns false.
1834 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1835  // Give up if the loop has multiple blocks or multiple backedges.
1836  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1837  return false;
1838 
1839  Intrinsic::ID IntrinID;
1840  Value *InitX;
1841  Instruction *DefX = nullptr;
1842  PHINode *CntPhi = nullptr;
1843  Instruction *CntInst = nullptr;
1844  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1845  // this is always 6.
1846  size_t IdiomCanonicalSize = 6;
1847 
1848  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1849  CntInst, CntPhi, DefX))
1850  return false;
1851 
1852  bool IsCntPhiUsedOutsideLoop = false;
1853  for (User *U : CntPhi->users())
1854  if (!CurLoop->contains(cast<Instruction>(U))) {
1855  IsCntPhiUsedOutsideLoop = true;
1856  break;
1857  }
1858  bool IsCntInstUsedOutsideLoop = false;
1859  for (User *U : CntInst->users())
1860  if (!CurLoop->contains(cast<Instruction>(U))) {
1861  IsCntInstUsedOutsideLoop = true;
1862  break;
1863  }
1864  // If both CntInst and CntPhi are used outside the loop the profitability
1865  // is questionable.
1866  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1867  return false;
1868 
1869  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1870  // when X is 0. If we cannot guarantee X != 0, we need to check for this
1871  // when expanding.
1872  bool ZeroCheck = false;
1873  // It is safe to assume the preheader exists, as that was checked in the
1874  // parent function runOnLoop.
1875  BasicBlock *PH = CurLoop->getLoopPreheader();
1876 
1877  // If we are using the count instruction outside the loop, make sure we
1878  // have a zero check as a precondition: without it, the loop would run one
1879  // iteration before any check of the input value, so inputs 0 and 1 would
1880  // behave identically, and the CTLZ-based count would be wrong for 0.
1881  if (!IsCntPhiUsedOutsideLoop) {
1882  auto *PreCondBB = PH->getSinglePredecessor();
1883  if (!PreCondBB)
1884  return false;
1885  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1886  if (!PreCondBI)
1887  return false;
1888  if (matchCondition(PreCondBI, PH) != InitX)
1889  return false;
1890  ZeroCheck = true;
1891  }
1892 
1893  // Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
1894  // profitable if we can delete the loop.
1895 
1896  // The loop, in its canonical form, has only 6 instructions:
1897  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1898  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1899  // %shr = ashr %n.addr.0, 1
1900  // %tobool = icmp eq %shr, 0
1901  // %inc = add nsw %i.0, 1
1902  // br i1 %tobool
1903 
1904  const Value *Args[] = {InitX,
1905  ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1906 
1907  // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1908  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1909  uint32_t HeaderSize =
1910  std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1911 
1912  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1913  InstructionCost Cost =
1914  TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1915  if (HeaderSize != IdiomCanonicalSize &&
1916  Cost > TargetTransformInfo::TCC_Basic)
1917  return false;
1918 
1919  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1920  DefX->getDebugLoc(), ZeroCheck,
1921  IsCntPhiUsedOutsideLoop);
1922  return true;
1923 }
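
// Once later passes delete the now-countable loop, the observable effect is
// roughly (illustrative C, for the 32-bit CTLZ case with the zero-check
// precondition):
//
//   if (x != 0)
//     cnt = cnt0 + (32 - __builtin_clz(x));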
1924 
1925 /// Recognizes a population count idiom in a non-countable loop.
1926 ///
1927 /// If detected, transforms the relevant code to issue the popcount intrinsic
1928 /// function call, and returns true; otherwise, returns false.
1929 bool LoopIdiomRecognize::recognizePopcount() {
1930  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1931  return false;
1932 
1933  // Counting population is usually done with a few arithmetic instructions.
1934  // Such instructions can be easily "absorbed" by vacant slots in a
1935  // non-compact loop. Therefore, recognizing the popcount idiom only makes
1936  // sense in a compact loop.
1937 
1938  // Give up if the loop has multiple blocks or multiple backedges.
1939  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1940  return false;
1941 
1942  BasicBlock *LoopBody = *(CurLoop->block_begin());
1943  if (LoopBody->size() >= 20) {
1944  // The loop is too big, bail out.
1945  return false;
1946  }
1947 
1948  // It should have a preheader containing nothing but an unconditional branch.
1949  BasicBlock *PH = CurLoop->getLoopPreheader();
1950  if (!PH || &PH->front() != PH->getTerminator())
1951  return false;
1952  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1953  if (!EntryBI || EntryBI->isConditional())
1954  return false;
1955 
1956  // It should have a precondition block where the generated popcount intrinsic
1957  // function can be inserted.
1958  auto *PreCondBB = PH->getSinglePredecessor();
1959  if (!PreCondBB)
1960  return false;
1961  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1962  if (!PreCondBI || PreCondBI->isUnconditional())
1963  return false;
1964 
1965  Instruction *CntInst;
1966  PHINode *CntPhi;
1967  Value *Val;
1968  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1969  return false;
1970 
1971  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1972  return true;
1973 }
1974 
1975 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1976  const DebugLoc &DL) {
1977  Value *Ops[] = {Val};
1978  Type *Tys[] = {Val->getType()};
1979 
1980  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1981  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1982  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1983  CI->setDebugLoc(DL);
1984 
1985  return CI;
1986 }
1987 
1988 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1989  const DebugLoc &DL, bool ZeroCheck,
1990  Intrinsic::ID IID) {
1991  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1992  Type *Tys[] = {Val->getType()};
1993 
1994  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1995  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1996  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1997  CI->setDebugLoc(DL);
1998 
1999  return CI;
2000 }
2001 
2002 /// Transform the following loop (Using CTLZ, CTTZ is similar):
2003 /// loop:
2004 /// CntPhi = PHI [Cnt0, CntInst]
2005 /// PhiX = PHI [InitX, DefX]
2006 /// CntInst = CntPhi + 1
2007 /// DefX = PhiX >> 1
2008 /// LOOP_BODY
2009 /// Br: loop if (DefX != 0)
2010 /// Use(CntPhi) or Use(CntInst)
2011 ///
2012 /// Into:
2013 /// If CntPhi used outside the loop:
2014 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2015 /// Count = CountPrev + 1
2016 /// else
2017 /// Count = BitWidth(InitX) - CTLZ(InitX)
2018 /// loop:
2019 /// CntPhi = PHI [Cnt0, CntInst]
2020 /// PhiX = PHI [InitX, DefX]
2021 /// PhiCount = PHI [Count, Dec]
2022 /// CntInst = CntPhi + 1
2023 /// DefX = PhiX >> 1
2024 /// Dec = PhiCount - 1
2025 /// LOOP_BODY
2026 /// Br: loop if (Dec != 0)
2027 /// Use(CountPrev + Cnt0) // Use(CntPhi)
2028 /// or
2029 /// Use(Count + Cnt0) // Use(CntInst)
2030 ///
2031 /// If LOOP_BODY is empty the loop will be deleted.
2032 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2033 void LoopIdiomRecognize::transformLoopToCountable(
2034  Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2035  PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2036  bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
2037  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2038 
2039  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2040  IRBuilder<> Builder(PreheaderBr);
2041  Builder.SetCurrentDebugLocation(DL);
2042 
2043  // If there are no uses of CntPhi, create:
2044  // Count = BitWidth - CTLZ(InitX);
2045  // NewCount = Count;
2046  // If there are uses of CntPhi create:
2047  // NewCount = BitWidth - CTLZ(InitX >> 1);
2048  // Count = NewCount + 1;
2049  Value *InitXNext;
2050  if (IsCntPhiUsedOutsideLoop) {
2051  if (DefX->getOpcode() == Instruction::AShr)
2052  InitXNext = Builder.CreateAShr(InitX, 1);
2053  else if (DefX->getOpcode() == Instruction::LShr)
2054  InitXNext = Builder.CreateLShr(InitX, 1);
2055  else if (DefX->getOpcode() == Instruction::Shl) // cttz
2056  InitXNext = Builder.CreateShl(InitX, 1);
2057  else
2058  llvm_unreachable("Unexpected opcode!");
2059  } else
2060  InitXNext = InitX;
2061  Value *Count =
2062  createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2063  Type *CountTy = Count->getType();
2064  Count = Builder.CreateSub(
2065  ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2066  Value *NewCount = Count;
2067  if (IsCntPhiUsedOutsideLoop)
2068  Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2069 
2070  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2071 
2072  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2073  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2074  // If the counter was being incremented in the loop, add NewCount to the
2075  // counter's initial value, but only if the initial value is not zero.
2076  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2077  if (!InitConst || !InitConst->isZero())
2078  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2079  } else {
2080  // If the count was being decremented in the loop, subtract NewCount from
2081  // the counter's initial value.
2082  NewCount = Builder.CreateSub(CntInitVal, NewCount);
2083  }
2084 
2085  // Step 2: Insert new IV and loop condition:
2086  // loop:
2087  // ...
2088  // PhiCount = PHI [Count, Dec]
2089  // ...
2090  // Dec = PhiCount - 1
2091  // ...
2092  // Br: loop if (Dec != 0)
2093  BasicBlock *Body = *(CurLoop->block_begin());
2094  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2095  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2096 
2097  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2098 
2099  Builder.SetInsertPoint(LbCond);
2100  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2101  TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2102 
2103  TcPhi->addIncoming(Count, Preheader);
2104  TcPhi->addIncoming(TcDec, Body);
2105 
2106  CmpInst::Predicate Pred =
2107  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2108  LbCond->setPredicate(Pred);
2109  LbCond->setOperand(0, TcDec);
2110  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2111 
2112  // Step 3: All the references to the original counter outside
2113  // the loop are replaced with the NewCount
2114  if (IsCntPhiUsedOutsideLoop)
2115  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2116  else
2117  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2118 
2119  // step 4: Forget the "non-computable" trip-count SCEV associated with the
2120  // loop. The loop would otherwise not be deleted even if it becomes empty.
2121  SE->forgetLoop(CurLoop);
2122 }
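
// A quick sanity check of the count computation: for a 32-bit InitX of 0x10
// with a logical right shift, CTLZ(InitX) is 27, so Count = 32 - 27 = 5,
// matching the five iterations (0x10 -> 0x8 -> 0x4 -> 0x2 -> 0x1 -> 0x0)
// that the original do-while loop executes.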
2123 
2124 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2125  Instruction *CntInst,
2126  PHINode *CntPhi, Value *Var) {
2127  BasicBlock *PreHead = CurLoop->getLoopPreheader();
2128  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2129  const DebugLoc &DL = CntInst->getDebugLoc();
2130 
2131  // Assuming before transformation, the loop is following:
2132  // if (x) // the precondition
2133  // do { cnt++; x &= x - 1; } while(x);
2134 
2135  // Step 1: Insert the ctpop instruction at the end of the precondition block
2136  IRBuilder<> Builder(PreCondBr);
2137  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2138  {
2139  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2140  NewCount = PopCntZext =
2141  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2142 
2143  if (NewCount != PopCnt)
2144  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2145 
2146  // TripCnt is exactly the number of iterations the loop has
2147  TripCnt = NewCount;
2148 
2149  // If the population counter's initial value is not zero, insert Add Inst.
2150  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2151  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2152  if (!InitConst || !InitConst->isZero()) {
2153  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2154  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2155  }
2156  }
2157 
2158  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2159  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2160  // function would be partial dead code, and downstream passes will drag
2161  // it back from the precondition block to the preheader.
2162  {
2163  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2164 
2165  Value *Opnd0 = PopCntZext;
2166  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2167  if (PreCond->getOperand(0) != Var)
2168  std::swap(Opnd0, Opnd1);
2169 
2170  ICmpInst *NewPreCond = cast<ICmpInst>(
2171  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2172  PreCondBr->setCondition(NewPreCond);
2173 
2174  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2175  }
2176 
2177  // Step 3: Note that the population count is exactly the trip count of the
2178  // loop in question, which enables us to convert the loop from a noncountable
2179  // loop into a countable one. The benefit is twofold:
2180  //
2181  // - If the loop only counts population, the entire loop becomes dead after
2182  // the transformation. It is a lot easier to prove a countable loop dead
2183  // than to prove a noncountable one. (In some C dialects, an infinite loop
2184  // isn't dead even if it computes nothing useful. In general, DCE needs
2185  // to prove a noncountable loop finite before it can safely delete it.)
2186  //
2187  // - If the loop also performs something else, it remains alive.
2188  // Since it is transformed to countable form, it can be aggressively
2189  // optimized by some optimizations which are in general not applicable
2190  // to a noncountable loop.
2191  //
2192  // After this step, this loop (conceptually) would look like following:
2193  // newcnt = __builtin_ctpop(x);
2194  // t = newcnt;
2195  // if (x)
2196  // do { cnt++; x &= x-1; t--; } while (t > 0);
2197  BasicBlock *Body = *(CurLoop->block_begin());
2198  {
2199  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2200  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2201  Type *Ty = TripCnt->getType();
2202 
2203  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2204 
2205  Builder.SetInsertPoint(LbCond);
2206  Instruction *TcDec = cast<Instruction>(
2207  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2208  "tcdec", false, true));
2209 
2210  TcPhi->addIncoming(TripCnt, PreHead);
2211  TcPhi->addIncoming(TcDec, Body);
2212 
2213  CmpInst::Predicate Pred =
2214  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2215  LbCond->setPredicate(Pred);
2216  LbCond->setOperand(0, TcDec);
2217  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2218  }
2219 
2220  // Step 4: All the references to the original population counter outside
2221  // the loop are replaced with the NewCount -- the value returned from
2222  // __builtin_ctpop().
2223  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2224 
2225  // step 5: Forget the "non-computable" trip-count SCEV associated with the
2226  // loop. The loop would otherwise not be deleted even if it becomes empty.
2227  SE->forgetLoop(CurLoop);
2228 }
2229 
2230 /// Match loop-invariant value.
2231 template <typename SubPattern_t> struct match_LoopInvariant {
2232  SubPattern_t SubPattern;
2233  const Loop *L;
2234 
2235  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2236  : SubPattern(SP), L(L) {}
2237 
2238  template <typename ITy> bool match(ITy *V) {
2239  return L->isLoopInvariant(V) && SubPattern.match(V);
2240  }
2241 };
2242 
2243 /// Matches if the value is loop-invariant.
2244 template <typename Ty>
2245 inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2246  return match_LoopInvariant<Ty>(M, L);
2247 }
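
// Example use, as in MatchVariableBitMask below: capture a loop-invariant
// "1 << bitpos" operand while also binding the shift amount:
//
//   match(CmpLHS, m_c_And(m_Value(CurrX),
//                         m_CombineAnd(m_Value(BitMask),
//                                      m_LoopInvariant(m_Shl(m_One(),
//                                                            m_Value(BitPos)),
//                                                      CurLoop))))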
2248 
2249 /// Return true if the idiom is detected in the loop.
2250 ///
2251 /// The core idiom we are trying to detect is:
2252 /// \code
2253 /// entry:
2254 /// <...>
2255 /// %bitmask = shl i32 1, %bitpos
2256 /// br label %loop
2257 ///
2258 /// loop:
2259 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2260 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2261 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2262 /// %x.next = shl i32 %x.curr, 1
2263 /// <...>
2264 /// br i1 %x.curr.isbitunset, label %loop, label %end
2265 ///
2266 /// end:
2267 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2268 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2269 /// <...>
2270 /// \endcode
2271 static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2272  Value *&BitMask, Value *&BitPos,
2273  Value *&CurrX, Instruction *&NextX) {
2274  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2275  " Performing shift-until-bittest idiom detection.\n");
2276 
2277  // Give up if the loop has multiple blocks or multiple backedges.
2278  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2279  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2280  return false;
2281  }
2282 
2283  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2284  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2285  assert(LoopPreheaderBB && "There is always a loop preheader.");
2286 
2287  using namespace PatternMatch;
2288 
2289  // Step 1: Check if the loop backedge is in desirable form.
2290 
2291  ICmpInst::Predicate Pred;
2292  Value *CmpLHS, *CmpRHS;
2293  BasicBlock *TrueBB, *FalseBB;
2294  if (!match(LoopHeaderBB->getTerminator(),
2295  m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2296  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2297  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2298  return false;
2299  }
2300 
2301  // Step 2: Check if the backedge's condition is in desirable form.
2302 
2303  auto MatchVariableBitMask = [&]() {
2304  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2305  match(CmpLHS,
2306  m_c_And(m_Value(CurrX),
2307  m_CombineAnd(
2308  m_Value(BitMask),
2309  m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2310  CurLoop))));
2311  };
2312  auto MatchConstantBitMask = [&]() {
2313  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2314  match(CmpLHS, m_And(m_Value(CurrX),
2315  m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2316  (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2317  };
2318  auto MatchDecomposableConstantBitMask = [&]() {
2319  APInt Mask;
2320  return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2321  ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2322  (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2323  (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2324  };
2325 
2326  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2327  !MatchDecomposableConstantBitMask()) {
2328  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2329  return false;
2330  }
2331 
2332  // Step 3: Check if the recurrence is in desirable form.
2333  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2334  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2335  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2336  return false;
2337  }
2338 
2339  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2340  NextX =
2341  dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2342 
2343  assert(CurLoop->isLoopInvariant(BaseX) &&
2344  "Expected BaseX to be available in the preheader!");
2345 
2346  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2347  // FIXME: support right-shift?
2348  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2349  return false;
2350  }
2351 
2352  // Step 4: Check if the backedge's destinations are in desirable form.
2353 
2354  assert(ICmpInst::isEquality(Pred) &&
2355  "Should only get equality predicates here.");
2356 
2357  // cmp-br is commutative, so canonicalize to a single variant.
2358  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2359  Pred = ICmpInst::getInversePredicate(Pred);
2360  std::swap(TrueBB, FalseBB);
2361  }
2362 
2363  // We expect to exit the loop when the comparison yields false,
2364  // so when it yields true we should branch back to the loop header.
2365  if (TrueBB != LoopHeaderBB) {
2366  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2367  return false;
2368  }
2369 
2370  // Okay, idiom checks out.
2371  return true;
2372 }
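
// In C terms, the loop matched above is roughly (illustrative):
//
//   unsigned mask = 1u << bitpos;
//   while ((x & mask) == 0)
//     x <<= 1;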
2373 
2374 /// Look for the following loop:
2375 /// \code
2376 /// entry:
2377 /// <...>
2378 /// %bitmask = shl i32 1, %bitpos
2379 /// br label %loop
2380 ///
2381 /// loop:
2382 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2383 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2384 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2385 /// %x.next = shl i32 %x.curr, 1
2386 /// <...>
2387 /// br i1 %x.curr.isbitunset, label %loop, label %end
2388 ///
2389 /// end:
2390 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2391 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2392 /// <...>
2393 /// \endcode
2394 ///
2395 /// And transform it into:
2396 /// \code
2397 /// entry:
2398 /// %bitmask = shl i32 1, %bitpos
2399 /// %lowbitmask = add i32 %bitmask, -1
2400 /// %mask = or i32 %lowbitmask, %bitmask
2401 /// %x.masked = and i32 %x, %mask
2402 /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2403 /// i1 true)
2404 /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2405 /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2406 /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2407 /// %tripcount = add i32 %backedgetakencount, 1
2408 /// %x.curr = shl i32 %x, %backedgetakencount
2409 /// %x.next = shl i32 %x, %tripcount
2410 /// br label %loop
2411 ///
2412 /// loop:
2413 /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2414 /// %loop.iv.next = add nuw i32 %loop.iv, 1
2415 /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2416 /// <...>
2417 /// br i1 %loop.ivcheck, label %end, label %loop
2418 ///
2419 /// end:
2420 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2421 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2422 /// <...>
2423 /// \endcode
2424 bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2425  bool MadeChange = false;
2426 
2427  Value *X, *BitMask, *BitPos, *XCurr;
2428  Instruction *XNext;
2429  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2430  XNext)) {
2431  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2432  " shift-until-bittest idiom detection failed.\n");
2433  return MadeChange;
2434  }
2435  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2436 
2437  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2438  // but is it profitable to transform?
2439 
2440  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2441  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2442  assert(LoopPreheaderBB && "There is always a loop preheader.");
2443 
2444  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2445  assert(SuccessorBB && "There is only a single successor.");
2446 
2447  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2448  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2449 
2450  Intrinsic::ID IntrID = Intrinsic::ctlz;
2451  Type *Ty = X->getType();
2452  unsigned Bitwidth = Ty->getScalarSizeInBits();
2453 
2454  TargetTransformInfo::TargetCostKind CostKind =
2455  TargetTransformInfo::TCK_SizeAndLatency;
2456 
2457  // The rewrite is considered to be unprofitable if and only if the
2458  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2459  // making the loop countable, even if nothing else changes.
2460  IntrinsicCostAttributes Attrs(
2461  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2462  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2463  if (Cost > TargetTransformInfo::TCC_Basic) {
2464  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2465  " Intrinsic is too costly, not beneficial\n");
2466  return MadeChange;
2467  }
2468  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2469  TargetTransformInfo::TCC_Basic) {
2470  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2471  return MadeChange;
2472  }
2473 
2474  // Ok, transform appears worthwhile.
2475  MadeChange = true;
2476 
2477  // Step 1: Compute the loop trip count.
2478 
2479  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2480  BitPos->getName() + ".lowbitmask");
2481  Value *Mask =
2482  Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2483  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2484  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2485  IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2486  /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2487  Value *XMaskedNumActiveBits = Builder.CreateSub(
2488  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2489  XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2490  /*HasNSW=*/Bitwidth != 2);
2491  Value *XMaskedLeadingOnePos =
2492  Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2493  XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2494  /*HasNSW=*/Bitwidth > 2);
2495 
2496  Value *LoopBackedgeTakenCount = Builder.CreateSub(
2497  BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2498  /*HasNUW=*/true, /*HasNSW=*/true);
2499  // We know the loop's backedge-taken count, but what's the loop's trip count?
2500  // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2501  Value *LoopTripCount =
2502  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2503  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2504  /*HasNSW=*/Bitwidth != 2);
2505 
2506  // Step 2: Compute the recurrence's final value without a loop.
2507 
2508  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2509  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2510  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2511  NewX->takeName(XCurr);
2512  if (auto *I = dyn_cast<Instruction>(NewX))
2513  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2514 
2515  Value *NewXNext;
2516  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2517  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2518  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2519  // that isn't the case, we'll need to emit an alternative, safe IR.
2520  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2521  PatternMatch::match(
2522  BitPos, PatternMatch::m_SpecificInt_ICMP(
2523  ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2524  Ty->getScalarSizeInBits() - 1))))
2525  NewXNext = Builder.CreateShl(X, LoopTripCount);
2526  else {
2527  // Otherwise, just additionally shift by one. It's the smallest solution;
2528  // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2529  // bitwidth - 1) and select 0 instead.
2530  NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2531  }
2532 
2533  NewXNext->takeName(XNext);
2534  if (auto *I = dyn_cast<Instruction>(NewXNext))
2535  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2536 
2537  // Step 3: Adjust the successor basic block to receive the computed
2538  // recurrence's final value instead of the recurrence itself.
2539 
2540  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2541  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2542 
2543  // Step 4: Rewrite the loop into a countable form, with canonical IV.
2544 
2545  // The new canonical induction variable.
2546  Builder.SetInsertPoint(&LoopHeaderBB->front());
2547  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2548 
2549  // The induction itself.
2550  // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2551  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2552  auto *IVNext =
2553  Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2554  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2555 
2556  // The loop trip count check.
2557  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2558  CurLoop->getName() + ".ivcheck");
2559  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2560  LoopHeaderBB->getTerminator()->eraseFromParent();
2561 
2562  // Populate the IV PHI.
2563  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2564  IV->addIncoming(IVNext, LoopHeaderBB);
2565 
2566  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2567  // loop. The loop would otherwise not be deleted even if it becomes empty.
2568 
2569  SE->forgetLoop(CurLoop);
2570 
2571  // Other passes will take care of actually deleting the loop if possible.
2572 
2573  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2574 
2575  ++NumShiftUntilBitTest;
2576  return MadeChange;
2577 }
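
// Sanity-checking the trip-count math on i32 with %x = 1 and %bitpos = 3:
// %mask = 0b1111, %x.masked = 1, numleadingzeros = 31, numactivebits = 1,
// leadingonepos = 0, backedgetakencount = 3 - 0 = 3, tripcount = 4; the
// original loop indeed executes its header four times (x.curr = 1, 2, 4, 8)
// before bit 3 is set.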
2578 
2579 /// Return true if the idiom is detected in the loop.
2580 ///
2581 /// The core idiom we are trying to detect is:
2582 /// \code
2583 /// entry:
2584 /// <...>
2585 /// %start = <...>
2586 /// %extraoffset = <...>
2587 /// <...>
2588 /// br label %for.cond
2589 ///
2590 /// loop:
2591 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2592 /// %nbits = add nsw i8 %iv, %extraoffset
2593 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2594 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2595 /// %iv.next = add i8 %iv, 1
2596 /// <...>
2597 /// br i1 %val.shifted.iszero, label %end, label %loop
2598 ///
2599 /// end:
2600 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2601 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2602 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2603 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2604 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2605 /// <...>
2606 /// \endcode
2607 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2608  Instruction *&ValShiftedIsZero,
2609  Intrinsic::ID &IntrinID, Instruction *&IV,
2610  Value *&Start, Value *&Val,
2611  const SCEV *&ExtraOffsetExpr,
2612  bool &InvertedCond) {
2613  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2614  " Performing shift-until-zero idiom detection.\n");
2615 
2616  // Give up if the loop has multiple blocks or multiple backedges.
2617  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2618  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2619  return false;
2620  }
2621 
2622  Instruction *ValShifted, *NBits, *IVNext;
2623  Value *ExtraOffset;
2624 
2625  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2626  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2627  assert(LoopPreheaderBB && "There is always a loop preheader.");
2628 
2629  using namespace PatternMatch;
2630 
2631  // Step 1: Check if the loop backedge and condition are in desirable form.
2632 
2633  ICmpInst::Predicate Pred;
2634  BasicBlock *TrueBB, *FalseBB;
2635  if (!match(LoopHeaderBB->getTerminator(),
2636  m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2637  m_BasicBlock(FalseBB))) ||
2638  !match(ValShiftedIsZero,
2639  m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2640  !ICmpInst::isEquality(Pred)) {
2641  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2642  return false;
2643  }
2644 
2645  // Step 2: Check if the comparison's operand is in desirable form.
2646  // FIXME: Val could be a one-input PHI node, which we should look past.
2647  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2648  m_Instruction(NBits)))) {
2649  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
2650  return false;
2651  }
2652  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2653  : Intrinsic::ctlz;
2654 
2655  // Step 3: Check if the shift amount is in desirable form.
2656 
2657  if (match(NBits, m_c_Add(m_Instruction(IV),
2658  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2659  (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2660  ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2661  else if (match(NBits,
2662  m_Sub(m_Instruction(IV),
2663  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2664  NBits->hasNoSignedWrap())
2665  ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2666  else {
2667  IV = NBits;
2668  ExtraOffsetExpr = SE->getZero(NBits->getType());
2669  }
2670 
2671  // Step 4: Check if the recurrence is in desirable form.
2672  auto *IVPN = dyn_cast<PHINode>(IV);
2673  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2674  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2675  return false;
2676  }
2677 
2678  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2679  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2680 
2681  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2682  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2683  return false;
2684  }
2685 
2686  // Step 5: Check if the backedge's destinations are in desirable form.
2687 
2688  assert(ICmpInst::isEquality(Pred) &&
2689  "Should only get equality predicates here.");
2690 
2691  // cmp-br is commutative, so canonicalize to a single variant.
2692  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2693  if (InvertedCond) {
2694  Pred = ICmpInst::getInversePredicate(Pred);
2695  std::swap(TrueBB, FalseBB);
2696  }
2697 
2698  // We expect to exit the loop when the comparison yields true,
2699  // so when it yields false we should branch back to the loop header.
2700  if (FalseBB != LoopHeaderBB) {
2701  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2702  return false;
2703  }
2704 
2705  // The new, countable loop will certainly only run a known number of
2706  // iterations; it won't be infinite. But the old loop might be infinite
2707  // under certain conditions. For logical shifts, the value will become zero
2708  // after at most bitwidth(%Val) loop iterations. However, for an arithmetic
2709  // right-shift, iff the sign bit was set, the value will never become zero,
2710  // and the loop may never finish.
2711  if (ValShifted->getOpcode() == Instruction::AShr &&
2712  !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2713  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2714  return false;
2715  }
2716 
2717  // Okay, idiom checks out.
2718  return true;
2719 }
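
// In C terms, the loop matched above is roughly (illustrative, for the lshr
// case):
//
//   for (iv = start; (val >> (iv + extraoffset)) != 0; ++iv)
//     ;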
2720 
2721 /// Look for the following loop:
2722 /// \code
2723 /// entry:
2724 /// <...>
2725 /// %start = <...>
2726 /// %extraoffset = <...>
2727 /// <...>
2728 /// br label %for.cond
2729 ///
2730 /// loop:
2731 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2732 /// %nbits = add nsw i8 %iv, %extraoffset
2733 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2734 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2735 /// %iv.next = add i8 %iv, 1
2736 /// <...>
2737 /// br i1 %val.shifted.iszero, label %end, label %loop
2738 ///
2739 /// end:
2740 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2741 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2742 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2743 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2744 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2745 /// <...>
2746 /// \endcode
2747 ///
2748 /// And transform it into:
2749 /// \code
2750 /// entry:
2751 /// <...>
2752 /// %start = <...>
2753 /// %extraoffset = <...>
2754 /// <...>
2755 /// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2756 /// %val.numactivebits = sub i8 8, %val.numleadingzeros
2757 /// %extraoffset.neg = sub i8 0, %extraoffset
2758 /// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2759 /// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2760 /// %loop.tripcount = sub i8 %iv.final, %start
2761 /// br label %loop
2762 ///
2763 /// loop:
2764 /// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2765 /// %loop.iv.next = add i8 %loop.iv, 1
2766 /// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2767 /// %iv = add i8 %loop.iv, %start
2768 /// <...>
2769 /// br i1 %loop.ivcheck, label %end, label %loop
2770 ///
2771 /// end:
2772 /// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2773 /// <...>
2774 /// \endcode
2775 bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2776  bool MadeChange = false;
2777 
2778  Instruction *ValShiftedIsZero;
2779  Intrinsic::ID IntrID;
2780  Instruction *IV;
2781  Value *Start, *Val;
2782  const SCEV *ExtraOffsetExpr;
2783  bool InvertedCond;
2784  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2785  Start, Val, ExtraOffsetExpr, InvertedCond)) {
2786  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2787  " shift-until-zero idiom detection failed.\n");
2788  return MadeChange;
2789  }
2790  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2791 
2792  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2793  // but is it profitable to transform?
2794 
2795  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2796  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2797  assert(LoopPreheaderBB && "There is always a loop preheader.");
2798 
2799  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2800  assert(SuccessorBB && "There is only a single successor.");
2801 
2802  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2803  Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2804 
2805  Type *Ty = Val->getType();
2806  unsigned Bitwidth = Ty->getScalarSizeInBits();
2807 
2808  TargetTransformInfo::TargetCostKind CostKind =
2809  TargetTransformInfo::TCK_SizeAndLatency;
2810 
2811  // The rewrite is considered to be unprofitable if and only if the
2812  // intrinsic we'll use is not cheap. Note that we are okay with *just*
2813  // making the loop countable, even if nothing else changes.
2814  IntrinsicCostAttributes Attrs(
2815  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2816  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2817  if (Cost > TargetTransformInfo::TCC_Basic) {
2818  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2819  " Intrinsic is too costly, not beneficial\n");
2820  return MadeChange;
2821  }
2822 
2823  // Ok, transform appears worthwhile.
2824  MadeChange = true;
2825 
2826  bool OffsetIsZero = false;
2827  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2828  OffsetIsZero = ExtraOffsetExprC->isZero();
2829 
2830  // Step 1: Compute the loop's final IV value / trip count.
2831 
2832  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2833  IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2834  /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2835  Value *ValNumActiveBits = Builder.CreateSub(
2836  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2837  Val->getName() + ".numactivebits", /*HasNUW=*/true,
2838  /*HasNSW=*/Bitwidth != 2);
2839 
2840  SCEVExpander Expander(*SE, *DL, "loop-idiom");
2841  Expander.setInsertPoint(&*Builder.GetInsertPoint());
2842  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2843 
2844  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2845  ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2846  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2847  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2848  {ValNumActiveBitsOffset, Start},
2849  /*FMFSource=*/nullptr, "iv.final");
2850 
2851  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2852  IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2853  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2854  // FIXME: or when the offset was `add nuw`
2855 
2856  // We know the loop's backedge-taken count, but what's the loop's trip count?
2857  Value *LoopTripCount =
2858  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2859  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2860  /*HasNSW=*/Bitwidth != 2);
2861 
2862  // Step 2: Adjust the successor basic block to receive the original
2863  // induction variable's final value instead of the original IV itself.
2864 
2865  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2866 
2867  // Step 3: Rewrite the loop into a countable form, with canonical IV.
2868 
2869  // The new canonical induction variable.
2870  Builder.SetInsertPoint(&LoopHeaderBB->front());
2871  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2872 
2873  // The induction itself.
2874  Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2875  auto *CIVNext =
2876  Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2877  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2878 
2879  // The loop trip count check.
2880  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2881  CurLoop->getName() + ".ivcheck");
2882  auto *NewIVCheck = CIVCheck;
2883  if (InvertedCond) {
2884  NewIVCheck = Builder.CreateNot(CIVCheck);
2885  NewIVCheck->takeName(ValShiftedIsZero);
2886  }
2887 
2888  // The original IV, but rebased to be an offset to the CIV.
2889  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2890  /*HasNSW=*/true); // FIXME: what about NUW?
2891  IVDePHId->takeName(IV);
2892 
2893  // The loop terminator.
2894  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2895  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2896  LoopHeaderBB->getTerminator()->eraseFromParent();
2897 
2898  // Populate the IV PHI.
2899  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2900  CIV->addIncoming(CIVNext, LoopHeaderBB);
2901 
2902  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2903  // loop. The loop would otherwise not be deleted even if it becomes empty.
2904 
2905  SE->forgetLoop(CurLoop);
2906 
2907  // Step 5: Try to cleanup the loop's body somewhat.
2908  IV->replaceAllUsesWith(IVDePHId);
2909  IV->eraseFromParent();
2910 
2911  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2912  ValShiftedIsZero->eraseFromParent();
2913 
2914  // Other passes will take care of actually deleting the loop if possible.
2915 
2916  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2917 
2918  ++NumShiftUntilZero;
2919  return MadeChange;
2920 }
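
// Sanity-checking the rewrite on i8 with %val = 0x10, %start = 0 and a zero
// %extraoffset (lshr case): numleadingzeros = 3, numactivebits = 8 - 3 = 5,
// iv.final = smax(5, 0) = 5, backedgetakencount = 5, tripcount = 6,
// matching the six header executions (iv = 0 .. 5) of the original loop.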
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
DisableLIRPAll
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:595
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:523
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12727
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:298
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:730
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:64
llvm::MemIntrinsicBase::getDestAlignment
unsigned getDestAlignment() const
FIXME: Remove this function once transition to Align is over.
Definition: IntrinsicInst.h:712
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:105
llvm::ScalarEvolution::isLoopEntryGuardedByCond
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS.
Definition: ScalarEvolution.cpp:10843
llvm::ScalarEvolution::getNegativeSCEV
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Definition: ScalarEvolution.cpp:4385
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:23
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:742
llvm::make_range
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Definition: iterator_range.h:53
llvm::isKnownNonNegative
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
Definition: ValueTracking.cpp:323
match_LoopInvariant
Match loop-invariant value.
Definition: LoopIdiomRecognize.cpp:2231
llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:378
llvm::LoopBase::getExitBlock
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:82
llvm::WeakTrackingVH
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
match_LoopInvariant::L
const Loop * L
Definition: LoopIdiomRecognize.cpp:2233
CmpInstAnalysis.h
llvm::DataLayout
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:113
llvm::isSafeToExpand
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE, bool CanonicalMode=true)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
Definition: ScalarEvolutionExpander.cpp:2756
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::LocationSize::afterPointer
constexpr static LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:123
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1399
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:721
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::SCEVAddRecExpr::isAffine
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
Definition: ScalarEvolutionExpressions.h:370
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:189
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ScalarEvolutionExpander.h
llvm::MemIntrinsicBase::getDestAlign
MaybeAlign getDestAlign() const
Definition: IntrinsicInst.h:717
Scalar.h
llvm::SCEVAddRecExpr::getStart
const SCEV * getStart() const
Definition: ScalarEvolutionExpressions.h:353
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:425
MemorySSAUpdater.h
llvm::Function
Definition: Function.h:62
llvm::cl::location
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:457
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
StringRef.h
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:122
llvm::SCEVExpander
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:66
llvm::BasicBlock::instructionsWithoutDebug
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:104
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:308
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1175
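A small usage sketch (collectSquares is hypothetical): the first N elements live inline, and push_back beyond that capacity spills to the heap transparently; APIs should take SmallVectorImpl<T>& so callers can pick their own inline size.

#include "llvm/ADT/SmallVector.h"

// Up to 8 elements are stored inline; larger sizes heap-allocate.
llvm::SmallVector<int, 8> collectSquares(int N) {
  llvm::SmallVector<int, 8> Out;
  for (int I = 0; I < N; ++I)
    Out.push_back(I * I);
  return Out;
}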
Statistic.h
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:988
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1479
detectPopcountIdiom
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1627
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:168
LoopAccessAnalysis.h
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:734
llvm::IRBuilder<>
MapVector.h
llvm::GlobalVariable
Definition: GlobalVariable.h:39
llvm::SmallDenseMap
Definition: DenseMap.h:880
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:896
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:743
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:136
llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:835
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2260
llvm::BTF::HeaderSize
@ HeaderSize
Definition: BTF.h:58
llvm::initializeLoopIdiomRecognizeLegacyPassPass
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
ValueTracking.h
Local.h
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::getLoopAnalysisUsage
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:150
llvm::SCEVExpander::expandCodeFor
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
Definition: ScalarEvolutionExpander.h:278
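A hedged sketch of expanding a SCEV to IR (emitSCEV is hypothetical); the caller is expected to have checked isSafeToExpand first:

#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"

// Materialize S as executable IR of type Ty ahead of InsertPt.
llvm::Value *emitSCEV(const llvm::SCEV *S, llvm::Type *Ty,
                      llvm::Instruction *InsertPt, llvm::ScalarEvolution &SE,
                      const llvm::DataLayout &DL) {
  llvm::SCEVExpander Expander(SE, DL, "scev.sketch");
  return Expander.expandCodeFor(S, Ty, InsertPt);
}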
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::PatternMatch::m_Br
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
Definition: PatternMatch.h:1741
APInt.h
ScalarEvolution.h
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1412
llvm::ICmpInst::isEquality
bool isEquality() const
Return true if this predicate is either EQ or NE.
Definition: Instructions.h:1285
Module.h
llvm::createLoopIdiomPass
Pass * createLoopIdiomPass()
Definition: LoopIdiomRecognize.cpp:346
getTripCount
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1057
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::isBytewiseValue
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
Definition: ValueTracking.cpp:3746
llvm::LoopStandardAnalysisResults
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Definition: LoopAnalysisManager.h:52
match_LoopInvariant::match
bool match(ITy *V)
Definition: LoopIdiomRecognize.cpp:2238
llvm::SCEVExpanderCleaner::markResultUsed
void markResultUsed()
Indicate that the result of the expansion is used.
Definition: ScalarEvolutionExpander.h:529
llvm::GlobalValue::UnnamedAddr::Global
@ Global
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1275
llvm::ScalarEvolution::applyLoopGuards
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
Definition: ScalarEvolution.cpp:14060
llvm::MapVector< Value *, StoreList >
llvm::SmallPtrSet< Value *, 16 >
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::DisableLIRP::All
static bool All
When true, the entire pass is disabled.
Definition: LoopIdiomRecognize.h:30
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:751
llvm::ScalarEvolution::isKnownNonNegative
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
Definition: ScalarEvolution.cpp:10151
llvm::ConstantExpr::getExactLogBase2
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2810
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:268
llvm::GlobalValue::setUnnamedAddr
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:212
llvm::ArrayType
Class to represent array types.
Definition: DerivedTypes.h:357
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:272
llvm::ScalarEvolution::getTruncateOrZeroExtend
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
Definition: ScalarEvolution.cpp:4521
llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition: PatternMatch.h:2249
llvm::SCEVConstant::getValue
ConstantInt * getValue() const
Definition: ScalarEvolutionExpressions.h:69
llvm::LoopStandardAnalysisResults::DT
DominatorTree & DT
Definition: LoopAnalysisManager.h:55
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:228
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::SimpleLoopSafetyInfo
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
llvm::codeview::PointerMode::Pointer
@ Pointer
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:144
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:298
llvm::IRBuilderBase::getInt1
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:453
MustExecute.h
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:207
llvm::MemSetBase::getValue
Value * getValue() const
Definition: IntrinsicInst.h:818
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
AliasAnalysis.h
llvm::Optional::hasValue
constexpr bool hasValue() const
Definition: Optional.h:288
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::ScalarEvolution::getMulExpr
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:3037
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:115
Instruction.h
CommandLine.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1355
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::StoreInst::getValueOperand
Value * getValueOperand()
Definition: Instructions.h:403
GlobalValue.h
llvm::InlinerFunctionImportStatsOpts::No
@ No
llvm::MemorySSAWrapperPass
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:980
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:652
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
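A short usage sketch (isIncrement is hypothetical): matchers compose structurally, and m_Value captures sub-operands on success.

#include "llvm/IR/PatternMatch.h"

// Returns true iff V is "add X, 1", binding X on success.
bool isIncrement(llvm::Value *V, llvm::Value *&X) {
  using namespace llvm::PatternMatch;
  return match(V, m_Add(m_Value(X), m_One()));
}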
llvm::AAResults
Definition: AliasAnalysis.h:507
LoopIdiomRecognize.h
llvm::TargetTransformInfo::getPopcntSupport
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
Definition: TargetTransformInfo.cpp:542
llvm::LocationSize
Definition: MemoryLocation.h:65
llvm::User
Definition: User.h:44
Intrinsics.h
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
Definition: InstrTypes.h:1398
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
TargetLibraryInfo.h
llvm::PatternMatch::m_Instruction
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:726
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
Definition: PatternMatch.h:2235
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2842
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:405
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:394
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:191
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1782
llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition: MemoryLocation.h:100
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:932
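Usage sketch (getFortyTwo is hypothetical): the type carries the bit width, so the same call builds i8, i32, or vector splat constants.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"

// Build the 32-bit integer constant 42 in the given context.
llvm::Constant *getFortyTwo(llvm::LLVMContext &Ctx) {
  return llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 42);
}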
llvm::isMustProgress
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1117
LoopUtils.h
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoopIdiomRecognize.cpp:108
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4280
DebugLoc.h
llvm::LPPassManager
Definition: LoopPass.h:75
SmallPtrSet.h
m_LoopInvariant
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
Definition: LoopIdiomRecognize.cpp:2245
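As a hedged illustration (V, X, Amt, and CurLoop are hypothetical names, and the matcher is file-local, so this only compiles inside this translation unit):

// m_LoopInvariant wraps another matcher and additionally requires the
// matched value to be invariant in the given loop. For example, to
// capture "X << Amt" where the shift amount does not vary in CurLoop:
//   Value *X, *Amt;
//   if (match(V, m_Shl(m_Value(X), m_LoopInvariant(m_Value(Amt), CurLoop))))
//     ...; // Amt is known loop-invariant here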
llvm::BasicBlock::getModule
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:148
MemmoveVerifier::MemmoveVerifier
MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, const DataLayout &DL)
Definition: LoopIdiomRecognize.cpp:1269
PatternMatch.h
llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:216
llvm::StoreInst::getAlign
Align getAlign() const
Definition: Instructions.h:358
llvm::MemTransferBase::getSourceAlign
MaybeAlign getSourceAlign() const
Definition: IntrinsicInst.h:783
llvm::PatternMatch::m_Shift
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
Definition: PatternMatch.h:1306
llvm::Loop::getName
StringRef getName() const
Definition: LoopInfo.h:869
llvm::None
const NoneType None
Definition: None.h:23
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
llvm::MemSetInst
This class wraps the llvm.memset intrinsic.
Definition: IntrinsicInst.h:957
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:513
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
llvm::BranchInst::getCondition
Value * getCondition() const
Definition: Instructions.h:3173
llvm::SCEV::isNonConstantNegative
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Definition: ScalarEvolution.cpp:431
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4339
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:176
llvm::PatternMatch::m_Power2
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
getStoreStride
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
Definition: LoopIdiomRecognize.cpp:426
UseLIRCodeSizeHeurs
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
llvm::getLoopPassPreservedAnalyses
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Definition: LoopAnalysisManager.cpp:140
BasicBlock.h
llvm::cl::opt
Definition: CommandLine.h:1432
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:77
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:309
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1190
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:465
llvm::LoopPass
Definition: LoopPass.h:27
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2474
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:578
llvm::LPMUpdater
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Definition: LoopPassManager.h:263
llvm::SmallPtrSetImpl::end
iterator end() const
Definition: SmallPtrSet.h:407
llvm::LoopBase::getUniqueExitBlocks
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:122
createPopcntIntrinsic
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
Definition: LoopIdiomRecognize.cpp:1975
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2807
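A sketch of the usual two-step PHI construction (makeIV and all parameter names are hypothetical): create the node with a reserved operand count, then add one incoming value per predecessor edge.

#include "llvm/IR/Instructions.h"

// Build a 2-input induction PHI: Init flows in from the preheader,
// Next from the latch. Inserted before the first instruction of Header.
llvm::PHINode *makeIV(llvm::Type *Ty, llvm::Value *Init, llvm::Value *Next,
                      llvm::BasicBlock *Preheader, llvm::BasicBlock *Latch,
                      llvm::BasicBlock *Header) {
  llvm::PHINode *PN = llvm::PHINode::Create(Ty, /*NumReservedValues=*/2,
                                            "iv", &Header->front());
  PN->addIncoming(Init, Preheader);
  PN->addIncoming(Next, Latch);
  return PN;
}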
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
MemoryLocation.h
llvm::MemorySSA
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:704
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Definition: LoopInfo.h:970
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:167
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1103
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:402
ArrayRef.h
llvm::Instruction::setDebugLoc
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:367
llvm::SCEVConstant
This class represents a constant integer value.
Definition: ScalarEvolutionExpressions.h:60
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:148
IRBuilder.h
llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1000
llvm::Value::replaceUsesOutsideBlock
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:584
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:852
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
match_LoopInvariant::SubPattern
SubPattern_t SubPattern
Definition: LoopIdiomRecognize.cpp:2232
detectShiftUntilZeroIdiom
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1760
llvm::TargetLibraryInfo::has
bool has(LibFunc F) const
Tests whether a library function is available.
Definition: TargetLibraryInfo.h:325
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:163
llvm::SCEVExpanderCleaner
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
Definition: ScalarEvolutionExpander.h:515
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:640
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
llvm::MemorySSAAnalysis
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:930
llvm::SimpleLoopSafetyInfo::computeLoopSafetyInfo
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body & header for the possibility of may-throw exc...
Definition: MustExecute.cpp:54
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
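Usage sketch (markVisited is hypothetical): insert() returns an (iterator, inserted) pair, so the bool doubles as a "first time seen" test.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instruction.h"

// Returns true exactly once per instruction: on its first visit.
bool markVisited(llvm::SmallPtrSetImpl<llvm::Instruction *> &Visited,
                 llvm::Instruction *I) {
  return Visited.insert(I).second;
}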
InstructionCost.h
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
MemmoveVerifier
Definition: LoopIdiomRecognize.cpp:1267
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
DisableLIRPMemcpy
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
Definition: MachineInstrBundle.cpp:109
llvm::PatternMatch::m_CombineAnd
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:216
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::MemIntrinsicBase::getDest
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
Definition: IntrinsicInst.h:704
llvm::LoopInfo
Definition: LoopInfo.h:1086
llvm::BinaryOperator
Definition: InstrTypes.h:190
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
DataLayout.h
llvm::PatternMatch::m_SpecificInt_ICMP
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:604
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:450
LoopPass.h
llvm::SCEVExpander::setInsertPoint
void setInsertPoint(Instruction *IP)
Set the current insertion point.
Definition: ScalarEvolutionExpander.h:347
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::ConstantInt::isZero
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:194
llvm::LoopBase::isOutermost
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:168
llvm::Instruction::getFunction
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:870
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
llvm::SCEV::FlagNUW
@ FlagNUW
Definition: ScalarEvolution.h:135
llvm::ConstantInt::isMinusOne
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:206
llvm::LoopStandardAnalysisResults::LI
LoopInfo & LI
Definition: LoopAnalysisManager.h:56
llvm::MemoryAccess
Definition: MemorySSA.h:136
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:180
ValueHandle.h
matchCondition
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
Definition: LoopIdiomRecognize.cpp:1566
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:152
llvm::SimpleLoopSafetyInfo::anyBlockMayThrow
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:50
mayLoopAccessLocation
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, const SCEV *StoreSizeSCEV, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &IgnoredInsts)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
Definition: LoopIdiomRecognize.cpp:1007
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:661
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12996
detectShiftUntilBitTestIdiom
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:2271
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:142
llvm::BasicBlock::front
const Instruction & front() const
Definition: BasicBlock.h:308
llvm::LoopIdiomRecognizePass::run
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Definition: LoopIdiomRecognize.cpp:314
llvm::DisableLIRP::Memcpy
static bool Memcpy
When true, Memcpy is disabled.
Definition: LoopIdiomRecognize.h:36
Attributes.h
getNumBytes
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1087
llvm::Instruction::isAtomic
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Definition: Instruction.cpp:614
Constant.h
llvm::ScalarEvolution::forgetLoop
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
Definition: ScalarEvolution.cpp:7850
llvm::SCEVAddRecExpr::getLoop
const Loop * getLoop() const
Definition: ScalarEvolutionExpressions.h:354
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:50
llvm::decomposeBitTestICmp
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
Definition: CmpInstAnalysis.cpp:66
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition: ScalarEvolution.cpp:4471
MemmoveVerifier::loadAndStoreMayFormMemmove
bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride, const Instruction &TheLoad, bool IsMemCpy) const
Definition: LoopIdiomRecognize.cpp:1278
llvm::MemIntrinsicBase::getLength
Value * getLength() const
Definition: IntrinsicInst.h:695
llvm::LoopStandardAnalysisResults::TTI
TargetTransformInfo & TTI
Definition: LoopAnalysisManager.h:59
getMemSetPatternValue
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
Definition: LoopIdiomRecognize.cpp:437
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::PHINode::Create
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Definition: Instructions.h:2699
llvm::ore::setExtraArgs
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
Definition: OptimizationRemarkEmitter.h:138
llvm::ConstantInt::getBool
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:894
GlobalVariable.h
llvm::IRBuilderBase::GetInsertBlock
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:178
llvm::MemorySSA::BeforeTerminator
@ BeforeTerminator
Definition: MemorySSA.h:792
llvm::TypeSize
Definition: TypeSize.h:416
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:342
Casting.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
PassManager.h
llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:215
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::SCEV::isOne
bool isOne() const
Return true if the expression is a constant one.
Definition: ScalarEvolution.cpp:419
llvm::ConstantArray::get
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1295
llvm::GetPointerBaseWithConstantOffset
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
Definition: ValueTracking.h:278
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:579
llvm::PrevailingType::Yes
@ Yes
llvm::LoopBase::getNumBackEdges
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:250
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::CmpInst::setPredicate
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:814
isSimple
static bool isSimple(Instruction *I)
Definition: SLPVectorizer.cpp:593
llvm::LoopStandardAnalysisResults::SE
ScalarEvolution & SE
Definition: LoopAnalysisManager.h:57
llvm::LoopStandardAnalysisResults::AA
AAResults & AA
Definition: LoopAnalysisManager.h:53
llvm::SCEVConstant::getAPInt
const APInt & getAPInt() const
Definition: ScalarEvolutionExpressions.h:70
MemmoveVerifier::IsSameObject
const bool IsSameObject
Definition: LoopIdiomRecognize.cpp:1309
ScalarEvolutionExpressions.h
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:685
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
MemorySSA.h
llvm::ScalarEvolution::getZeroExtendExpr
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
Definition: ScalarEvolution.cpp:1582
llvm::GlobalValue::PrivateLinkage
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
Instructions.h
llvm::MemCpyInst
This class wraps the llvm.memcpy intrinsic.
Definition: IntrinsicInst.h:988
llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:217
llvm::MemIntrinsic::isVolatile
bool isVolatile() const
Definition: IntrinsicInst.h:935
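A hedged classification sketch (getMemcpyOperands is hypothetical), the shape this pass inspects when treating an existing memcpy as a store candidate:

#include "llvm/IR/IntrinsicInst.h"

// If I is a non-volatile llvm.memcpy, extract its operands.
bool getMemcpyOperands(llvm::Instruction *I, llvm::Value *&Dst,
                       llvm::Value *&Src, llvm::Value *&Len) {
  auto *MCI = llvm::dyn_cast<llvm::MemCpyInst>(I);
  if (!MCI || MCI->isVolatile())
    return false;
  Dst = MCI->getDest();
  Src = MCI->getSource();
  Len = MCI->getLength();
  return true;
}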
SmallVector.h
match_LoopInvariant::match_LoopInvariant
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
Definition: LoopIdiomRecognize.cpp:2235
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:780
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:370
User.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:406
llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1404
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:140
llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:744
Dominators.h
llvm::inferLibFuncAttributes
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
Definition: BuildLibCalls.cpp:235
llvm::SmallPtrSetImplBase::empty
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
llvm::BasicBlock::size
size_t size() const
Definition: BasicBlock.h:306
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::CmpInst::getPredicate
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:811
llvm::ScalarEvolution::getZero
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
Definition: ScalarEvolution.h:649
getStartForNegStride
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
Definition: LoopIdiomRecognize.cpp:1042
TargetTransformInfo.h
llvm::LoopStandardAnalysisResults::MSSA
MemorySSA * MSSA
Definition: LoopAnalysisManager.h:62
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
llvm::PHINode
Definition: Instructions.h:2657
llvm::SmallVectorImpl< BasicBlock * >
llvm::TargetTransformInfo::getAtomicMemIntrinsicMaxElementSize
unsigned getAtomicMemIntrinsicMaxElementSize() const
Definition: TargetTransformInfo.cpp:965
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
DerivedTypes.h
llvm::SmallPtrSetImpl< Instruction * >
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:381
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2440
createFFSIntrinsic
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
Definition: LoopIdiomRecognize.cpp:1988
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1478
llvm::ConstantInt::isOne
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:200
llvm::LoadInst::isUnordered
bool isUnordered() const
Definition: Instructions.h:266
DisableLIRPMemset
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
deleteDeadInstruction
static void deleteDeadInstruction(Instruction *I)
Definition: LoopIdiomRecognize.cpp:348
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::VerifyMemorySSA
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:91
LLVMContext.h
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:382
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::PatternMatch::m_BasicBlock
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:160
llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueKind Opd1Info=OK_AnyValue, OperandValueKind Opd2Info=OK_AnyValue, OperandValueProperties Opd1PropInfo=OP_None, OperandValueProperties Opd2PropInfo=OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition: TargetTransformInfo.cpp:733
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:412
llvm::GlobalObject::setAlignment
void setAlignment(MaybeAlign Align)
Definition: Globals.cpp:124
llvm::SCEVNAryExpr::getOperand
const SCEV * getOperand(unsigned i) const
Definition: ScalarEvolutionExpressions.h:201
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3092
raw_ostream.h
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7720
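A usage sketch (hasComputableBTC is hypothetical): a SCEVCouldNotCompute result signals an unanalyzable trip count, which makes the idiom transforms bail out.

#include "llvm/Analysis/ScalarEvolution.h"

// True iff SCEV can describe how often L's backedge is taken.
bool hasComputableBTC(llvm::ScalarEvolution &SE, const llvm::Loop *L) {
  const llvm::SCEV *BECount = SE.getBackedgeTakenCount(L);
  return !llvm::isa<llvm::SCEVCouldNotCompute>(BECount);
}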
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:388
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:236
Value.h
getRecurrenceVar
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
Definition: LoopIdiomRecognize.cpp:1594
llvm::LoopStandardAnalysisResults::TLI
TargetLibraryInfo & TLI
Definition: LoopAnalysisManager.h:58
InitializePasses.h
BuildLibCalls.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1121
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Definition: AliasAnalysis.cpp:218
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2128
llvm::BranchInst::isConditional
bool isConditional() const
Definition: Instructions.h:3171
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2252
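A hedged sketch pairing this with Intrinsic::getDeclaration (emitCtpop is hypothetical), the same recipe createPopcntIntrinsic above follows:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

// Emit "llvm.ctpop" of Val's type at the builder's insertion point.
llvm::CallInst *emitCtpop(llvm::IRBuilder<> &B, llvm::Value *Val) {
  llvm::Module *M = B.GetInsertBlock()->getModule();
  llvm::Function *F = llvm::Intrinsic::getDeclaration(
      M, llvm::Intrinsic::ctpop, {Val->getType()});
  return B.CreateCall(F, {Val});
}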
llvm::Optional::getValue
constexpr const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:282
llvm::BranchInst::getSuccessor
BasicBlock * getSuccessor(unsigned i) const
Definition: Instructions.h:3185
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:209
llvm::DisableLIRP::Memset
static bool Memset
When true, Memset is disabled.
Definition: LoopIdiomRecognize.h:33
SetVector.h
llvm::MemTransferBase::getSource
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Definition: IntrinsicInst.h:769
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38