//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
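// A canonical example (a sketch of the intent; the pass actually reasons
// about strided stores via SCEV) is a loop that zeroes out an array:
//
//   for (unsigned i = 0; i != n; ++i)
//     p[i] = 0;
//
// which, when the target provides memset, becomes a single call:
//
//   memset(p, 0, n * sizeof(*p));
//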
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

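// As a usage sketch (the flag names are derived from DEBUG_TYPE above): to
// run the pass but keep memset formation disabled, one would pass
//
//   opt -passes=loop-idiom -disable-loop-idiom-memset ...
//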
namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;

    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
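///
/// For example (a sketch of the intent, not a specially handled case): a
/// strided store of the 4-byte constant i32 0x01020304 yields a 16-byte
/// pattern consisting of that constant replicated four times.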
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // stores integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
      (SizeInBits.getFixedSize() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then
    // we know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the
    // current loop, which indicates a strided load. If we have something
    // else, it's a random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
    if (!DT->dominates(BB, ExitBlocks[i]))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
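  // For instance (a sketch of the shape involved), an unrolled-by-two zeroing
  // loop storing p[2*i] = 0 and p[2*i+1] = 0 forms one consecutive chain that
  // is emitted as a single wider memset.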
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;

    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    const SCEV *StoreSizeSCEV = SE->getConstant(BECount->getType(), StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlignment()),
                                StoredVal, HeadStore, AdjacentStores, StoreEv,
                                BECount, IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
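/// For example (a sketch), a loop copying four bytes per iteration,
///   for (i) memcpy(&A[4*i], &B[4*i], 4);
/// has a stride equal to the copy size, so every byte of the range is
/// touched and the whole loop can be folded into a single larger memcpy.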
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
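/// For example (a sketch), a loop performing
///   for (i) memset(&A[S*i], 0, S);
/// strides by exactly S bytes each iteration, so the whole range can be
/// covered by a single memset emitted in the preheader.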
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // TODO: folding can be done on the SCEVs. The idea is to fold
      // expressions that are covered by the loop guard at loop entry.
      // After the folding, compare again and proceed with the optimization
      // if they are equal.
      LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
      return false;
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MaybeAlign(MSI->getDestAlignment()),
                                 SplatValue, MSI, MSIs, Ev, BECount,
                                 IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access
  // is strided positively through memory, we say that the modified location
  // starts at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] of 100 will always return may
  // alias with a store of &A[100]; we need StoreLoc to be "A" with size of
  // 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
       ++BI)
    for (Instruction &I : **BI)
      if (IgnoredInsts.count(&I) == 0 &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory
// location we're trying to memset. Therefore, we need to recompute the base
// pointer, which is just Start - BECount * Size.
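// For example (a sketch), a loop that walks p[n-1], p[n-2], ..., p[0] with a
// one-byte element has Start == &p[n-1] and BECount == n-1, so the recomputed
// base pointer is &p[0].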
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index * store size
  return SE->getMinusSCEV(Start, Index);
}

/// Compute trip count from the backedge taken count.
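///
/// For a loop running i = 0, 1, ..., 99, the backedge is taken 99 times but
/// 100 iterations execute, so the trip count is BECount + 1.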
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The # stored bytes is (BECount+1). Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do
/// so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander, *DT);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(Start, *SE))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(NumBytesS, *SE))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
                                   MaybeAlign(StoreAlignment));
  } else {
    // Everything is emitted in the default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getModule();
    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                                Int8PtrTy, Int8PtrTy, IntIdxTy);
    inferLibFuncAttributes(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to
    // be a constant array of 16 bytes. Plop the value into a mergeable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same
/// stride this may be transformable into a memcpy. This kicks in for stuff
/// like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander, *DT);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool UseMemMove =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (UseMemMove) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  if (IsMemCpy)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }
  if (UseMemMove) {
    // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
    // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
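    // E.g. (a sketch) for a positive-stride copy p[i] = p[i+1], LoadBasePtr
    // starts one element past StoreBasePtr, which this check accepts; if the
    // load base started at or before the store base, we would give up.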
1380  int64_t LoadOff = 0, StoreOff = 0;
1382  LoadBasePtr->stripPointerCasts(), LoadOff, *DL);
1384  StoreBasePtr->stripPointerCasts(), StoreOff, *DL);
1385  int64_t LoadSize =
1386  DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
1387  if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
1388  return Changed;
1389  if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
1390  (IsNegStride && LoadOff + LoadSize > StoreOff))
1391  return Changed;
1392  }
1393 
1394  if (avoidLIRForMultiBlockLoop())
1395  return Changed;
1396 
1397  // Okay, everything is safe, we can transform this!
1398 
1399  const SCEV *NumBytesS =
1400  getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1401 
1402  Value *NumBytes =
1403  Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1404 
1405  CallInst *NewCall = nullptr;
1406  // Check whether to generate an unordered atomic memcpy:
1407  // If the load or store are atomic, then they must necessarily be unordered
1408  // by previous checks.
1409  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1410  if (UseMemMove)
1411  NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
1412  LoadAlign, NumBytes);
1413  else
1414  NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
1415  LoadAlign, NumBytes);
1416  } else {
1417  // For now don't support unordered atomic memmove.
1418  if (UseMemMove)
1419  return Changed;
1420  // We cannot allow unaligned ops for unordered load/store, so reject
1421  // anything where the alignment isn't at least the element size.
1422  assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
1423  "Expect unordered load/store to have align.");
1424  if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
1425  return Changed;
1426 
1427  // If the element.atomic memcpy is not lowered into explicit
1428  // loads/stores later, then it will be lowered into an element-size
1429  // specific lib call. If the lib call doesn't exist for our store size, then
1430  // we shouldn't generate the memcpy.
1431  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1432  return Changed;
1433 
1434  // Create the call.
1435  // Note that unordered atomic loads/stores are *required* by the spec to
1436  // have an alignment but non-atomic loads/stores may not.
1437  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1438  StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1439  NumBytes, StoreSize);
1440  }
1441  NewCall->setDebugLoc(TheStore->getDebugLoc());
1442 
1443  if (MSSAU) {
1444  MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1445  NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1446  MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1447  }
1448 
1449  LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1450  << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1451  << "\n"
1452  << " from store ptr=" << *StoreEv << " at: " << *TheStore
1453  << "\n");
1454 
1455  ORE.emit([&]() {
1456  return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1457  NewCall->getDebugLoc(), Preheader)
1458  << "Formed a call to "
1459  << ore::NV("NewFunction", NewCall->getCalledFunction())
1460  << "() intrinsic from " << ore::NV("Inst", InstRemark)
1461  << " instruction in " << ore::NV("Function", TheStore->getFunction())
1462  << " function"
1463  << ore::setExtraArgs()
1464  << ore::NV("FromBlock", TheStore->getParent()->getName())
1465  << ore::NV("ToBlock", Preheader->getName());
1466  });
1467 
1468  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1469  // and anything that feeds into it.
1470  if (MSSAU)
1471  MSSAU->removeMemoryAccess(TheStore, true);
1472  deleteDeadInstruction(TheStore);
1473  if (MSSAU && VerifyMemorySSA)
1474  MSSAU->getMemorySSA()->verifyMemorySSA();
1475  if (UseMemMove)
1476  ++NumMemMove;
1477  else
1478  ++NumMemCpy;
1479  ExpCleaner.markResultUsed();
1480  return true;
1481 }
1482 
1483 // When compiling for code size we avoid idiom recognition for a multi-block loop
1484 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1485 //
1486 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1487  bool IsLoopMemset) {
1488  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1489  if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1490  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1491  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1492  << " avoided: multi-block top-level loop\n");
1493  return true;
1494  }
1495  }
1496 
1497  return false;
1498 }
1499 
1500 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1501  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1502  << CurLoop->getHeader()->getParent()->getName()
1503  << "] Noncountable Loop %"
1504  << CurLoop->getHeader()->getName() << "\n");
1505 
1506  return recognizePopcount() || recognizeAndInsertFFS() ||
1507  recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1508 }
1509 
1510 /// Check if the given conditional branch is based on a comparison between
1511 /// a variable and zero, and if the variable is non-zero (or zero, when
1512 /// JmpOnZero is true), control branches to the loop entry. If the branch
1513 /// matches this behavior, the variable involved in the comparison is
1514 /// returned. This function is called to check whether the precondition and
1515 /// postcondition of the loop are in desirable form.
1516 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1517  bool JmpOnZero = false) {
1518  if (!BI || !BI->isConditional())
1519  return nullptr;
1520 
1521  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1522  if (!Cond)
1523  return nullptr;
1524 
1525  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1526  if (!CmpZero || !CmpZero->isZero())
1527  return nullptr;
1528 
1529  BasicBlock *TrueSucc = BI->getSuccessor(0);
1530  BasicBlock *FalseSucc = BI->getSuccessor(1);
1531  if (JmpOnZero)
1532  std::swap(TrueSucc, FalseSucc);
1533 
1534  ICmpInst::Predicate Pred = Cond->getPredicate();
1535  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1536  (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1537  return Cond->getOperand(0);
1538 
1539  return nullptr;
1540 }
1541 
1542 // Check if the recurrence variable `VarX` is in the right form to create
1543 // the idiom. Returns the value coerced to a PHINode if so.
1544 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1545  BasicBlock *LoopEntry) {
1546  auto *PhiX = dyn_cast<PHINode>(VarX);
1547  if (PhiX && PhiX->getParent() == LoopEntry &&
1548  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1549  return PhiX;
1550  return nullptr;
1551 }
1552 
1553 /// Return true iff the idiom is detected in the loop.
1554 ///
1555 /// Additionally:
1556 /// 1) \p CntInst is set to the instruction counting the population bit.
1557 /// 2) \p CntPhi is set to the corresponding phi node.
1558 /// 3) \p Var is set to the value whose population bits are being counted.
1559 ///
1560 /// The core idiom we are trying to detect is:
1561 /// \code
1562 /// if (x0 == 0)
1563 /// goto loop-exit // the precondition of the loop
1564 /// cnt0 = init-val;
1565 /// do {
1566 /// x1 = phi (x0, x2);
1567 /// cnt1 = phi(cnt0, cnt2);
1568 ///
1569 /// cnt2 = cnt1 + 1;
1570 /// ...
1571 /// x2 = x1 & (x1 - 1);
1572 /// ...
1573 /// } while(x2 != 0);
1574 ///
1575 /// loop-exit:
1576 /// \endcode
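///
/// For illustration only (a plausible C source, not taken from this file;
/// assumes loop rotation has already produced the guarded do-while form
/// shown above), a function like this lowers to the idiom:
/// \code
///   int popcount(unsigned x) {
///     int cnt = 0;
///     while (x != 0) {
///       cnt++;
///       x &= x - 1; // clear the lowest set bit
///     }
///     return cnt;
///   }
/// \endcode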
1577 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1578  Instruction *&CntInst, PHINode *&CntPhi,
1579  Value *&Var) {
1580  // step 1: Check to see if the loop-back branch matches this pattern:
1581  // "if (a!=0) goto loop-entry".
1582  BasicBlock *LoopEntry;
1583  Instruction *DefX2, *CountInst;
1584  Value *VarX1, *VarX0;
1585  PHINode *PhiX, *CountPhi;
1586 
1587  DefX2 = CountInst = nullptr;
1588  VarX1 = VarX0 = nullptr;
1589  PhiX = CountPhi = nullptr;
1590  LoopEntry = *(CurLoop->block_begin());
1591 
1592  // step 1: Check if the loop-back branch is in desirable form.
1593  {
1594  if (Value *T = matchCondition(
1595  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1596  DefX2 = dyn_cast<Instruction>(T);
1597  else
1598  return false;
1599  }
1600 
1601  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1602  {
1603  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1604  return false;
1605 
1606  BinaryOperator *SubOneOp;
1607 
1608  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1609  VarX1 = DefX2->getOperand(1);
1610  else {
1611  VarX1 = DefX2->getOperand(0);
1612  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1613  }
1614  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1615  return false;
1616 
1617  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1618  if (!Dec ||
1619  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1620  (SubOneOp->getOpcode() == Instruction::Add &&
1621  Dec->isMinusOne()))) {
1622  return false;
1623  }
1624  }
1625 
1626  // step 3: Check the recurrence of variable X
1627  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1628  if (!PhiX)
1629  return false;
1630 
1631  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1632  {
1633  CountInst = nullptr;
1634  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1635  IterE = LoopEntry->end();
1636  Iter != IterE; Iter++) {
1637  Instruction *Inst = &*Iter;
1638  if (Inst->getOpcode() != Instruction::Add)
1639  continue;
1640 
1641  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1642  if (!Inc || !Inc->isOne())
1643  continue;
1644 
1645  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1646  if (!Phi)
1647  continue;
1648 
1649  // Check if the result of the instruction is live out of the loop.
1650  bool LiveOutLoop = false;
1651  for (User *U : Inst->users()) {
1652  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1653  LiveOutLoop = true;
1654  break;
1655  }
1656  }
1657 
1658  if (LiveOutLoop) {
1659  CountInst = Inst;
1660  CountPhi = Phi;
1661  break;
1662  }
1663  }
1664 
1665  if (!CountInst)
1666  return false;
1667  }
1668 
1669  // step 5: check if the precondition is in this form:
1670  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1671  {
1672  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1673  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1674  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1675  return false;
1676 
1677  CntInst = CountInst;
1678  CntPhi = CountPhi;
1679  Var = T;
1680  }
1681 
1682  return true;
1683 }
1684 
1685 /// Return true if the idiom is detected in the loop.
1686 ///
1687 /// Additionally:
1688 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1689 /// or nullptr if there is no such.
1690 /// 2) \p CntPhi is set to the corresponding phi node
1691 /// or nullptr if there is no such.
1692 /// 3) \p Var is set to the value whose CTLZ could be used.
1693 /// 4) \p DefX is set to the instruction calculating Loop exit condition.
1694 ///
1695 /// The core idiom we are trying to detect is:
1696 /// \code
1697 /// if (x0 == 0)
1698 /// goto loop-exit // the precondition of the loop
1699 /// cnt0 = init-val;
1700 /// do {
1701 /// x = phi (x0, x.next); //PhiX
1702 /// cnt = phi(cnt0, cnt.next);
1703 ///
1704 /// cnt.next = cnt + 1;
1705 /// ...
1706 /// x.next = x >> 1; // DefX
1707 /// ...
1708 /// } while(x.next != 0);
1709 ///
1710 /// loop-exit:
1711 /// \endcode
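///
/// For illustration only (a plausible C source, not taken from this file;
/// assumes loop rotation has already produced the guarded do-while form
/// shown above), a function like this lowers to the idiom:
/// \code
///   int significant_bits(unsigned x) { // == 32 - ctlz(x) for x != 0
///     int n = 0;
///     if (x == 0)
///       return 0;
///     do {
///       n++;
///       x >>= 1;
///     } while (x != 0);
///     return n;
///   }
/// \endcode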
1712 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1713  Intrinsic::ID &IntrinID, Value *&InitX,
1714  Instruction *&CntInst, PHINode *&CntPhi,
1715  Instruction *&DefX) {
1716  BasicBlock *LoopEntry;
1717  Value *VarX = nullptr;
1718 
1719  DefX = nullptr;
1720  CntInst = nullptr;
1721  CntPhi = nullptr;
1722  LoopEntry = *(CurLoop->block_begin());
1723 
1724  // step 1: Check if the loop-back branch is in desirable form.
1725  if (Value *T = matchCondition(
1726  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1727  DefX = dyn_cast<Instruction>(T);
1728  else
1729  return false;
1730 
1731  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1732  if (!DefX || !DefX->isShift())
1733  return false;
1734  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1735  Intrinsic::ctlz;
1736  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1737  if (!Shft || !Shft->isOne())
1738  return false;
1739  VarX = DefX->getOperand(0);
1740 
1741  // step 3: Check the recurrence of variable X
1742  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1743  if (!PhiX)
1744  return false;
1745 
1746  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1747 
1748  // Make sure the initial value can't be negative otherwise the ashr in the
1749  // loop might never reach zero which would make the loop infinite.
1750  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1751  return false;
1752 
1753  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1754  // or cnt.next = cnt + -1.
1755  // TODO: We can skip the step. If loop trip count is known (CTLZ),
1756  // then all uses of "cnt.next" could be optimized to the trip count
1757  // plus "cnt0". Currently it is not optimized.
1758  // This step could be used to detect POPCNT instruction:
1759  // cnt.next = cnt + (x.next & 1)
1760  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1761  IterE = LoopEntry->end();
1762  Iter != IterE; Iter++) {
1763  Instruction *Inst = &*Iter;
1764  if (Inst->getOpcode() != Instruction::Add)
1765  continue;
1766 
1767  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1768  if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1769  continue;
1770 
1771  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1772  if (!Phi)
1773  continue;
1774 
1775  CntInst = Inst;
1776  CntPhi = Phi;
1777  break;
1778  }
1779  if (!CntInst)
1780  return false;
1781 
1782  return true;
1783 }
1784 
1785 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1786 /// to countable (with a CTLZ / CTTZ trip count). If a CTLZ / CTTZ intrinsic is
1787 /// inserted to compute the new trip count, returns true; otherwise, false.
1788 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1789  // Give up if the loop has multiple blocks or multiple backedges.
1790  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1791  return false;
1792 
1793  Intrinsic::ID IntrinID;
1794  Value *InitX;
1795  Instruction *DefX = nullptr;
1796  PHINode *CntPhi = nullptr;
1797  Instruction *CntInst = nullptr;
1798  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1799  // this is always 6.
1800  size_t IdiomCanonicalSize = 6;
1801 
1802  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1803  CntInst, CntPhi, DefX))
1804  return false;
1805 
1806  bool IsCntPhiUsedOutsideLoop = false;
1807  for (User *U : CntPhi->users())
1808  if (!CurLoop->contains(cast<Instruction>(U))) {
1809  IsCntPhiUsedOutsideLoop = true;
1810  break;
1811  }
1812  bool IsCntInstUsedOutsideLoop = false;
1813  for (User *U : CntInst->users())
1814  if (!CurLoop->contains(cast<Instruction>(U))) {
1815  IsCntInstUsedOutsideLoop = true;
1816  break;
1817  }
1818  // If both CntInst and CntPhi are used outside the loop the profitability
1819  // is questionable.
1820  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1821  return false;
1822 
1823  // For some CPUs, the result of the CTLZ(X) intrinsic is undefined
1824  // when X is 0. If we cannot guarantee X != 0, we need to check for this
1825  // when expanding the intrinsic.
1826  bool ZeroCheck = false;
1827  // It is safe to assume the Preheader exists, as it was checked in the
1828  // parent function runOnLoop.
1829  BasicBlock *PH = CurLoop->getLoopPreheader();
1830 
1831  // If we are using the count instruction outside the loop, make sure we
1832  // have a zero check as a precondition. Without the check the loop would run
1833  // one iteration before any check of the input value; inputs 0 and 1 would
1834  // then behave identically, so the transformed code would be wrong for 0.
1835  if (!IsCntPhiUsedOutsideLoop) {
1836  auto *PreCondBB = PH->getSinglePredecessor();
1837  if (!PreCondBB)
1838  return false;
1839  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1840  if (!PreCondBI)
1841  return false;
1842  if (matchCondition(PreCondBI, PH) != InitX)
1843  return false;
1844  ZeroCheck = true;
1845  }
1846 
1847  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1848  // profitable if we delete the loop.
1849 
1850  // The loop has only 6 instructions:
1851  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1852  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1853  // %shr = ashr %n.addr.0, 1
1854  // %tobool = icmp eq %shr, 0
1855  // %inc = add nsw %i.0, 1
1856  // br i1 %tobool
1857 
1858  const Value *Args[] = {InitX,
1859  ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1860 
1861  // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1862  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1863  uint32_t HeaderSize =
1864  std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1865 
1866  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1867  InstructionCost Cost =
1868  TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1869  if (HeaderSize != IdiomCanonicalSize &&
1870  Cost > TargetTransformInfo::TCC_Basic)
1871  return false;
1872 
1873  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1874  DefX->getDebugLoc(), ZeroCheck,
1875  IsCntPhiUsedOutsideLoop);
1876  return true;
1877 }
1878 
1879 /// Recognizes a population count idiom in a non-countable loop.
1880 ///
1881 /// If detected, transforms the relevant code to issue the popcount intrinsic
1882 /// function call, and returns true; otherwise, returns false.
1883 bool LoopIdiomRecognize::recognizePopcount() {
1884  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1885  return false;
1886 
1887  // Counting population is usually done with a few arithmetic instructions.
1888  // Such instructions can be easily "absorbed" by vacant slots in a
1889  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1890  // in a compact loop.
1891 
1892  // Give up if the loop has multiple blocks or multiple backedges.
1893  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1894  return false;
1895 
1896  BasicBlock *LoopBody = *(CurLoop->block_begin());
1897  if (LoopBody->size() >= 20) {
1898  // The loop is too big, bail out.
1899  return false;
1900  }
1901 
1902  // It should have a preheader containing nothing but an unconditional branch.
1903  BasicBlock *PH = CurLoop->getLoopPreheader();
1904  if (!PH || &PH->front() != PH->getTerminator())
1905  return false;
1906  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1907  if (!EntryBI || EntryBI->isConditional())
1908  return false;
1909 
1910  // It should have a precondition block where the generated popcount intrinsic
1911  // function can be inserted.
1912  auto *PreCondBB = PH->getSinglePredecessor();
1913  if (!PreCondBB)
1914  return false;
1915  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1916  if (!PreCondBI || PreCondBI->isUnconditional())
1917  return false;
1918 
1919  Instruction *CntInst;
1920  PHINode *CntPhi;
1921  Value *Val;
1922  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1923  return false;
1924 
1925  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1926  return true;
1927 }
1928 
1929 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1930  const DebugLoc &DL) {
1931  Value *Ops[] = {Val};
1932  Type *Tys[] = {Val->getType()};
1933 
1934  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1935  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1936  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1937  CI->setDebugLoc(DL);
1938 
1939  return CI;
1940 }
1941 
1942 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1943  const DebugLoc &DL, bool ZeroCheck,
1944  Intrinsic::ID IID) {
1945  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1946  Type *Tys[] = {Val->getType()};
1947 
1948  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1949  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1950  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1951  CI->setDebugLoc(DL);
1952 
1953  return CI;
1954 }
1955 
1956 /// Transform the following loop (Using CTLZ, CTTZ is similar):
1957 /// loop:
1958 /// CntPhi = PHI [Cnt0, CntInst]
1959 /// PhiX = PHI [InitX, DefX]
1960 /// CntInst = CntPhi + 1
1961 /// DefX = PhiX >> 1
1962 /// LOOP_BODY
1963 /// Br: loop if (DefX != 0)
1964 /// Use(CntPhi) or Use(CntInst)
1965 ///
1966 /// Into:
1967 /// If CntPhi used outside the loop:
1968 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1969 /// Count = CountPrev + 1
1970 /// else
1971 /// Count = BitWidth(InitX) - CTLZ(InitX)
1972 /// loop:
1973 /// CntPhi = PHI [Cnt0, CntInst]
1974 /// PhiX = PHI [InitX, DefX]
1975 /// PhiCount = PHI [Count, Dec]
1976 /// CntInst = CntPhi + 1
1977 /// DefX = PhiX >> 1
1978 /// Dec = PhiCount - 1
1979 /// LOOP_BODY
1980 /// Br: loop if (Dec != 0)
1981 /// Use(CountPrev + Cnt0) // Use(CntPhi)
1982 /// or
1983 /// Use(Count + Cnt0) // Use(CntInst)
1984 ///
1985 /// If LOOP_BODY is empty the loop will be deleted.
1986 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
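///
/// A hand-worked sanity check (illustrative, not taken from the source): for
/// i32 InitX = 16 (0b10000) with DefX = PhiX >> 1, the original loop takes
/// 5 iterations (16 -> 8 -> 4 -> 2 -> 1 -> 0), and the computed value agrees:
/// Count = BitWidth(InitX) - CTLZ(InitX) = 32 - 27 = 5.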
1987 void LoopIdiomRecognize::transformLoopToCountable(
1988  Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1989  PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1990  bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1991  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1992 
1993  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1994  IRBuilder<> Builder(PreheaderBr);
1995  Builder.SetCurrentDebugLocation(DL);
1996 
1997  // If there are no uses of CntPhi create:
1998  // Count = BitWidth - CTLZ(InitX);
1999  // NewCount = Count;
2000  // If there are uses of CntPhi create:
2001  // NewCount = BitWidth - CTLZ(InitX >> 1);
2002  // Count = NewCount + 1;
2003  Value *InitXNext;
2004  if (IsCntPhiUsedOutsideLoop) {
2005  if (DefX->getOpcode() == Instruction::AShr)
2006  InitXNext = Builder.CreateAShr(InitX, 1);
2007  else if (DefX->getOpcode() == Instruction::LShr)
2008  InitXNext = Builder.CreateLShr(InitX, 1);
2009  else if (DefX->getOpcode() == Instruction::Shl) // cttz
2010  InitXNext = Builder.CreateShl(InitX, 1);
2011  else
2012  llvm_unreachable("Unexpected opcode!");
2013  } else
2014  InitXNext = InitX;
2015  Value *Count =
2016  createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2017  Type *CountTy = Count->getType();
2018  Count = Builder.CreateSub(
2019  ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2020  Value *NewCount = Count;
2021  if (IsCntPhiUsedOutsideLoop)
2022  Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2023 
2024  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2025 
2026  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2027  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2028  // If the counter was being incremented in the loop, add NewCount to the
2029  // counter's initial value, but only if the initial value is not zero.
2030  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2031  if (!InitConst || !InitConst->isZero())
2032  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2033  } else {
2034  // If the count was being decremented in the loop, subtract NewCount from
2035  // the counter's initial value.
2036  NewCount = Builder.CreateSub(CntInitVal, NewCount);
2037  }
2038 
2039  // Step 2: Insert new IV and loop condition:
2040  // loop:
2041  // ...
2042  // PhiCount = PHI [Count, Dec]
2043  // ...
2044  // Dec = PhiCount - 1
2045  // ...
2046  // Br: loop if (Dec != 0)
2047  BasicBlock *Body = *(CurLoop->block_begin());
2048  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2049  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2050 
2051  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2052 
2053  Builder.SetInsertPoint(LbCond);
2054  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2055  TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2056 
2057  TcPhi->addIncoming(Count, Preheader);
2058  TcPhi->addIncoming(TcDec, Body);
2059 
2060  CmpInst::Predicate Pred =
2061  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2062  LbCond->setPredicate(Pred);
2063  LbCond->setOperand(0, TcDec);
2064  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2065 
2066  // Step 3: All the references to the original counter outside
2067  // the loop are replaced with the NewCount
2068  if (IsCntPhiUsedOutsideLoop)
2069  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2070  else
2071  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2072 
2073  // step 4: Forget the "non-computable" trip-count SCEV associated with the
2074  // loop. The loop would otherwise not be deleted even if it becomes empty.
2075  SE->forgetLoop(CurLoop);
2076 }
2077 
2078 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2079  Instruction *CntInst,
2080  PHINode *CntPhi, Value *Var) {
2081  BasicBlock *PreHead = CurLoop->getLoopPreheader();
2082  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2083  const DebugLoc &DL = CntInst->getDebugLoc();
2084 
2085  // Assuming before transformation, the loop is following:
2086  // if (x) // the precondition
2087  // do { cnt++; x &= x - 1; } while(x);
2088 
2089  // Step 1: Insert the ctpop instruction at the end of the precondition block
2090  IRBuilder<> Builder(PreCondBr);
2091  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2092  {
2093  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2094  NewCount = PopCntZext =
2095  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2096 
2097  if (NewCount != PopCnt)
2098  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2099 
2100  // TripCnt is exactly the number of iterations the loop has
2101  TripCnt = NewCount;
2102 
2103  // If the population counter's initial value is not zero, insert Add Inst.
2104  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2105  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2106  if (!InitConst || !InitConst->isZero()) {
2107  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2108  (cast<Instruction>(NewCount))->setDebugLoc(DL);
2109  }
2110  }
2111 
2112  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2113  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2114  // function would be partial dead code, and downstream passes will drag
2115  // it back from the precondition block to the preheader.
2116  {
2117  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2118 
2119  Value *Opnd0 = PopCntZext;
2120  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2121  if (PreCond->getOperand(0) != Var)
2122  std::swap(Opnd0, Opnd1);
2123 
2124  ICmpInst *NewPreCond = cast<ICmpInst>(
2125  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2126  PreCondBr->setCondition(NewPreCond);
2127 
2128  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2129  }
2130 
2131  // Step 3: Note that the population count is exactly the trip count of the
2132  // loop in question, which enables us to convert the loop from a noncountable
2133  // loop into a countable one. The benefit is twofold:
2134  //
2135  // - If the loop only counts population, the entire loop becomes dead after
2136  // the transformation. It is a lot easier to prove a countable loop dead
2137  // than to prove a noncountable one. (In some C dialects, an infinite loop
2138  // isn't dead even if it computes nothing useful. In general, DCE needs
2139  // to prove a noncountable loop finite before it can safely delete it.)
2140  //
2141  // - If the loop also performs something else, it remains alive.
2142  // Since it is transformed to countable form, it can be aggressively
2143  // optimized by some optimizations which are in general not applicable
2144  // to a noncountable loop.
2145  //
2146  // After this step, this loop (conceptually) would look like following:
2147  // newcnt = __builtin_ctpop(x);
2148  // t = newcnt;
2149  // if (x)
2150  // do { cnt++; x &= x-1; t--; } while (t > 0);
2151  BasicBlock *Body = *(CurLoop->block_begin());
2152  {
2153  auto *LbBr = cast<BranchInst>(Body->getTerminator());
2154  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2155  Type *Ty = TripCnt->getType();
2156 
2157  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2158 
2159  Builder.SetInsertPoint(LbCond);
2160  Instruction *TcDec = cast<Instruction>(
2161  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2162  "tcdec", false, true));
2163 
2164  TcPhi->addIncoming(TripCnt, PreHead);
2165  TcPhi->addIncoming(TcDec, Body);
2166 
2167  CmpInst::Predicate Pred =
2168  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2169  LbCond->setPredicate(Pred);
2170  LbCond->setOperand(0, TcDec);
2171  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2172  }
2173 
2174  // Step 4: All the references to the original population counter outside
2175  // the loop are replaced with the NewCount -- the value returned from
2176  // __builtin_ctpop().
2177  CntInst->replaceUsesOutsideBlock(NewCount, Body);
2178 
2179  // step 5: Forget the "non-computable" trip-count SCEV associated with the
2180  // loop. The loop would otherwise not be deleted even if it becomes empty.
2181  SE->forgetLoop(CurLoop);
2182 }
2183 
2184 /// Match loop-invariant value.
2185 template <typename SubPattern_t> struct match_LoopInvariant {
2186  SubPattern_t SubPattern;
2187  const Loop *L;
2188 
2189  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2190  : SubPattern(SP), L(L) {}
2191 
2192  template <typename ITy> bool match(ITy *V) {
2193  return L->isLoopInvariant(V) && SubPattern.match(V);
2194  }
2195 };
2196 
2197 /// Matches if the value is loop-invariant.
2198 template <typename Ty>
2199 inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2200  return match_LoopInvariant<Ty>(M, L);
2201 }
2202 
2203 /// Return true if the idiom is detected in the loop.
2204 ///
2205 /// The core idiom we are trying to detect is:
2206 /// \code
2207 /// entry:
2208 /// <...>
2209 /// %bitmask = shl i32 1, %bitpos
2210 /// br label %loop
2211 ///
2212 /// loop:
2213 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2214 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2215 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2216 /// %x.next = shl i32 %x.curr, 1
2217 /// <...>
2218 /// br i1 %x.curr.isbitunset, label %loop, label %end
2219 ///
2220 /// end:
2221 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2222 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2223 /// <...>
2224 /// \endcode
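///
/// For illustration only (a plausible C source, not taken from this file),
/// a loop such as the following can lower to the idiom above once rotated:
/// \code
///   unsigned shift_until_bit_set(unsigned x, unsigned bitpos) {
///     unsigned bitmask = 1u << bitpos;
///     while ((x & bitmask) == 0) // loop while the bit is still unset
///       x <<= 1;
///     return x;
///   }
/// \endcode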
2225 static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2226  Value *&BitMask, Value *&BitPos,
2227  Value *&CurrX, Instruction *&NextX) {
2229  " Performing shift-until-bittest idiom detection.\n");
2230 
2231  // Give up if the loop has multiple blocks or multiple backedges.
2232  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2233  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2234  return false;
2235  }
2236 
2237  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2238  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2239  assert(LoopPreheaderBB && "There is always a loop preheader.");
2240 
2241  using namespace PatternMatch;
2242 
2243  // Step 1: Check if the loop backedge is in desirable form.
2244 
2245  ICmpInst::Predicate Pred;
2246  Value *CmpLHS, *CmpRHS;
2247  BasicBlock *TrueBB, *FalseBB;
2248  if (!match(LoopHeaderBB->getTerminator(),
2249  m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2250  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2251  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2252  return false;
2253  }
2254 
2255  // Step 2: Check if the backedge's condition is in desirable form.
2256 
2257  auto MatchVariableBitMask = [&]() {
2258  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2259  match(CmpLHS,
2260  m_c_And(m_Value(CurrX),
2261  m_CombineAnd(
2262  m_Value(BitMask),
2263  m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2264  CurLoop))));
2265  };
2266  auto MatchConstantBitMask = [&]() {
2267  return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2268  match(CmpLHS, m_And(m_Value(CurrX),
2269  m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2270  (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2271  };
2272  auto MatchDecomposableConstantBitMask = [&]() {
2273  APInt Mask;
2274  return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2275  ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2276  (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2277  (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2278  };
2279 
2280  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2281  !MatchDecomposableConstantBitMask()) {
2282  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2283  return false;
2284  }
2285 
2286  // Step 3: Check if the recurrence is in desirable form.
2287  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2288  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2289  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2290  return false;
2291  }
2292 
2293  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2294  NextX =
2295  dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2296 
2297  assert(CurLoop->isLoopInvariant(BaseX) &&
2298  "Expected BaseX to be avaliable in the preheader!");
2299 
2300  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2301  // FIXME: support right-shift?
2302  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2303  return false;
2304  }
2305 
2306  // Step 4: Check if the backedge's destinations are in desirable form.
2307 
2308  assert(ICmpInst::isEquality(Pred) &&
2309  "Should only get equality predicates here.");
2310 
2311  // cmp-br is commutative, so canonicalize to a single variant.
2312  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2313  Pred = ICmpInst::getInversePredicate(Pred);
2314  std::swap(TrueBB, FalseBB);
2315  }
2316 
2317  // We expect to exit loop when comparison yields false,
2318  // so when it yields true we should branch back to loop header.
2319  if (TrueBB != LoopHeaderBB) {
2320  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2321  return false;
2322  }
2323 
2324  // Okay, idiom checks out.
2325  return true;
2326 }
2327 
2328 /// Look for the following loop:
2329 /// \code
2330 /// entry:
2331 /// <...>
2332 /// %bitmask = shl i32 1, %bitpos
2333 /// br label %loop
2334 ///
2335 /// loop:
2336 /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2337 /// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2338 /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2339 /// %x.next = shl i32 %x.curr, 1
2340 /// <...>
2341 /// br i1 %x.curr.isbitunset, label %loop, label %end
2342 ///
2343 /// end:
2344 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2345 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2346 /// <...>
2347 /// \endcode
2348 ///
2349 /// And transform it into:
2350 /// \code
2351 /// entry:
2352 /// %bitmask = shl i32 1, %bitpos
2353 /// %lowbitmask = add i32 %bitmask, -1
2354 /// %mask = or i32 %lowbitmask, %bitmask
2355 /// %x.masked = and i32 %x, %mask
2356 /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2357 /// i1 true)
2358 /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2359 /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2360 /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2361 /// %tripcount = add i32 %backedgetakencount, 1
2362 /// %x.curr = shl i32 %x, %backedgetakencount
2363 /// %x.next = shl i32 %x, %tripcount
2364 /// br label %loop
2365 ///
2366 /// loop:
2367 /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2368 /// %loop.iv.next = add nuw i32 %loop.iv, 1
2369 /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2370 /// <...>
2371 /// br i1 %loop.ivcheck, label %end, label %loop
2372 ///
2373 /// end:
2374 /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2375 /// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2376 /// <...>
2377 /// \endcode
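///
/// A hand-worked sanity check (illustrative, not taken from the source): for
/// i32 %x = 1 and %bitpos = 5, the original loop visits %x.curr = 1, 2, 4,
/// 8, 16, 32, i.e. the trip count is 6. The rewrite computes %x.masked = 1,
/// %x.masked.numleadingzeros = 31, %x.masked.numactivebits = 1,
/// %x.masked.leadingonepos = 0, %backedgetakencount = 5 - 0 = 5,
/// %tripcount = 6, %x.curr = 1 << 5 = 32 and %x.next = 1 << 6 = 64,
/// matching the loop's final values.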
2378 bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2379  bool MadeChange = false;
2380 
2381  Value *X, *BitMask, *BitPos, *XCurr;
2382  Instruction *XNext;
2383  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2384  XNext)) {
2386  " shift-until-bittest idiom detection failed.\n");
2387  return MadeChange;
2388  }
2389  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2390 
2391  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2392  // but is it profitable to transform?
2393 
2394  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2395  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2396  assert(LoopPreheaderBB && "There is always a loop preheader.");
2397 
2398  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2399  assert(SuccessorBB && "There is only a single successor.");
2400 
2401  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2402  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2403 
2404  Intrinsic::ID IntrID = Intrinsic::ctlz;
2405  Type *Ty = X->getType();
2406  unsigned Bitwidth = Ty->getScalarSizeInBits();
2407 
2408  TargetTransformInfo::TargetCostKind CostKind =
2409  TargetTransformInfo::TCK_SizeAndLatency;
2410 
2411  // The rewrite is considered to be unprofitable if and only if the
2412  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2413  // making the loop countable, even if nothing else changes.
2414  IntrinsicCostAttributes Attrs(
2415  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2416  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2417  if (Cost > TargetTransformInfo::TCC_Basic) {
2418  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2419  " Intrinsic is too costly, not beneficial\n");
2420  return MadeChange;
2421  }
2422  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2423  TargetTransformInfo::TCC_Basic) {
2424  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2425  return MadeChange;
2426  }
2427 
2428  // Ok, transform appears worthwhile.
2429  MadeChange = true;
2430 
2431  // Step 1: Compute the loop trip count.
2432 
2433  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2434  BitPos->getName() + ".lowbitmask");
2435  Value *Mask =
2436  Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2437  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2438  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2439  IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2440  /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2441  Value *XMaskedNumActiveBits = Builder.CreateSub(
2442  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2443  XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2444  /*HasNSW=*/Bitwidth != 2);
2445  Value *XMaskedLeadingOnePos =
2446  Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2447  XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2448  /*HasNSW=*/Bitwidth > 2);
2449 
2450  Value *LoopBackedgeTakenCount = Builder.CreateSub(
2451  BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2452  /*HasNUW=*/true, /*HasNSW=*/true);
2453  // We know the loop's backedge-taken count, but what's the loop's trip count?
2454  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2455  Value *LoopTripCount =
2456  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2457  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2458  /*HasNSW=*/Bitwidth != 2);
2459 
2460  // Step 2: Compute the recurrence's final value without a loop.
2461 
2462  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2463  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2464  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2465  NewX->takeName(XCurr);
2466  if (auto *I = dyn_cast<Instruction>(NewX))
2467  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2468 
2469  Value *NewXNext;
2470  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2471  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2472  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2473  // that isn't the case, we'll need to emit an alternative, safe IR.
2474  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2475  PatternMatch::match(
2476  BitPos, PatternMatch::m_SpecificInt_ICMP(
2477  ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2478  Ty->getScalarSizeInBits() - 1))))
2479  NewXNext = Builder.CreateShl(X, LoopTripCount);
2480  else {
2481  // Otherwise, just additionally shift by one. It's the smallest solution;
2482  // alternatively, we could check that NewX is INT_MIN (or BitPos is )
2483  // and select 0 instead.
2484  NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2485  }
2486 
2487  NewXNext->takeName(XNext);
2488  if (auto *I = dyn_cast<Instruction>(NewXNext))
2489  I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2490 
2491  // Step 3: Adjust the successor basic block to receive the computed
2492  // recurrence's final value instead of the recurrence itself.
2493 
2494  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2495  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2496 
2497  // Step 4: Rewrite the loop into a countable form, with canonical IV.
2498 
2499  // The new canonical induction variable.
2500  Builder.SetInsertPoint(&LoopHeaderBB->front());
2501  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2502 
2503  // The induction itself.
2504  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2505  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2506  auto *IVNext =
2507  Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2508  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2509 
2510  // The loop trip count check.
2511  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2512  CurLoop->getName() + ".ivcheck");
2513  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2514  LoopHeaderBB->getTerminator()->eraseFromParent();
2515 
2516  // Populate the IV PHI.
2517  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2518  IV->addIncoming(IVNext, LoopHeaderBB);
2519 
2520  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2521  // loop. The loop would otherwise not be deleted even if it becomes empty.
2522 
2523  SE->forgetLoop(CurLoop);
2524 
2525  // Other passes will take care of actually deleting the loop if possible.
2526 
2527  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2528 
2529  ++NumShiftUntilBitTest;
2530  return MadeChange;
2531 }
2532 
2533 /// Return true if the idiom is detected in the loop.
2534 ///
2535 /// The core idiom we are trying to detect is:
2536 /// \code
2537 /// entry:
2538 /// <...>
2539 /// %start = <...>
2540 /// %extraoffset = <...>
2541 /// <...>
2542 /// br label %for.cond
2543 ///
2544 /// loop:
2545 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2546 /// %nbits = add nsw i8 %iv, %extraoffset
2547 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2548 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2549 /// %iv.next = add i8 %iv, 1
2550 /// <...>
2551 /// br i1 %val.shifted.iszero, label %end, label %loop
2552 ///
2553 /// end:
2554 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2555 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2556 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2557 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2558 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2559 /// <...>
2560 /// \endcode
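///
/// For illustration only (a plausible C source, not taken from this file),
/// a loop like this can lower to the idiom above:
/// \code
///   int count_until_zero(unsigned val, int start, int extra) {
///     int iv = start;
///     while ((val >> (iv + extra)) != 0) // %val.shifted
///       iv++;
///     return iv; // corresponds to %iv.res
///   }
/// \endcode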
2561 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2562  Instruction *&ValShiftedIsZero,
2563  Intrinsic::ID &IntrinID, Instruction *&IV,
2564  Value *&Start, Value *&Val,
2565  const SCEV *&ExtraOffsetExpr,
2566  bool &InvertedCond) {
2568  " Performing shift-until-zero idiom detection.\n");
2569 
2570  // Give up if the loop has multiple blocks or multiple backedges.
2571  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2572  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2573  return false;
2574  }
2575 
2576  Instruction *ValShifted, *NBits, *IVNext;
2577  Value *ExtraOffset;
2578 
2579  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2580  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2581  assert(LoopPreheaderBB && "There is always a loop preheader.");
2582 
2583  using namespace PatternMatch;
2584 
2585  // Step 1: Check if the loop backedge and condition are in desirable form.
2586 
2587  ICmpInst::Predicate Pred;
2588  BasicBlock *TrueBB, *FalseBB;
2589  if (!match(LoopHeaderBB->getTerminator(),
2590  m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2591  m_BasicBlock(FalseBB))) ||
2592  !match(ValShiftedIsZero,
2593  m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2594  !ICmpInst::isEquality(Pred)) {
2595  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2596  return false;
2597  }
2598 
2599  // Step 2: Check if the comparison's operand is in desirable form.
2600  // FIXME: Val could be a one-input PHI node, which we should look past.
2601  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2602  m_Instruction(NBits)))) {
2603  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
2604  return false;
2605  }
2606  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2607  : Intrinsic::ctlz;
2608 
2609  // Step 3: Check if the shift amount is in desirable form.
2610 
2611  if (match(NBits, m_c_Add(m_Instruction(IV),
2612  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2613  (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2614  ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2615  else if (match(NBits,
2616  m_Sub(m_Instruction(IV),
2617  m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2618  NBits->hasNoSignedWrap())
2619  ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2620  else {
2621  IV = NBits;
2622  ExtraOffsetExpr = SE->getZero(NBits->getType());
2623  }
2624 
2625  // Step 4: Check if the recurrence is in desirable form.
2626  auto *IVPN = dyn_cast<PHINode>(IV);
2627  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2628  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2629  return false;
2630  }
2631 
2632  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2633  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2634 
2635  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2636  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2637  return false;
2638  }
2639 
2640  // Step 5: Check if the backedge's destinations are in desirable form.
2641 
2642  assert(ICmpInst::isEquality(Pred) &&
2643  "Should only get equality predicates here.");
2644 
2645  // cmp-br is commutative, so canonicalize to a single variant.
2646  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2647  if (InvertedCond) {
2648  Pred = ICmpInst::getInversePredicate(Pred);
2649  std::swap(TrueBB, FalseBB);
2650  }
2651 
2652  // We expect to exit loop when comparison yields true,
2653  // so when it yields false we should branch back to loop header.
2654  if (FalseBB != LoopHeaderBB) {
2655  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2656  return false;
2657  }
2658 
2659  // The new, countable, loop will certainly only run a known number of
2660  // iterations; it won't be infinite. But the old loop might be infinite
2661  // under certain conditions. For logical shifts, the value will become zero
2662  // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2663  // right-shift, if the sign bit was set, the value will never become zero,
2664  // and the loop may never finish.
2665  if (ValShifted->getOpcode() == Instruction::AShr &&
2666  !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2667  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2668  return false;
2669  }
2670 
2671  // Okay, idiom checks out.
2672  return true;
2673 }
2674 
2675 /// Look for the following loop:
2676 /// \code
2677 /// entry:
2678 /// <...>
2679 /// %start = <...>
2680 /// %extraoffset = <...>
2681 /// <...>
2682 /// br label %for.cond
2683 ///
2684 /// loop:
2685 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2686 /// %nbits = add nsw i8 %iv, %extraoffset
2687 /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2688 /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2689 /// %iv.next = add i8 %iv, 1
2690 /// <...>
2691 /// br i1 %val.shifted.iszero, label %end, label %loop
2692 ///
2693 /// end:
2694 /// %iv.res = phi i8 [ %iv, %loop ] <...>
2695 /// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2696 /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2697 /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2698 /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2699 /// <...>
2700 /// \endcode
2701 ///
2702 /// And transform it into:
2703 /// \code
2704 /// entry:
2705 /// <...>
2706 /// %start = <...>
2707 /// %extraoffset = <...>
2708 /// <...>
2709 /// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2710 /// %val.numactivebits = sub i8 8, %val.numleadingzeros
2711 /// %extraoffset.neg = sub i8 0, %extraoffset
2712 /// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2713 /// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2714 /// %loop.tripcount = sub i8 %iv.final, %start
2715 /// br label %loop
2716 ///
2717 /// loop:
2718 /// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2719 /// %loop.iv.next = add i8 %loop.iv, 1
2720 /// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2721 /// %iv = add i8 %loop.iv, %start
2722 /// <...>
2723 /// br i1 %loop.ivcheck, label %end, label %loop
2724 ///
2725 /// end:
2726 /// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2727 /// <...>
2728 /// \endcode
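///
/// A hand-worked sanity check (illustrative, not taken from the source): for
/// i8 %val = 26 (0b00011010) with %start = 0 and %extraoffset = 0, the
/// original loop exits at %iv = 5 (26 >> 5 == 0). The rewrite computes
/// %val.numleadingzeros = 3, %val.numactivebits = 8 - 3 = 5, and
/// %iv.final = smax(5 + 0, 0) = 5, matching the loop's final IV value.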
2729 bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2730  bool MadeChange = false;
2731 
2732  Instruction *ValShiftedIsZero;
2733  Intrinsic::ID IntrID;
2734  Instruction *IV;
2735  Value *Start, *Val;
2736  const SCEV *ExtraOffsetExpr;
2737  bool InvertedCond;
2738  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2739  Start, Val, ExtraOffsetExpr, InvertedCond)) {
2741  " shift-until-zero idiom detection failed.\n");
2742  return MadeChange;
2743  }
2744  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2745 
2746  // Ok, it is the idiom we were looking for, we *could* transform this loop,
2747  // but is it profitable to transform?
2748 
2749  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2750  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2751  assert(LoopPreheaderBB && "There is always a loop preheader.");
2752 
2753  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2754  assert(SuccessorBB && "There is only a single successor.");
2755 
2756  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2757  Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2758 
2759  Type *Ty = Val->getType();
2760  unsigned Bitwidth = Ty->getScalarSizeInBits();
2761 
2762  TargetTransformInfo::TargetCostKind CostKind =
2763  TargetTransformInfo::TCK_SizeAndLatency;
2764 
2765  // The rewrite is considered to be unprofitable if and only if the
2766  // intrinsic we'll use is not cheap. Note that we are okay with *just*
2767  // making the loop countable, even if nothing else changes.
2768  IntrinsicCostAttributes Attrs(
2769  IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2770  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2771  if (Cost > TargetTransformInfo::TCC_Basic) {
2772  LLVM_DEBUG(dbgs() << DEBUG_TYPE
2773  " Intrinsic is too costly, not beneficial\n");
2774  return MadeChange;
2775  }
2776 
2777  // Ok, transform appears worthwhile.
2778  MadeChange = true;
2779 
2780  bool OffsetIsZero = false;
2781  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2782  OffsetIsZero = ExtraOffsetExprC->isZero();
2783 
2784  // Step 1: Compute the loop's final IV value / trip count.
2785 
2786  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2787  IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2788  /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2789  Value *ValNumActiveBits = Builder.CreateSub(
2790  ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2791  Val->getName() + ".numactivebits", /*HasNUW=*/true,
2792  /*HasNSW=*/Bitwidth != 2);
2793 
2794  SCEVExpander Expander(*SE, *DL, "loop-idiom");
2795  Expander.setInsertPoint(&*Builder.GetInsertPoint());
2796  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2797 
2798  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2799  ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2800  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2801  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2802  {ValNumActiveBitsOffset, Start},
2803  /*FMFSource=*/nullptr, "iv.final");
2804 
2805  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2806  IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2807  /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2808  // FIXME: or when the offset was `add nuw`
2809 
2810  // We know loop's backedge-taken count, but what's loop's trip count?
2811  Value *LoopTripCount =
2812  Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2813  CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2814  /*HasNSW=*/Bitwidth != 2);
2815 
2816  // Step 2: Adjust the successor basic block to receive the original
2817  // induction variable's final value instead of the orig. IV itself.
2818 
2819  IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2820 
2821  // Step 3: Rewrite the loop into a countable form, with canonical IV.
2822 
2823  // The new canonical induction variable.
2824  Builder.SetInsertPoint(&LoopHeaderBB->front());
2825  auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2826 
2827  // The induction itself.
2828  Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2829  auto *CIVNext =
2830  Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2831  /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2832 
2833  // The loop trip count check.
2834  auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2835  CurLoop->getName() + ".ivcheck");
2836  auto *NewIVCheck = CIVCheck;
2837  if (InvertedCond) {
2838  NewIVCheck = Builder.CreateNot(CIVCheck);
2839  NewIVCheck->takeName(ValShiftedIsZero);
2840  }
2841 
2842  // The original IV, but rebased to be an offset to the CIV.
2843  auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2844  /*HasNSW=*/true); // FIXME: what about NUW?
2845  IVDePHId->takeName(IV);
2846 
2847  // The loop terminator.
2848  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2849  Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2850  LoopHeaderBB->getTerminator()->eraseFromParent();
2851 
2852  // Populate the IV PHI.
2853  CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2854  CIV->addIncoming(CIVNext, LoopHeaderBB);
2855 
2856  // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2857  // loop. The loop would otherwise not be deleted even if it becomes empty.
2858 
2859  SE->forgetLoop(CurLoop);
2860 
2861  // Step 5: Try to cleanup the loop's body somewhat.
2862  IV->replaceAllUsesWith(IVDePHId);
2863  IV->eraseFromParent();
2864 
2865  ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2866  ValShiftedIsZero->eraseFromParent();
2867 
2868  // Other passes will take care of actually deleting the loop if possible.
2869 
2870  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2871 
2872  ++NumShiftUntilZero;
2873  return MadeChange;
2874 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
DisableLIRPAll
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm::TargetTransformInfo::PSK_FastHardware
@ PSK_FastHardware
Definition: TargetTransformInfo.h:592
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:511
llvm::ScalarEvolution::hasLoopInvariantBackedgeTakenCount
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
Definition: ScalarEvolution.cpp:12351
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:298
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:729
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:211
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:64
llvm::MemIntrinsicBase::getDestAlignment
unsigned getDestAlignment() const
FIXME: Remove this function once transition to Align is over.
Definition: IntrinsicInst.h:660
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:103
llvm::ScalarEvolution::isLoopEntryGuardedByCond
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS.
Definition: ScalarEvolution.cpp:10522
llvm::ScalarEvolution::getNegativeSCEV
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Definition: ScalarEvolution.cpp:4113
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:66
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:741
it
into xmm2 addss xmm2 xmm1 xmm3 addss xmm3 movaps xmm0 unpcklps xmm0 ret seems silly when it could just be one addps Expand libm rounding functions main should enable SSE DAZ mode and other fast SSE modes Think about doing i64 math in SSE regs on x86 This testcase should have no SSE instructions in it
Definition: README-SSE.txt:81
llvm::isKnownNonNegative
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the give value is known to be non-negative.
Definition: ValueTracking.cpp:312
match_LoopInvariant
Match loop-invariant value.
Definition: LoopIdiomRecognize.cpp:2185
llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:378
llvm::LoopBase::getExitBlock
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:82
llvm::WeakTrackingVH
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
match_LoopInvariant::L
const Loop * L
Definition: LoopIdiomRecognize.cpp:2187
CmpInstAnalysis.h
llvm::DataLayout
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:112
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
llvm::LocationSize::afterPointer
constexpr static LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:123
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
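A sketch of materializing and calling an overloaded intrinsic such as ctpop; M, Builder, and Val are assumed from the surrounding code:

  // The overload type (here Val's integer type) selects the concrete ctpop.
  Function *Ctpop = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Val->getType());
  CallInst *Count = Builder.CreateCall(Ctpop, {Val}, "popcount");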
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::SCEVAddRecExpr::isAffine
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
Definition: ScalarEvolutionExpressions.h:379
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:189
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ScalarEvolutionExpander.h
llvm::MemIntrinsicBase::getDestAlign
MaybeAlign getDestAlign() const
Definition: IntrinsicInst.h:665
Scalar.h
llvm::SCEVAddRecExpr::getStart
const SCEV * getStart() const
Definition: ScalarEvolutionExpressions.h:363
llvm::TypeSize::getFixedSize
ScalarTy getFixedSize() const
Definition: TypeSize.h:426
MemorySSAUpdater.h
llvm::Function
Definition: Function.h:61
llvm::cl::location
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:459
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
StringRef.h
Pass.h
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:122
llvm::SCEVExpander
This class uses information about analyzed scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:63
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:319
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
Statistic.h
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1008
llvm::APInt::getSExtValue
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1467
detectPopcountIdiom
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1577
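For reference, the source-level shape this detection targets is the classic clear-lowest-set-bit loop, roughly as below (illustrative only, not the pass's internal representation):

  int popcount(unsigned long long X) {
    int Cnt = 0;
    while (X) {
      X &= X - 1;  // clears the lowest set bit each iteration
      ++Cnt;
    }
    return Cnt;    // the loop is replaced by a single ctpop intrinsic
  }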
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:168
LoopAccessAnalysis.h
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:734
llvm::IRBuilder<>
MapVector.h
llvm::GlobalVariable
Definition: GlobalVariable.h:40
llvm::SmallDenseMap
Definition: DenseMap.h:880
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:461
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:868
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:742
llvm::Instruction::hasNoUnsignedWrap
bool hasNoUnsignedWrap() const
Determine whether the no unsigned wrap flag is set.
Definition: Instruction.cpp:136
llvm::CmpInst::getInversePredicate
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:820
llvm::ConstantExpr::getBitCast
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2208
llvm::initializeLoopIdiomRecognizeLegacyPassPass
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
ValueTracking.h
Local.h
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::SetVector::iterator
typename vector_type::const_iterator iterator
Definition: SetVector.h:48
llvm::getLoopAnalysisUsage
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:149
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
llvm::PatternMatch::m_Br
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
Definition: PatternMatch.h:1741
APInt.h
ScalarEvolution.h
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
llvm::APInt::getBitWidth
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1403
llvm::ICmpInst::isEquality
bool isEquality() const
Return true if this predicate is either EQ or NE.
Definition: Instructions.h:1298
Module.h
llvm::createLoopIdiomPass
Pass * createLoopIdiomPass()
Definition: LoopIdiomRecognize.cpp:346
getTripCount
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1050
llvm::Instruction::isShift
bool isShift() const
Definition: Instruction.h:167
llvm::isBytewiseValue
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
Definition: ValueTracking.cpp:3843
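A sketch of the memset-eligibility check this enables; StoredVal and DL are assumed caller state:

  // Non-null only if every byte of StoredVal has the same value, e.g. any
  // i8, i32 0, i32 -1, or i16 0xF0F0.
  if (Value *SplatValue = isBytewiseValue(StoredVal, *DL)) {
    // StoredVal can be written with memset using SplatValue as the fill byte.
  }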
llvm::LoopStandardAnalysisResults
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Definition: LoopAnalysisManager.h:52
match_LoopInvariant::match
bool match(ITy *V)
Definition: LoopIdiomRecognize.cpp:2192
llvm::GlobalValue::UnnamedAddr::Global
@ Global
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1253
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::MapVector< Value *, StoreList >
llvm::SmallPtrSet< Value *, 16 >
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::DisableLIRP::All
static bool All
When true, the entire pass is disabled.
Definition: LoopIdiomRecognize.h:30
llvm::CmpInst::ICMP_SLE
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:750
llvm::ScalarEvolution::isKnownNonNegative
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
Definition: ScalarEvolution.cpp:9830
llvm::ConstantExpr::getExactLogBase2
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2758
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:264
llvm::GlobalValue::setUnnamedAddr
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:212
llvm::ArrayType
Class to represent array types.
Definition: DerivedTypes.h:357
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:267
llvm::ScalarEvolution::getTruncateOrZeroExtend
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
Definition: ScalarEvolution.cpp:4249
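A sketch of the byte-count computation this feeds when forming memset/memcpy: trip count = backedge-taken count + 1, scaled by the store size (BECount, IntPtrTy, and StoreSize assumed):

  const SCEV *TripCount =
      SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtrTy),
                     SE->getOne(IntPtrTy));
  const SCEV *NumBytes =
      SE->getMulExpr(TripCount, SE->getConstant(IntPtrTy, StoreSize));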
llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition: PatternMatch.h:2242
llvm::SCEVConstant::getValue
ConstantInt * getValue() const
Definition: ScalarEvolutionExpressions.h:56
llvm::LoopStandardAnalysisResults::DT
DominatorTree & DT
Definition: LoopAnalysisManager.h:55
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:223
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::SimpleLoopSafetyInfo
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
llvm::codeview::PointerMode::Pointer
@ Pointer
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:144
llvm::LinearPolySize::isScalable
bool isScalable() const
Returns whether the size is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:299
llvm::IRBuilderBase::getInt1
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:453
MustExecute.h
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:185
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:193
llvm::MemSetBase::getValue
Value * getValue() const
Definition: IntrinsicInst.h:766
llvm::LoopBase::block_end
block_iterator block_end() const
Definition: LoopInfo.h:177
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
AliasAnalysis.h
llvm::Optional::hasValue
constexpr bool hasValue() const
Definition: Optional.h:288
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::ScalarEvolution::getMulExpr
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2986
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:115
Instruction.h
llvm::isSafeToExpand
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
Definition: ScalarEvolutionExpander.cpp:2647
CommandLine.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1335
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::StoreInst::getValueOperand
Value * getValueOperand()
Definition: Instructions.h:398
GlobalValue.h
llvm::InlinerFunctionImportStatsOpts::No
@ No
llvm::MemorySSAWrapperPass
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:981
llvm::SetVector::begin
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:82
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::ScalarEvolution::getOne
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
Definition: ScalarEvolution.h:629
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
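A sketch of the PatternMatch DSL this entry anchors; V is any Value * the caller wants to classify:

  using namespace llvm::PatternMatch;
  ICmpInst::Predicate Pred;
  Value *X;
  // Matches "icmp <pred> X, 0" and captures both the predicate and X.
  if (match(V, m_ICmp(Pred, m_Value(X), m_Zero())) &&
      Pred == ICmpInst::ICMP_EQ) {
    // V is an equality compare of X against zero.
  }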
llvm::AAResults
Definition: AliasAnalysis.h:456
LoopIdiomRecognize.h
llvm::TargetTransformInfo::getPopcntSupport
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
Definition: TargetTransformInfo.cpp:527
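A sketch of the profitability guard a popcount transform applies; TTI and the counted integer type CntTy are assumed:

  // Only transform when the target lowers ctpop to fast hardware.
  if (TTI->getPopcntSupport(CntTy->getIntegerBitWidth()) !=
      TargetTransformInfo::PSK_FastHardware)
    return false;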
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::LocationSize
Definition: MemoryLocation.h:65
llvm::User
Definition: User.h:44
Intrinsics.h
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
Definition: InstrTypes.h:1393
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::StoreInst::getAlignment
unsigned getAlignment() const
Return the alignment of the access that is being performed. FIXME: Remove this function once transitio...
Definition: Instructions.h:351
TargetLibraryInfo.h
llvm::PatternMatch::m_Instruction
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:748
false
Definition: StackSlotColoring.cpp:142
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
llvm::PatternMatch::m_c_Add
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
Definition: PatternMatch.h:2228
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2818
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:405
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:393
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:153
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1771
llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition: MemoryLocation.h:100
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:900
llvm::isMustProgress
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1116
LoopUtils.h
DEBUG_TYPE
#define DEBUG_TYPE
Definition: LoopIdiomRecognize.cpp:108
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4377
DebugLoc.h
llvm::LPPassManager
Definition: LoopPass.h:75
SmallPtrSet.h
m_LoopInvariant
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
Definition: LoopIdiomRecognize.cpp:2199
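A sketch of how this file-local matcher composes with the rest of the DSL; AndInst and CurLoop are assumed, and the bit mask is required to be invariant in CurLoop:

  using namespace llvm::PatternMatch;
  Value *CurrX, *BitMask;
  if (match(AndInst,
            m_c_And(m_Value(CurrX),
                    m_LoopInvariant(m_Value(BitMask), CurLoop)))) {
    // BitMask does not vary across iterations of CurLoop.
  }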
llvm::BasicBlock::getModule
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:144
Align
uint64_t Align
Definition: ELFObjHandler.cpp:83
PatternMatch.h
llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:212
llvm::MemTransferBase::getSourceAlign
MaybeAlign getSourceAlign() const
Definition: IntrinsicInst.h:731
llvm::PatternMatch::m_Shift
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
Definition: PatternMatch.h:1306
llvm::Loop::getName
StringRef getName() const
Definition: LoopInfo.h:866
llvm::None
const NoneType None
Definition: None.h:23
llvm::Type::getIntegerBitWidth
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
Type.h
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::MemSetInst
This class wraps the llvm.memset intrinsic.
Definition: IntrinsicInst.h:905
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:513
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
llvm::BranchInst::getCondition
Value * getCondition() const
Definition: Instructions.h:3149
llvm::SCEV::isNonConstantNegative
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Definition: ScalarEvolution.cpp:427
llvm::ScalarEvolution::getSCEV
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Definition: ScalarEvolution.cpp:4066
llvm::LoopBase::block_begin
block_iterator block_begin() const
Definition: LoopInfo.h:176
llvm::PatternMatch::m_Power2
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
getStoreStride
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
Definition: LoopIdiomRecognize.cpp:426
UseLIRCodeSizeHeurs
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
llvm::getLoopPassPreservedAnalyses
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Definition: LoopAnalysisManager.cpp:140
BasicBlock.h
llvm::cl::opt
Definition: CommandLine.h:1434
llvm::LoopBase< BasicBlock, Loop >::block_iterator
ArrayRef< BasicBlock * >::const_iterator block_iterator
Definition: LoopInfo.h:175
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:78
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:304
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1203
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:463
uint64_t
llvm::LoopPass
Definition: LoopPass.h:27
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2387
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
llvm::LPMUpdater
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Definition: LoopPassManager.h:249
llvm::BTF::HeaderSize
@ HeaderSize
Definition: BTF.h:58
llvm::SmallPtrSetImpl::end
iterator end() const
Definition: SmallPtrSet.h:407
llvm::LoopBase::getUniqueExitBlocks
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:122
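A sketch, with CurLoop assumed to be the loop under inspection:

  SmallVector<BasicBlock *, 4> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);
  for (BasicBlock *Exit : ExitBlocks) {
    // Each unique block reached on loop exit appears exactly once here.
  }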
createPopcntIntrinsic
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
Definition: LoopIdiomRecognize.cpp:1929
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2783
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
MemoryLocation.h
llvm::MemorySSA
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:705
llvm::LoopInfoBase::getLoopFor
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Definition: LoopInfo.h:967
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:167
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1123
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:402
ArrayRef.h
llvm::Instruction::setDebugLoc
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:367
llvm::SCEVConstant
This class represents a constant integer value.
Definition: ScalarEvolutionExpressions.h:47
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:148
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::PatternMatch::m_Sub
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1020
llvm::Value::replaceUsesOutsideBlock
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:586
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:840
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
match_LoopInvariant::SubPattern
SubPattern_t SubPattern
Definition: LoopIdiomRecognize.cpp:2186
detectShiftUntilZeroIdiom
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:1712
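For reference, the source-level shape targeted here counts shifts until the value reaches zero, which lowers to a ctlz/cttz-based expression, roughly as below (illustrative only):

  int countShifts(unsigned X) {
    int Cnt = 0;
    while (X) {
      X >>= 1;   // the shift direction selects ctlz vs. cttz
      ++Cnt;
    }
    return Cnt;
  }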
llvm::TargetLibraryInfo::has
bool has(LibFunc F) const
Tests whether a library function is available.
Definition: TargetLibraryInfo.h:323
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:163
llvm::SCEVExpanderCleaner
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
Definition: ScalarEvolutionExpander.h:502
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:602
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MemorySSAAnalysis
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:931
llvm::SimpleLoopSafetyInfo::computeLoopSafetyInfo
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body and header for the possibility of may-throw exc...
Definition: MustExecute.cpp:54
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
Builder
Definition: AssumeBundleBuilder.cpp:650
InstructionCost.h
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
DisableLIRPMemcpy
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
Definition: MachineInstrBundle.cpp:109
llvm::PatternMatch::m_CombineAnd
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:216
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:100
llvm::MemIntrinsicBase::getDest
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
Definition: IntrinsicInst.h:652
llvm::LoopInfo
Definition: LoopInfo.h:1083
llvm::BasicBlock::instructionsWithoutDebug
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=false) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:100
llvm::BinaryOperator
Definition: InstrTypes.h:189
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
DataLayout.h
llvm::PatternMatch::m_SpecificInt_ICMP
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:626
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:179
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
llvm::ScalarEvolution::getConstant
const SCEV * getConstant(ConstantInt *V)
Definition: ScalarEvolution.cpp:446
LoopPass.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::ConstantInt::isZero
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:194
llvm::LoopBase::isOutermost
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:168
llvm::Instruction::getFunction
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:776
uint32_t
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:990
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
DL
Definition: AArch64SLSHardening.cpp:76
llvm::SCEV::FlagNUW
@ FlagNUW
Definition: ScalarEvolution.h:136
llvm::ConstantInt::isMinusOne
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:206
llvm::LoopStandardAnalysisResults::LI
LoopInfo & LI
Definition: LoopAnalysisManager.h:56
llvm::MemoryAccess
Definition: MemorySSA.h:137
idiom
loop idiom
Definition: LoopIdiomRecognize.cpp:343
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
ValueHandle.h
matchCondition
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
Definition: LoopIdiomRecognize.cpp:1516
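A sketch of a typical call; BI is assumed to be the loop's conditional branch and LoopEntry the block control enters while the compared value is nonzero:

  // Non-null only for "br (icmp eq/ne X, 0), ..." where the nonzero side
  // continues into LoopEntry; returns the compared variable X.
  Value *Var = matchCondition(BI, LoopEntry);
  if (!Var)
    return false;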
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:148
llvm::SimpleLoopSafetyInfo::anyBlockMayThrow
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw ...
Definition: MustExecute.cpp:50
llvm::Value::stripPointerCasts
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:687
mayLoopAccessLocation
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, const SCEV *StoreSizeSCEV, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &IgnoredInsts)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
Definition: LoopIdiomRecognize.cpp:999
llvm::Function::hasOptSize
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:671
llvm::ScalarEvolution::isLoopInvariant
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
Definition: ScalarEvolution.cpp:12619
detectShiftUntilBitTestIdiom
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
Definition: LoopIdiomRecognize.cpp:2225
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:142
llvm::BasicBlock::front
const Instruction & front() const
Definition: BasicBlock.h:308
llvm::LoopIdiomRecognizePass::run
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Definition: LoopIdiomRecognize.cpp:314
llvm::DisableLIRP::Memcpy
static bool Memcpy
When true, Memcpy is disabled.
Definition: LoopIdiomRecognize.h:36
Attributes.h
getNumBytes
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
Definition: LoopIdiomRecognize.cpp:1080
llvm::Instruction::isAtomic
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Definition: Instruction.cpp:604
Constant.h
llvm::ScalarEvolution::forgetLoop
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
Definition: ScalarEvolution.cpp:7452
llvm::SCEVAddRecExpr::getLoop
const Loop * getLoop() const
Definition: ScalarEvolutionExpressions.h:364
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:52
llvm::decomposeBitTestICmp
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
Definition: CmpInstAnalysis.cpp:66
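A sketch of normalizing a compare into bit-test form; Cmp is an assumed ICmpInst *:

  CmpInst::Predicate Pred;
  Value *X;
  APInt Mask;
  if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1), Pred, X,
                           Mask)) {
    // Cmp is equivalent to "icmp <Pred> (X & Mask), 0".
  }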
llvm::ScalarEvolution::getMinusSCEV
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
Definition: ScalarEvolution.cpp:4199
llvm::MemIntrinsicBase::getLength
Value * getLength() const
Definition: IntrinsicInst.h:643
llvm::LoopStandardAnalysisResults::TTI
TargetTransformInfo & TTI
Definition: LoopAnalysisManager.h:59
getMemSetPatternValue
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
Definition: LoopIdiomRecognize.cpp:437
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::PHINode::Create
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Definition: Instructions.h:2675
llvm::ore::setExtraArgs
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
Definition: OptimizationRemarkEmitter.h:138
llvm::ConstantInt::getBool
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:862
GlobalVariable.h
llvm::IRBuilderBase::GetInsertBlock
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:178
llvm::MemorySSA::BeforeTerminator
@ BeforeTerminator
Definition: MemorySSA.h:793
llvm::TypeSize
Definition: TypeSize.h:417
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:352
Casting.h
idioms
Recognize loop idioms
Definition: LoopIdiomRecognize.cpp:344
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
PassManager.h
llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:215
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:219
llvm::SCEV::isOne
bool isOne() const
Return true if the expression is a constant one.
Definition: ScalarEvolution.cpp:415
llvm::ConstantArray::get
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1263
llvm::GetPointerBaseWithConstantOffset
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
Definition: ValueTracking.h:279
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::PrevailingType::Yes
@ Yes
llvm::LoopBase::getNumBackEdges
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:250
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::CmpInst::setPredicate
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:799
isSimple
static bool isSimple(Instruction *I)
Definition: SLPVectorizer.cpp:529
llvm::LoopStandardAnalysisResults::SE
ScalarEvolution & SE
Definition: LoopAnalysisManager.h:57
llvm::LoopStandardAnalysisResults::AA
AAResults & AA
Definition: LoopAnalysisManager.h:53
llvm::SCEVConstant::getAPInt
const APInt & getAPInt() const
Definition: ScalarEvolutionExpressions.h:57
ScalarEvolutionExpressions.h
llvm::SetVector::end
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:92
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:684
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
MemorySSA.h
llvm::ScalarEvolution::getZeroExtendExpr
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
Definition: ScalarEvolution.cpp:1577
llvm::GlobalValue::PrivateLinkage
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
Instructions.h
llvm::MemCpyInst
This class wraps the llvm.memcpy intrinsic.
Definition: IntrinsicInst.h:936
llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:212
llvm::MemIntrinsic::isVolatile
bool isVolatile() const
Definition: IntrinsicInst.h:883
SmallVector.h
match_LoopInvariant::match_LoopInvariant
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
Definition: LoopIdiomRecognize.cpp:2189
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:802
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:370
User.h
llvm::StoreInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:401
llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1404
llvm::Instruction::hasNoSignedWrap
bool hasNoSignedWrap() const
Determine whether the no signed wrap flag is set.
Definition: Instruction.cpp:140
llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:743
Dominators.h
llvm::inferLibFuncAttributes
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
Definition: BuildLibCalls.cpp:218
llvm::SmallPtrSetImplBase::empty
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
llvm::BasicBlock::size
size_t size() const
Definition: BasicBlock.h:306
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::CmpInst::getPredicate
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:796
llvm::ScalarEvolution::getZero
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
Definition: ScalarEvolution.h:626
getStartForNegStride
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
Definition: LoopIdiomRecognize.cpp:1035
TargetTransformInfo.h
llvm::LoopStandardAnalysisResults::MSSA
MemorySSA * MSSA
Definition: LoopAnalysisManager.h:61
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
llvm::PHINode
Definition: Instructions.h:2633
llvm::SmallVectorImpl< BasicBlock * >
llvm::TargetTransformInfo::getAtomicMemIntrinsicMaxElementSize
unsigned getAtomicMemIntrinsicMaxElementSize() const
Definition: TargetTransformInfo.cpp:937
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
DerivedTypes.h
llvm::SmallPtrSetImpl< Instruction * >
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:379
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::ScalarEvolution::getAddExpr
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
Definition: ScalarEvolution.cpp:2419
createFFSIntrinsic
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
Definition: LoopIdiomRecognize.cpp:1942
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
llvm::ConstantInt::isOne
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:200
llvm::LoadInst::isUnordered
bool isUnordered() const
Definition: Instructions.h:261
DisableLIRPMemset
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
deleteDeadInstruction
static void deleteDeadInstruction(Instruction *I)
Definition: LoopIdiomRecognize.cpp:348
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::VerifyMemorySSA
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:91
LLVMContext.h
llvm::TargetTransformInfo::TCC_Basic
@ TCC_Basic
The cost of a typical 'add' instruction.
Definition: TargetTransformInfo.h:263
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:382
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::PatternMatch::m_BasicBlock
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:160
llvm::TargetTransformInfo::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueKind Opd1Info=OK_AnyValue, OperandValueKind Opd2Info=OK_AnyValue, OperandValueProperties Opd1PropInfo=OP_None, OperandValueProperties Opd2PropInfo=OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
Definition: TargetTransformInfo.cpp:714
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
llvm::GlobalObject::setAlignment
void setAlignment(MaybeAlign Align)
Definition: Globals.cpp:117
llvm::SCEVNAryExpr::getOperand
const SCEV * getOperand(unsigned i) const
Definition: ScalarEvolutionExpressions.h:199
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3068
raw_ostream.h
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
llvm::ScalarEvolution::getBackedgeTakenCount
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
Definition: ScalarEvolution.cpp:7284
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:236
Value.h
getRecurrenceVar
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
Definition: LoopIdiomRecognize.cpp:1544
llvm::LoopStandardAnalysisResults::TLI
TargetLibraryInfo & TLI
Definition: LoopAnalysisManager.h:58
InitializePasses.h
BuildLibCalls.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1141
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:422
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Definition: AliasAnalysis.cpp:218
llvm::APIntOps::smax
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition: APInt.h:2116
llvm::BranchInst::isConditional
bool isConditional() const
Definition: Instructions.h:3147
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::IRBuilderBase::CreateCall
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2395
llvm::Optional::getValue
constexpr const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:282
llvm::BranchInst::getSuccessor
BasicBlock * getSuccessor(unsigned i) const
Definition: Instructions.h:3161
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:209
llvm::DisableLIRP::Memset
static bool Memset
When true, Memset is disabled.
Definition: LoopIdiomRecognize.h:33
SetVector.h
llvm::MemTransferBase::getSource
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Definition: IntrinsicInst.h:717
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37