LoopIdiomRecognize.cpp
1 //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements an idiom recognizer that transforms simple loops into a
10 // non-loop form. In cases that this kicks in, it can be a significant
11 // performance win.
12 //
13 // If compiling for code size we avoid idiom recognition if the resulting
14 // code could be larger than the code for the original loop. One way this could
15 // happen is if the loop is not removable after idiom recognition due to the
16 // presence of non-idiom instructions. The initial implementation of the
17 // heuristics applies to idioms in multi-block loops.
18 //
19 //===----------------------------------------------------------------------===//
20 //
21 // TODO List:
22 //
23 // Future loop memory idioms to recognize:
24 // memcmp, memmove, strlen, etc.
25 // Future floating point idioms to recognize in -ffast-math mode:
26 // fpowi
27 // Future integer operation idioms to recognize:
28 // ctpop
29 //
30 // Beware that isel's default lowering for ctpop is highly inefficient for
31 // i64 and larger types when i64 is legal and the value has few bits set. It
32 // would be good to enhance isel to emit a loop for ctpop in this case.
33 //
34 // This could recognize common matrix multiplies and dot product idioms and
35 // replace them with calls to BLAS (if linked in??).
36 //
37 //===----------------------------------------------------------------------===//
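//
// As a rough illustration of the rewrite this pass performs, a countable loop
// such as
//
//   for (unsigned i = 0; i != n; ++i)
//     a[i] = 0;
//
// is rewritten so that the preheader contains, approximately,
//
//   memset(a, 0, n * sizeof(*a));
//
// after which the now-dead loop body can be removed by later passes.
//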
38 
39 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40 #include "llvm/ADT/APInt.h"
41 #include "llvm/ADT/ArrayRef.h"
42 #include "llvm/ADT/DenseMap.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/STLExtras.h"
45 #include "llvm/ADT/SetVector.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/ADT/StringRef.h"
52 #include "llvm/Analysis/LoopInfo.h"
53 #include "llvm/Analysis/LoopPass.h"
62 #include "llvm/IR/Attributes.h"
63 #include "llvm/IR/BasicBlock.h"
64 #include "llvm/IR/Constant.h"
65 #include "llvm/IR/Constants.h"
66 #include "llvm/IR/DataLayout.h"
67 #include "llvm/IR/DebugLoc.h"
68 #include "llvm/IR/DerivedTypes.h"
69 #include "llvm/IR/Dominators.h"
70 #include "llvm/IR/GlobalValue.h"
71 #include "llvm/IR/GlobalVariable.h"
72 #include "llvm/IR/IRBuilder.h"
73 #include "llvm/IR/InstrTypes.h"
74 #include "llvm/IR/Instruction.h"
75 #include "llvm/IR/Instructions.h"
76 #include "llvm/IR/IntrinsicInst.h"
77 #include "llvm/IR/Intrinsics.h"
78 #include "llvm/IR/LLVMContext.h"
79 #include "llvm/IR/Module.h"
80 #include "llvm/IR/PassManager.h"
81 #include "llvm/IR/PatternMatch.h"
82 #include "llvm/IR/Type.h"
83 #include "llvm/IR/User.h"
84 #include "llvm/IR/Value.h"
85 #include "llvm/IR/ValueHandle.h"
86 #include "llvm/IR/Verifier.h"
87 #include "llvm/Pass.h"
88 #include "llvm/Support/Casting.h"
90 #include "llvm/Support/Debug.h"
92 #include "llvm/Transforms/Scalar.h"
98 #include <algorithm>
99 #include <cassert>
100 #include <cstdint>
101 #include <utility>
102 #include <vector>
103 
104 using namespace llvm;
105 
106 #define DEBUG_TYPE "loop-idiom"
107 
108 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
109 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
110 STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare");
111 
112 static cl::opt<bool> UseLIRCodeSizeHeurs(
113  "use-lir-code-size-heurs",
114  cl::desc("Use loop idiom recognition code size heuristics when compiling "
115  "with -Os/-Oz"),
116  cl::init(true), cl::Hidden);
117 
118 namespace {
119 
120 // FIXME: reinventing the wheel much? Is there a cleaner solution?
121 struct PMAbstraction {
122  virtual void markLoopAsDeleted(Loop *L) = 0;
123  virtual ~PMAbstraction() = default;
124 };
125 struct LegacyPMAbstraction : PMAbstraction {
126  LPPassManager &LPM;
127  LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {}
128  virtual ~LegacyPMAbstraction() = default;
129  void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); }
130 };
131 struct NewPMAbstraction : PMAbstraction {
132  LPMUpdater &Updater;
133  NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {}
134  virtual ~NewPMAbstraction() = default;
135  void markLoopAsDeleted(Loop *L) override {
136  Updater.markLoopAsDeleted(*L, L->getName());
137  }
138 };
139 
140 class LoopIdiomRecognize {
141  Loop *CurLoop = nullptr;
142  AliasAnalysis *AA;
143  DominatorTree *DT;
144  LoopInfo *LI;
145  ScalarEvolution *SE;
146  TargetLibraryInfo *TLI;
147  const TargetTransformInfo *TTI;
148  const DataLayout *DL;
149  PMAbstraction &LoopDeleter;
150  OptimizationRemarkEmitter &ORE;
151  bool ApplyCodeSizeHeuristics;
152 
153 public:
154  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
155  LoopInfo *LI, ScalarEvolution *SE,
156  TargetLibraryInfo *TLI,
157  const TargetTransformInfo *TTI,
158  const DataLayout *DL, PMAbstraction &LoopDeleter,
159  OptimizationRemarkEmitter &ORE)
160  : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL),
161  LoopDeleter(LoopDeleter), ORE(ORE) {}
162 
163  bool runOnLoop(Loop *L);
164 
165 private:
166  using StoreList = SmallVector<StoreInst *, 8>;
167  using StoreListMap = MapVector<Value *, StoreList>;
168 
169  StoreListMap StoreRefsForMemset;
170  StoreListMap StoreRefsForMemsetPattern;
171  StoreList StoreRefsForMemcpy;
172  bool HasMemset;
173  bool HasMemsetPattern;
174  bool HasMemcpy;
175  bool HasMemCmp;
176  bool HasBCmp;
177 
178  /// Return code for isLegalStore()
179  enum LegalStoreKind {
180  None = 0,
181  Memset,
182  MemsetPattern,
183  Memcpy,
184  UnorderedAtomicMemcpy,
185  DontUse // Dummy retval never to be used. Allows catching errors in retval
186  // handling.
187  };
188 
189  /// \name Countable Loop Idiom Handling
190  /// @{
191 
192  bool runOnCountableLoop();
193  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
194  SmallVectorImpl<BasicBlock *> &ExitBlocks);
195 
196  void collectStores(BasicBlock *BB);
197  LegalStoreKind isLegalStore(StoreInst *SI);
198  enum class ForMemset { No, Yes };
199  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
200  ForMemset For);
201  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
202 
203  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
204  unsigned StoreAlignment, Value *StoredVal,
205  Instruction *TheStore,
206  SmallPtrSetImpl<Instruction *> &Stores,
207  const SCEVAddRecExpr *Ev, const SCEV *BECount,
208  bool NegStride, bool IsLoopMemset = false);
209  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
210  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
211  bool IsLoopMemset = false);
212 
213  /// @}
214  /// \name Noncountable Loop Idiom Handling
215  /// @{
216 
217  bool runOnNoncountableLoop();
218 
219  struct CmpLoopStructure {
220  Value *BCmpValue, *LatchCmpValue;
221  BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB;
222  BasicBlock *LatchBrFinishBB, *LatchBrContinueBB;
223  };
224  bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const;
225  struct CmpOfLoads {
226  ICmpInst::Predicate BCmpPred;
227  Value *LoadSrcA, *LoadSrcB;
228  Value *LoadA, *LoadB;
229  };
230  bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const;
231  bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads,
232  CmpLoopStructure &CmpLoop) const;
233  bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads,
234  const SCEV *&SrcA, const SCEV *&SrcB,
235  const SCEV *&Iterations) const;
236  bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst,
237  LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA,
238  const SCEV *&SrcB, const SCEV *&NBytes) const;
239  BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual);
240  void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst,
241  LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA,
242  const SCEV *SrcB, const SCEV *NBytes);
243  bool recognizeBCmp();
244 
245  bool recognizePopcount();
246  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
247  PHINode *CntPhi, Value *Var);
248  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
249  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
250  Instruction *CntInst, PHINode *CntPhi,
251  Value *Var, Instruction *DefX,
252  const DebugLoc &DL, bool ZeroCheck,
253  bool IsCntPhiUsedOutsideLoop);
254 
255  /// @}
256 };
257 
258 class LoopIdiomRecognizeLegacyPass : public LoopPass {
259 public:
260  static char ID;
261 
262  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
263  initializeLoopIdiomRecognizeLegacyPassPass(
264  *PassRegistry::getPassRegistry());
265  }
266 
267  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
268  if (skipLoop(L))
269  return false;
270 
271  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
272  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
273  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
274  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
275  TargetLibraryInfo *TLI =
276  &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
277  *L->getHeader()->getParent());
278  const TargetTransformInfo *TTI =
279  &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
280  *L->getHeader()->getParent());
281  const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
282  LegacyPMAbstraction LoopDeleter(LPM);
283 
284  // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
285  // pass. Function analyses need to be preserved across loop transformations
286  // but ORE cannot be preserved (see comment before the pass definition).
287  OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
288 
289  LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE);
290  return LIR.runOnLoop(L);
291  }
292 
293  /// This transformation requires natural loop information & requires that
294  /// loop preheaders be inserted into the CFG.
295  void getAnalysisUsage(AnalysisUsage &AU) const override {
296  AU.addRequired<TargetLibraryInfoWrapperPass>();
297  AU.addRequired<TargetTransformInfoWrapperPass>();
298  getLoopAnalysisUsage(AU);
299  }
300 };
301 
302 } // end anonymous namespace
303 
304 char LoopIdiomRecognizeLegacyPass::ID = 0;
305 
306 PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
307  LoopStandardAnalysisResults &AR,
308  LPMUpdater &Updater) {
309  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
310 
311  const auto &FAM =
312  AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
313  Function *F = L.getHeader()->getParent();
314 
315  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
316  // FIXME: This should probably be optional rather than required.
317  if (!ORE)
318  report_fatal_error(
319  "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
320  "at a higher level");
321 
322  NewPMAbstraction LoopDeleter(Updater);
323  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
324  LoopDeleter, *ORE);
325  if (!LIR.runOnLoop(&L))
326  return PreservedAnalyses::all();
327 
328  return getLoopPassPreservedAnalyses();
329 }
330 
331 INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
332  "Recognize loop idioms", false, false)
333 INITIALIZE_PASS_DEPENDENCY(LoopPass)
334 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
335 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
336 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
337  "Recognize loop idioms", false, false)
338 
339 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
340 
341 static void deleteDeadInstruction(Instruction *I) {
342  I->replaceAllUsesWith(UndefValue::get(I->getType()));
343  I->eraseFromParent();
344 }
345 
346 //===----------------------------------------------------------------------===//
347 //
348 // Implementation of LoopIdiomRecognize
349 //
350 //===----------------------------------------------------------------------===//
351 
352 bool LoopIdiomRecognize::runOnLoop(Loop *L) {
353  CurLoop = L;
354  // If the loop could not be converted to canonical form, it must have an
355  // indirectbr in it, just give up.
356  if (!L->getLoopPreheader())
357  return false;
358 
359  // Disable loop idiom recognition if the function's name is a common idiom.
360  StringRef Name = L->getHeader()->getParent()->getName();
361  if (Name == "memset" || Name == "memcpy" || Name == "memcmp" ||
362  Name == "bcmp")
363  return false;
364 
365  // Determine if code size heuristics need to be applied.
366  ApplyCodeSizeHeuristics =
367  L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
368 
369  HasMemset = TLI->has(LibFunc_memset);
370  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
371  HasMemcpy = TLI->has(LibFunc_memcpy);
372  HasMemCmp = TLI->has(LibFunc_memcmp);
373  HasBCmp = TLI->has(LibFunc_bcmp);
374 
375  if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp)
376  if (SE->hasLoopInvariantBackedgeTakenCount(L))
377  return runOnCountableLoop();
378 
379  return runOnNoncountableLoop();
380 }
381 
382 bool LoopIdiomRecognize::runOnCountableLoop() {
383  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
384  assert(!isa<SCEVCouldNotCompute>(BECount) &&
385  "runOnCountableLoop() called on a loop without a predictable"
386  "backedge-taken count");
387 
388  // If this loop executes exactly one time, then it should be peeled, not
389  // optimized by this pass.
390  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
391  if (BECst->getAPInt() == 0)
392  return false;
393 
394  SmallVector<BasicBlock *, 8> ExitBlocks;
395  CurLoop->getUniqueExitBlocks(ExitBlocks);
396 
397  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
398  << CurLoop->getHeader()->getParent()->getName()
399  << "] Countable Loop %" << CurLoop->getHeader()->getName()
400  << "\n");
401 
402  bool MadeChange = false;
403 
404  // The following transforms hoist stores/memsets into the loop pre-header.
405  // Give up if the loop has instructions that may throw.
406  SimpleLoopSafetyInfo SafetyInfo;
407  SafetyInfo.computeLoopSafetyInfo(CurLoop);
408  if (SafetyInfo.anyBlockMayThrow())
409  return MadeChange;
410 
411  // Scan all the blocks in the loop that are not in subloops.
412  for (auto *BB : CurLoop->getBlocks()) {
413  // Ignore blocks in subloops.
414  if (LI->getLoopFor(BB) != CurLoop)
415  continue;
416 
417  MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
418  }
419  return MadeChange;
420 }
421 
422 static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
423  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
424  return ConstStride->getAPInt();
425 }
426 
427 /// getMemSetPatternValue - If a strided store of the specified value is safe to
428 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
429 /// be passed in. Otherwise, return null.
430 ///
431 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
432 /// just replicate their input array and then pass on to memset_pattern16.
433 static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
434  // FIXME: This could check for UndefValue because it can be merged into any
435  // other valid pattern.
436 
437  // If the value isn't a constant, we can't promote it to being in a constant
438  // array. We could theoretically do a store to an alloca or something, but
439  // that doesn't seem worthwhile.
440  Constant *C = dyn_cast<Constant>(V);
441  if (!C)
442  return nullptr;
443 
444  // Only handle simple values that are a power of two bytes in size.
445  uint64_t Size = DL->getTypeSizeInBits(V->getType());
446  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
447  return nullptr;
448 
449  // Don't care enough about darwin/ppc to implement this.
450  if (DL->isBigEndian())
451  return nullptr;
452 
453  // Convert to size in bytes.
454  Size /= 8;
455 
456  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
457  // if the top and bottom are the same (e.g. for vectors and large integers).
458  if (Size > 16)
459  return nullptr;
460 
461  // If the constant is exactly 16 bytes, just use it.
462  if (Size == 16)
463  return C;
464 
465  // Otherwise, we'll use an array of the constants.
466  unsigned ArraySize = 16 / Size;
467  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
468  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
469 }
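
// As a sketch of what the helper above produces: for a 4-byte store of the
// non-bytewise constant 0x01020304, Size is 4, so ArraySize is 16 / 4 = 4 and
// the returned pattern is roughly
//
//   [4 x i32] [i32 0x01020304, i32 0x01020304, i32 0x01020304, i32 0x01020304]
//
// i.e. the value replicated out to the 16 bytes that memset_pattern16 expects.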
470 
471 LoopIdiomRecognize::LegalStoreKind
472 LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
473  // Don't touch volatile stores.
474  if (SI->isVolatile())
475  return LegalStoreKind::None;
476  // We only want simple or unordered-atomic stores.
477  if (!SI->isUnordered())
478  return LegalStoreKind::None;
479 
480  // Don't convert stores of non-integral pointer types to memsets (which stores
481  // integers).
482  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
483  return LegalStoreKind::None;
484 
485  // Avoid merging nontemporal stores.
486  if (SI->getMetadata(LLVMContext::MD_nontemporal))
487  return LegalStoreKind::None;
488 
489  Value *StoredVal = SI->getValueOperand();
490  Value *StorePtr = SI->getPointerOperand();
491 
492  // Reject stores that are so large that they overflow an unsigned.
493  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
494  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
495  return LegalStoreKind::None;
496 
497  // See if the pointer expression is an AddRec like {base,+,1} on the current
498  // loop, which indicates a strided store. If we have something else, it's a
499  // random store we can't handle.
500  const SCEVAddRecExpr *StoreEv =
501  dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
502  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
503  return LegalStoreKind::None;
504 
505  // Check to see if we have a constant stride.
506  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
507  return LegalStoreKind::None;
508 
509  // See if the store can be turned into a memset.
510 
511  // If the stored value is a byte-wise value (like i32 -1), then it may be
512  // turned into a memset of i8 -1, assuming that all the consecutive bytes
513  // are stored. A store of i32 0x01020304 can never be turned into a memset,
514  // but it can be turned into memset_pattern if the target supports it.
515  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
516  Constant *PatternValue = nullptr;
517 
518  // Note: memset and memset_pattern on unordered-atomic are not yet supported
519  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
520 
521  // If we're allowed to form a memset, and the stored value would be
522  // acceptable for memset, use it.
523  if (!UnorderedAtomic && HasMemset && SplatValue &&
524  // Verify that the stored value is loop invariant. If not, we can't
525  // promote the memset.
526  CurLoop->isLoopInvariant(SplatValue)) {
527  // It looks like we can use SplatValue.
528  return LegalStoreKind::Memset;
529  } else if (!UnorderedAtomic && HasMemsetPattern &&
530  // Don't create memset_pattern16s with address spaces.
531  StorePtr->getType()->getPointerAddressSpace() == 0 &&
532  (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
533  // It looks like we can use PatternValue!
534  return LegalStoreKind::MemsetPattern;
535  }
536 
537  // Otherwise, see if the store can be turned into a memcpy.
538  if (HasMemcpy) {
539  // Check to see if the stride matches the size of the store. If so, then we
540  // know that every byte is touched in the loop.
541  APInt Stride = getStoreStride(StoreEv);
542  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
543  if (StoreSize != Stride && StoreSize != -Stride)
544  return LegalStoreKind::None;
545 
546  // The store must be feeding a non-volatile load.
547  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
548 
549  // Only allow non-volatile loads
550  if (!LI || LI->isVolatile())
551  return LegalStoreKind::None;
552  // Only allow simple or unordered-atomic loads
553  if (!LI->isUnordered())
554  return LegalStoreKind::None;
555 
556  // See if the pointer expression is an AddRec like {base,+,1} on the current
557  // loop, which indicates a strided load. If we have something else, it's a
558  // random load we can't handle.
559  const SCEVAddRecExpr *LoadEv =
560  dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
561  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
562  return LegalStoreKind::None;
563 
564  // The store and load must share the same stride.
565  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
566  return LegalStoreKind::None;
567 
568  // Success. This store can be converted into a memcpy.
569  UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
570  return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
571  : LegalStoreKind::Memcpy;
572  }
573  // This store can't be transformed into a memset/memcpy.
574  return LegalStoreKind::None;
575 }
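
// To make the classification above concrete, a few hypothetical loop-body
// stores and the LegalStoreKind they would likely map to:
//
//   a[i] = 0;           // bytewise splat            -> Memset
//   a[i] = -1;          // i32 -1 is all 0xFF bytes  -> Memset
//   a[i] = 0x01020304;  // constant, not bytewise    -> MemsetPattern (if available)
//   a[i] = b[i];        // store fed by strided load -> Memcpy
//
// subject to all of the stride, invariance and atomicity checks performed above.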
576 
577 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
578  StoreRefsForMemset.clear();
579  StoreRefsForMemsetPattern.clear();
580  StoreRefsForMemcpy.clear();
581  for (Instruction &I : *BB) {
582  StoreInst *SI = dyn_cast<StoreInst>(&I);
583  if (!SI)
584  continue;
585 
586  // Make sure this is a strided store with a constant stride.
587  switch (isLegalStore(SI)) {
588  case LegalStoreKind::None:
589  // Nothing to do
590  break;
591  case LegalStoreKind::Memset: {
592  // Find the base pointer.
593  Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
594  StoreRefsForMemset[Ptr].push_back(SI);
595  } break;
596  case LegalStoreKind::MemsetPattern: {
597  // Find the base pointer.
598  Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
599  StoreRefsForMemsetPattern[Ptr].push_back(SI);
600  } break;
601  case LegalStoreKind::Memcpy:
602  case LegalStoreKind::UnorderedAtomicMemcpy:
603  StoreRefsForMemcpy.push_back(SI);
604  break;
605  default:
606  assert(false && "unhandled return value");
607  break;
608  }
609  }
610 }
611 
612 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
613 /// with the specified backedge count. This block is known to be in the current
614 /// loop and not in any subloops.
615 bool LoopIdiomRecognize::runOnLoopBlock(
616  BasicBlock *BB, const SCEV *BECount,
617  SmallVectorImpl<BasicBlock *> &ExitBlocks) {
618  // We can only promote stores in this block if they are unconditionally
619  // executed in the loop. For a block to be unconditionally executed, it has
620  // to dominate all the exit blocks of the loop. Verify this now.
621  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
622  if (!DT->dominates(BB, ExitBlocks[i]))
623  return false;
624 
625  bool MadeChange = false;
626  // Look for store instructions, which may be optimized to memset/memcpy.
627  collectStores(BB);
628 
629  // Look for a single store or sets of stores with a common base, which can be
630  // optimized into a memset (memset_pattern). The latter most commonly happens
631  // with structs and hand-unrolled loops.
632  for (auto &SL : StoreRefsForMemset)
633  MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
634 
635  for (auto &SL : StoreRefsForMemsetPattern)
636  MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
637 
638  // Optimize the store into a memcpy, if it feeds a similarly strided load.
639  for (auto &SI : StoreRefsForMemcpy)
640  MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
641 
642  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
643  Instruction *Inst = &*I++;
644  // Look for memset instructions, which may be optimized to a larger memset.
645  if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
646  WeakTrackingVH InstPtr(&*I);
647  if (!processLoopMemSet(MSI, BECount))
648  continue;
649  MadeChange = true;
650 
651  // If processing the memset invalidated our iterator, start over from the
652  // top of the block.
653  if (!InstPtr)
654  I = BB->begin();
655  continue;
656  }
657  }
658 
659  return MadeChange;
660 }
661 
662 /// See if this store(s) can be promoted to a memset.
663 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
664  const SCEV *BECount, ForMemset For) {
665  // Try to find consecutive stores that can be transformed into memsets.
666  SetVector<StoreInst *> Heads, Tails;
667  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
668 
669  // Do a quadratic search on all of the given stores and find
670  // all of the pairs of stores that follow each other.
671  SmallVector<unsigned, 16> IndexQueue;
672  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
673  assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
674 
675  Value *FirstStoredVal = SL[i]->getValueOperand();
676  Value *FirstStorePtr = SL[i]->getPointerOperand();
677  const SCEVAddRecExpr *FirstStoreEv =
678  cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
679  APInt FirstStride = getStoreStride(FirstStoreEv);
680  unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
681 
682  // See if we can optimize just this store in isolation.
683  if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
684  Heads.insert(SL[i]);
685  continue;
686  }
687 
688  Value *FirstSplatValue = nullptr;
689  Constant *FirstPatternValue = nullptr;
690 
691  if (For == ForMemset::Yes)
692  FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
693  else
694  FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
695 
696  assert((FirstSplatValue || FirstPatternValue) &&
697  "Expected either splat value or pattern value.");
698 
699  IndexQueue.clear();
700  // If a store has multiple consecutive store candidates, search Stores
701  // array according to the sequence: from i+1 to e, then from i-1 to 0.
702  // This is because pairing with the immediately succeeding or preceding
703  // candidate usually creates the best chance of finding a memset opportunity.
704  unsigned j = 0;
705  for (j = i + 1; j < e; ++j)
706  IndexQueue.push_back(j);
707  for (j = i; j > 0; --j)
708  IndexQueue.push_back(j - 1);
709 
710  for (auto &k : IndexQueue) {
711  assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
712  Value *SecondStorePtr = SL[k]->getPointerOperand();
713  const SCEVAddRecExpr *SecondStoreEv =
714  cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
715  APInt SecondStride = getStoreStride(SecondStoreEv);
716 
717  if (FirstStride != SecondStride)
718  continue;
719 
720  Value *SecondStoredVal = SL[k]->getValueOperand();
721  Value *SecondSplatValue = nullptr;
722  Constant *SecondPatternValue = nullptr;
723 
724  if (For == ForMemset::Yes)
725  SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
726  else
727  SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
728 
729  assert((SecondSplatValue || SecondPatternValue) &&
730  "Expected either splat value or pattern value.");
731 
732  if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
733  if (For == ForMemset::Yes) {
734  if (isa<UndefValue>(FirstSplatValue))
735  FirstSplatValue = SecondSplatValue;
736  if (FirstSplatValue != SecondSplatValue)
737  continue;
738  } else {
739  if (isa<UndefValue>(FirstPatternValue))
740  FirstPatternValue = SecondPatternValue;
741  if (FirstPatternValue != SecondPatternValue)
742  continue;
743  }
744  Tails.insert(SL[k]);
745  Heads.insert(SL[i]);
746  ConsecutiveChain[SL[i]] = SL[k];
747  break;
748  }
749  }
750  }
751 
752  // We may run into multiple chains that merge into a single chain. We mark the
753  // stores that we transformed so that we don't visit the same store twice.
754  SmallPtrSet<Value *, 16> TransformedStores;
755  bool Changed = false;
756 
757  // For stores that start but don't end a link in the chain:
758  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
759  it != e; ++it) {
760  if (Tails.count(*it))
761  continue;
762 
763  // We found a store instr that starts a chain. Now follow the chain and try
764  // to transform it.
765  SmallPtrSet<Instruction *, 8> AdjacentStores;
766  StoreInst *I = *it;
767 
768  StoreInst *HeadStore = I;
769  unsigned StoreSize = 0;
770 
771  // Collect the chain into a list.
772  while (Tails.count(I) || Heads.count(I)) {
773  if (TransformedStores.count(I))
774  break;
775  AdjacentStores.insert(I);
776 
777  StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
778  // Move to the next value in the chain.
779  I = ConsecutiveChain[I];
780  }
781 
782  Value *StoredVal = HeadStore->getValueOperand();
783  Value *StorePtr = HeadStore->getPointerOperand();
784  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
785  APInt Stride = getStoreStride(StoreEv);
786 
787  // Check to see if the stride matches the size of the stores. If so, then
788  // we know that every byte is touched in the loop.
789  if (StoreSize != Stride && StoreSize != -Stride)
790  continue;
791 
792  bool NegStride = StoreSize == -Stride;
793 
794  if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
795  StoredVal, HeadStore, AdjacentStores, StoreEv,
796  BECount, NegStride)) {
797  TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
798  Changed = true;
799  }
800  }
801 
802  return Changed;
803 }
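
// A sketch of the store-merging case handled above: a hand-unrolled loop such as
//
//   for (unsigned i = 0; i != n; i += 2) {
//     a[i] = 0;
//     a[i + 1] = 0;
//   }
//
// has two stores whose stride (8 bytes for i32 elements) does not match their
// individual 4-byte store sizes, but the stores are consecutive; they are
// chained together (Heads/Tails/ConsecutiveChain) and handed to
// processLoopStridedStore as one 8-byte-wide access, so the whole loop can
// still become a single memset.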
804 
805 /// processLoopMemSet - See if this memset can be promoted to a large memset.
806 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
807  const SCEV *BECount) {
808  // We can only handle non-volatile memsets with a constant size.
809  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
810  return false;
811 
812  // If we're not allowed to hack on memset, we fail.
813  if (!HasMemset)
814  return false;
815 
816  Value *Pointer = MSI->getDest();
817 
818  // See if the pointer expression is an AddRec like {base,+,1} on the current
819  // loop, which indicates a strided store. If we have something else, it's a
820  // random store we can't handle.
821  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
822  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
823  return false;
824 
825  // Reject memsets that are so large that they overflow an unsigned.
826  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
827  if ((SizeInBytes >> 32) != 0)
828  return false;
829 
830  // Check to see if the stride matches the size of the memset. If so, then we
831  // know that every byte is touched in the loop.
832  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
833  if (!ConstStride)
834  return false;
835 
836  APInt Stride = ConstStride->getAPInt();
837  if (SizeInBytes != Stride && SizeInBytes != -Stride)
838  return false;
839 
840  // Verify that the memset value is loop invariant. If not, we can't promote
841  // the memset.
842  Value *SplatValue = MSI->getValue();
843  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
844  return false;
845 
846  SmallPtrSet<Instruction *, 1> MSIs;
847  MSIs.insert(MSI);
848  bool NegStride = SizeInBytes == -Stride;
849  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
850  MSI->getDestAlignment(), SplatValue, MSI, MSIs,
851  Ev, BECount, NegStride, /*IsLoopMemset=*/true);
852 }
853 
854 /// mayLoopAccessLocation - Return true if the specified loop might access the
855 /// specified pointer location, which is a loop-strided access. The 'Access'
856 /// argument specifies what the verboten forms of access are (read or write).
857 static bool
858 mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
859  const SCEV *BECount, unsigned StoreSize,
860  AliasAnalysis &AA,
861  SmallPtrSetImpl<Instruction *> &IgnoredStores) {
862  // Get the location that may be stored across the loop. Since the access is
863  // strided positively through memory, we say that the modified location starts
864  // at the pointer and has infinite size.
865  LocationSize AccessSize = LocationSize::unknown();
866 
867  // If the loop iterates a fixed number of times, we can refine the access size
868  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
869  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
870  AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
871  StoreSize);
872 
873  // TODO: For this to be really effective, we have to dive into the pointer
874  // operand in the store. A store to &A[i] over 100 iterations will always
875  // return may-alias with a store to &A[100]; we need StoreLoc to be "A" with
876  // size 100, which will then no-alias a store to &A[100].
877  MemoryLocation StoreLoc(Ptr, AccessSize);
878 
879  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
880  ++BI)
881  for (Instruction &I : **BI)
882  if (IgnoredStores.count(&I) == 0 &&
883  isModOrRefSet(
884  intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
885  return true;
886 
887  return false;
888 }
889 
890 // If we have a negative stride, Start refers to the end of the memory location
891 // we're trying to memset. Therefore, we need to recompute the base pointer,
892 // which is just Start - BECount*Size.
893 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
894  Type *IntPtr, unsigned StoreSize,
895  ScalarEvolution *SE) {
896  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
897  if (StoreSize != 1)
898  Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
899  SCEV::FlagNUW);
900  return SE->getMinusSCEV(Start, Index);
901 }
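
// For instance, for a loop that stores downward through an i8 array,
//
//   for (int i = n - 1; i >= 0; --i)
//     a[i] = 0;
//
// the recurrence starts at &a[n-1] with stride -1 and the backedge-taken count
// is n - 1. The helper above then computes &a[n-1] - (n-1)*1 == &a[0], the
// lowest address actually written and the correct base for the memset.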
902 
903 /// Compute the number of bytes as a SCEV from the backedge taken count.
904 ///
905 /// This also maps the SCEV into the provided type and tries to handle the
906 /// computation in a way that will fold cleanly.
907 static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
908  unsigned StoreSize, Loop *CurLoop,
909  const DataLayout *DL, ScalarEvolution *SE) {
910  const SCEV *NumBytesS;
911  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
912  // pointer size if it isn't already.
913  //
914  // If we're going to need to zero extend the BE count, check if we can add
915  // one to it prior to zero extending without overflow. Provided this is safe,
916  // it allows better simplification of the +1.
917  if (DL->getTypeSizeInBits(BECount->getType()) <
918  DL->getTypeSizeInBits(IntPtr) &&
919  SE->isLoopEntryGuardedByCond(
920  CurLoop, ICmpInst::ICMP_NE, BECount,
921  SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
922  NumBytesS = SE->getZeroExtendExpr(
923  SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
924  IntPtr);
925  } else {
926  NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
927  SE->getOne(IntPtr), SCEV::FlagNUW);
928  }
929 
930  // And scale it based on the store size.
931  if (StoreSize != 1) {
932  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
933  SCEV::FlagNUW);
934  }
935  return NumBytesS;
936 }
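
// A quick sanity check of the formula above: for a loop writing n i32 elements,
// the backedge-taken count is n - 1 and StoreSize is 4, so
//
//   NumBytes = (BECount + 1) * StoreSize = ((n - 1) + 1) * 4 = 4 * n
//
// exactly the number of bytes the original loop stores.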
937 
938 /// processLoopStridedStore - We see a strided store of some value. If we can
939 /// transform this into a memset or memset_pattern in the loop preheader, do so.
940 bool LoopIdiomRecognize::processLoopStridedStore(
941  Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
942  Value *StoredVal, Instruction *TheStore,
943  SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
944  const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
945  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
946  Constant *PatternValue = nullptr;
947 
948  if (!SplatValue)
949  PatternValue = getMemSetPatternValue(StoredVal, DL);
950 
951  assert((SplatValue || PatternValue) &&
952  "Expected either splat value or pattern value.");
953 
954  // The trip count of the loop and the base pointer of the addrec SCEV are
955  // guaranteed to be loop invariant, which means that they should dominate the
956  // header. This allows us to insert code for it in the preheader.
957  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
958  BasicBlock *Preheader = CurLoop->getLoopPreheader();
959  IRBuilder<> Builder(Preheader->getTerminator());
960  SCEVExpander Expander(*SE, *DL, "loop-idiom");
961 
962  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
963  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
964 
965  const SCEV *Start = Ev->getStart();
966  // Handle negative strided loops.
967  if (NegStride)
968  Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
969 
970  // TODO: ideally we should still be able to generate memset if SCEV expander
971  // is taught to generate the dependencies at the latest point.
972  if (!isSafeToExpand(Start, *SE))
973  return false;
974 
975  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
976  // this into a memset in the loop preheader now if we want. However, this
977  // would be unsafe to do if there is anything else in the loop that may read
978  // or write to the aliased location. Check for any overlap by generating the
979  // base pointer and checking the region.
980  Value *BasePtr =
981  Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
982  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
983  StoreSize, *AA, Stores)) {
984  Expander.clear();
985  // If we generated new code for the base pointer, clean up.
986  RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
987  return false;
988  }
989 
990  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
991  return false;
992 
993  // Okay, everything looks good, insert the memset.
994 
995  const SCEV *NumBytesS =
996  getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
997 
998  // TODO: ideally we should still be able to generate memset if SCEV expander
999  // is taught to generate the dependencies at the latest point.
1000  if (!isSafeToExpand(NumBytesS, *SE))
1001  return false;
1002 
1003  Value *NumBytes =
1004  Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
1005 
1006  CallInst *NewCall;
1007  if (SplatValue) {
1008  NewCall =
1009  Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
1010  } else {
1011  // Everything is emitted in default address space
1012  Type *Int8PtrTy = DestInt8PtrTy;
1013 
1014  Module *M = TheStore->getModule();
1015  StringRef FuncName = "memset_pattern16";
1016  FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1017  Int8PtrTy, Int8PtrTy, IntPtr);
1018  inferLibFuncAttributes(M, FuncName, *TLI);
1019 
1020  // Otherwise we should form a memset_pattern16. PatternValue is known to be
1021  // a constant array of 16 bytes. Plop the value into a mergable global.
1022  GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1023  GlobalValue::PrivateLinkage,
1024  PatternValue, ".memset_pattern");
1025  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1026  GV->setAlignment(Align(16));
1027  Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1028  NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1029  }
1030 
1031  LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1032  << " from store to: " << *Ev << " at: " << *TheStore
1033  << "\n");
1034  NewCall->setDebugLoc(TheStore->getDebugLoc());
1035 
1036  ORE.emit([&]() {
1037  return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
1038  NewCall->getDebugLoc(), Preheader)
1039  << "Transformed loop-strided store into a call to "
1040  << ore::NV("NewFunction", NewCall->getCalledFunction())
1041  << "() function";
1042  });
1043 
1044  // Okay, the memset has been formed. Zap the original store and anything that
1045  // feeds into it.
1046  for (auto *I : Stores)
1047  deleteDeadInstruction(I);
1048  ++NumMemSet;
1049  return true;
1050 }
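
// Putting the pieces above together for the non-splat case: a loop storing the
// constant 0x01020304 would, roughly, end up with preheader code of the shape
//
//   @.memset_pattern = private constant [4 x i32] [...]   ; the pattern global
//   call void @memset_pattern16(i8* %base, i8* %pattern, i64 %numbytes)
//
// while the splat case simply becomes an ordinary @llvm.memset call with the
// computed base pointer, splat byte and byte count.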
1051 
1052 /// If the stored value is a strided load in the same loop with the same stride
1053 /// this may be transformable into a memcpy. This kicks in for stuff like
1054 /// for (i) A[i] = B[i];
1055 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1056  const SCEV *BECount) {
1057  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1058 
1059  Value *StorePtr = SI->getPointerOperand();
1060  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1061  APInt Stride = getStoreStride(StoreEv);
1062  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1063  bool NegStride = StoreSize == -Stride;
1064 
1065  // The store must be feeding a non-volatile load.
1066  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1067  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1068 
1069  // See if the pointer expression is an AddRec like {base,+,1} on the current
1070  // loop, which indicates a strided load. If we have something else, it's a
1071  // random load we can't handle.
1072  const SCEVAddRecExpr *LoadEv =
1073  cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1074 
1075  // The trip count of the loop and the base pointer of the addrec SCEV are
1076  // guaranteed to be loop invariant, which means that they should dominate the
1077  // header. This allows us to insert code for it in the preheader.
1078  BasicBlock *Preheader = CurLoop->getLoopPreheader();
1079  IRBuilder<> Builder(Preheader->getTerminator());
1080  SCEVExpander Expander(*SE, *DL, "loop-idiom");
1081 
1082  const SCEV *StrStart = StoreEv->getStart();
1083  unsigned StrAS = SI->getPointerAddressSpace();
1084  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
1085 
1086  // Handle negative strided loops.
1087  if (NegStride)
1088  StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
1089 
1090  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1091  // this into a memcpy in the loop preheader now if we want. However, this
1092  // would be unsafe to do if there is anything else in the loop that may read
1093  // or write the memory region we're storing to. This includes the load that
1094  // feeds the stores. Check for an alias by generating the base address and
1095  // checking everything.
1096  Value *StoreBasePtr = Expander.expandCodeFor(
1097  StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1098 
1099  SmallPtrSet<Instruction *, 1> Stores;
1100  Stores.insert(SI);
1101  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1102  StoreSize, *AA, Stores)) {
1103  Expander.clear();
1104  // If we generated new code for the base pointer, clean up.
1105  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1106  return false;
1107  }
1108 
1109  const SCEV *LdStart = LoadEv->getStart();
1110  unsigned LdAS = LI->getPointerAddressSpace();
1111 
1112  // Handle negative strided loops.
1113  if (NegStride)
1114  LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
1115 
1116  // For a memcpy, we have to make sure that the input array is not being
1117  // mutated by the loop.
1118  Value *LoadBasePtr = Expander.expandCodeFor(
1119  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1120 
1121  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1122  StoreSize, *AA, Stores)) {
1123  Expander.clear();
1124  // If we generated new code for the base pointer, clean up.
1125  RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
1126  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1127  return false;
1128  }
1129 
1130  if (avoidLIRForMultiBlockLoop())
1131  return false;
1132 
1133  // Okay, everything is safe, we can transform this!
1134 
1135  const SCEV *NumBytesS =
1136  getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
1137 
1138  Value *NumBytes =
1139  Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
1140 
1141  CallInst *NewCall = nullptr;
1142  // Check whether to generate an unordered atomic memcpy:
1143  // If the load or store are atomic, then they must necessarily be unordered
1144  // by previous checks.
1145  if (!SI->isAtomic() && !LI->isAtomic())
1146  NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
1147  LoadBasePtr, LI->getAlignment(), NumBytes);
1148  else {
1149  // We cannot allow unaligned ops for unordered load/store, so reject
1150  // anything where the alignment isn't at least the element size.
1151  unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
1152  if (Align < StoreSize)
1153  return false;
1154 
1155  // If the element.atomic memcpy is not lowered into explicit
1156  // loads/stores later, then it will be lowered into an element-size
1157  // specific lib call. If the lib call doesn't exist for our store size, then
1158  // we shouldn't generate the memcpy.
1159  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1160  return false;
1161 
1162  // Create the call.
1163  // Note that unordered atomic loads/stores are *required* by the spec to
1164  // have an alignment but non-atomic loads/stores may not.
1165  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1166  StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
1167  NumBytes, StoreSize);
1168  }
1169  NewCall->setDebugLoc(SI->getDebugLoc());
1170 
1171  LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
1172  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1173  << " from store ptr=" << *StoreEv << " at: " << *SI
1174  << "\n");
1175 
1176  ORE.emit([&]() {
1177  return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1178  NewCall->getDebugLoc(), Preheader)
1179  << "Formed a call to "
1180  << ore::NV("NewFunction", NewCall->getCalledFunction())
1181  << "() function";
1182  });
1183 
1184  // Okay, the memcpy has been formed. Zap the original store and anything that
1185  // feeds into it.
1186  deleteDeadInstruction(SI);
1187  ++NumMemCpy;
1188  return true;
1189 }
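
// As a concrete instance of the transformation above, the canonical copy loop
//
//   for (unsigned i = 0; i != n; ++i)
//     a[i] = b[i];
//
// becomes, roughly, a single preheader call
//
//   memcpy(a, b, n * sizeof(*a));
//
// (or its unordered-atomic element-wise variant when the load/store are atomic),
// provided nothing else in the loop reads or writes either array.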
1190 
1191 // When compiling for codesize we avoid idiom recognition for a multi-block loop
1192 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1193 //
1194 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1195  bool IsLoopMemset) {
1196  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1197  if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1198  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1199  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1200  << " avoided: multi-block top-level loop\n");
1201  return true;
1202  }
1203  }
1204 
1205  return false;
1206 }
1207 
1208 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1209  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1210  << CurLoop->getHeader()->getParent()->getName()
1211  << "] Noncountable Loop %"
1212  << CurLoop->getHeader()->getName() << "\n");
1213 
1214  return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS();
1215 }
1216 
1217 /// Check if the given conditional branch is based on a comparison between a
1218 /// variable and zero, and if the variable is non-zero (or zero, when JmpOnZero
1219 /// is true) the control yields to the loop entry. If the branch matches the
1220 /// behavior, the variable involved in the comparison is returned. This function
1221 /// will be called to see if the precondition and postcondition of the loop are
1222 /// in desirable form.
1223 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1224  bool JmpOnZero = false) {
1225  if (!BI || !BI->isConditional())
1226  return nullptr;
1227 
1228  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1229  if (!Cond)
1230  return nullptr;
1231 
1232  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1233  if (!CmpZero || !CmpZero->isZero())
1234  return nullptr;
1235 
1236  BasicBlock *TrueSucc = BI->getSuccessor(0);
1237  BasicBlock *FalseSucc = BI->getSuccessor(1);
1238  if (JmpOnZero)
1239  std::swap(TrueSucc, FalseSucc);
1240 
1241  ICmpInst::Predicate Pred = Cond->getPredicate();
1242  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1243  (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1244  return Cond->getOperand(0);
1245 
1246  return nullptr;
1247 }
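
// For example, given a branch of the form
//
//   br i1 (icmp ne %x, 0), label %loop-entry, label %exit
//
// matchCondition returns %x when called with that loop entry block. With
// JmpOnZero == true the accepted shape is instead the one that enters the loop
// when the value is zero, e.g. "br i1 (icmp eq %x, 0), label %loop-entry, ...".
// Comparisons against anything other than zero are rejected.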
1248 
1249 // Check if the recurrence variable `VarX` is in the right form to create
1250 // the idiom. Returns the value coerced to a PHINode if so.
1251 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1252  BasicBlock *LoopEntry) {
1253  auto *PhiX = dyn_cast<PHINode>(VarX);
1254  if (PhiX && PhiX->getParent() == LoopEntry &&
1255  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1256  return PhiX;
1257  return nullptr;
1258 }
1259 
1260 /// Return true iff the idiom is detected in the loop.
1261 ///
1262 /// Additionally:
1263 /// 1) \p CntInst is set to the instruction counting the population bit.
1264 /// 2) \p CntPhi is set to the corresponding phi node.
1265 /// 3) \p Var is set to the value whose population bits are being counted.
1266 ///
1267 /// The core idiom we are trying to detect is:
1268 /// \code
1269 /// if (x0 != 0)
1270 /// goto loop-exit // the precondition of the loop
1271 /// cnt0 = init-val;
1272 /// do {
1273 /// x1 = phi (x0, x2);
1274 /// cnt1 = phi(cnt0, cnt2);
1275 ///
1276 /// cnt2 = cnt1 + 1;
1277 /// ...
1278 /// x2 = x1 & (x1 - 1);
1279 /// ...
1280 /// } while(x != 0);
1281 ///
1282 /// loop-exit:
1283 /// \endcode
1284 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1285  Instruction *&CntInst, PHINode *&CntPhi,
1286  Value *&Var) {
1287  // step 1: Check to see if the loop-back branch matches this pattern:
1288  // "if (a!=0) goto loop-entry".
1289  BasicBlock *LoopEntry;
1290  Instruction *DefX2, *CountInst;
1291  Value *VarX1, *VarX0;
1292  PHINode *PhiX, *CountPhi;
1293 
1294  DefX2 = CountInst = nullptr;
1295  VarX1 = VarX0 = nullptr;
1296  PhiX = CountPhi = nullptr;
1297  LoopEntry = *(CurLoop->block_begin());
1298 
1299  // step 1: Check if the loop-back branch is in desirable form.
1300  {
1301  if (Value *T = matchCondition(
1302  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1303  DefX2 = dyn_cast<Instruction>(T);
1304  else
1305  return false;
1306  }
1307 
1308  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1309  {
1310  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1311  return false;
1312 
1313  BinaryOperator *SubOneOp;
1314 
1315  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1316  VarX1 = DefX2->getOperand(1);
1317  else {
1318  VarX1 = DefX2->getOperand(0);
1319  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1320  }
1321  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1322  return false;
1323 
1324  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1325  if (!Dec ||
1326  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1327  (SubOneOp->getOpcode() == Instruction::Add &&
1328  Dec->isMinusOne()))) {
1329  return false;
1330  }
1331  }
1332 
1333  // step 3: Check the recurrence of variable X
1334  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1335  if (!PhiX)
1336  return false;
1337 
1338  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1339  {
1340  CountInst = nullptr;
1341  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1342  IterE = LoopEntry->end();
1343  Iter != IterE; Iter++) {
1344  Instruction *Inst = &*Iter;
1345  if (Inst->getOpcode() != Instruction::Add)
1346  continue;
1347 
1348  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1349  if (!Inc || !Inc->isOne())
1350  continue;
1351 
1352  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1353  if (!Phi)
1354  continue;
1355 
1356  // Check if the result of the instruction is live out of the loop.
1357  bool LiveOutLoop = false;
1358  for (User *U : Inst->users()) {
1359  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1360  LiveOutLoop = true;
1361  break;
1362  }
1363  }
1364 
1365  if (LiveOutLoop) {
1366  CountInst = Inst;
1367  CountPhi = Phi;
1368  break;
1369  }
1370  }
1371 
1372  if (!CountInst)
1373  return false;
1374  }
1375 
1376  // step 5: check if the precondition is in this form:
1377  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1378  {
1379  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1380  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1381  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1382  return false;
1383 
1384  CntInst = CountInst;
1385  CntPhi = CountPhi;
1386  Var = T;
1387  }
1388 
1389  return true;
1390 }
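
// A small worked example of the idiom above: starting from x0 = 0b0101 (5),
// the loop body executes x2 = x1 & (x1 - 1) twice (5 -> 4 -> 0) before x
// becomes zero, so the final count is 2 == popcount(5); each iteration clears
// exactly the lowest set bit, which is why the trip count equals the
// population count of the initial value.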
1391 
1392 /// Return true if the idiom is detected in the loop.
1393 ///
1394 /// Additionally:
1395 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1396 /// or nullptr if there is no such.
1397 /// 2) \p CntPhi is set to the corresponding phi node
1398 /// or nullptr if there is no such.
1399 /// 3) \p Var is set to the value whose CTLZ could be used.
1400 /// 4) \p DefX is set to the instruction calculating Loop exit condition.
1401 ///
1402 /// The core idiom we are trying to detect is:
1403 /// \code
1404 /// if (x0 == 0)
1405 /// goto loop-exit // the precondition of the loop
1406 /// cnt0 = init-val;
1407 /// do {
1408 /// x = phi (x0, x.next); //PhiX
1409 /// cnt = phi(cnt0, cnt.next);
1410 ///
1411 /// cnt.next = cnt + 1;
1412 /// ...
1413 /// x.next = x >> 1; // DefX
1414 /// ...
1415 /// } while(x.next != 0);
1416 ///
1417 /// loop-exit:
1418 /// \endcode
1419 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1420  Intrinsic::ID &IntrinID, Value *&InitX,
1421  Instruction *&CntInst, PHINode *&CntPhi,
1422  Instruction *&DefX) {
1423  BasicBlock *LoopEntry;
1424  Value *VarX = nullptr;
1425 
1426  DefX = nullptr;
1427  CntInst = nullptr;
1428  CntPhi = nullptr;
1429  LoopEntry = *(CurLoop->block_begin());
1430 
1431  // step 1: Check if the loop-back branch is in desirable form.
1432  if (Value *T = matchCondition(
1433  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1434  DefX = dyn_cast<Instruction>(T);
1435  else
1436  return false;
1437 
1438  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1439  if (!DefX || !DefX->isShift())
1440  return false;
1441  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1442  Intrinsic::ctlz;
1443  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1444  if (!Shft || !Shft->isOne())
1445  return false;
1446  VarX = DefX->getOperand(0);
1447 
1448  // step 3: Check the recurrence of variable X
1449  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1450  if (!PhiX)
1451  return false;
1452 
1453  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1454 
1455  // Make sure the initial value can't be negative otherwise the ashr in the
1456  // loop might never reach zero which would make the loop infinite.
1457  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1458  return false;
1459 
1460  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1461  // TODO: We can skip the step. If loop trip count is known (CTLZ),
1462  // then all uses of "cnt.next" could be optimized to the trip count
1463  // plus "cnt0". Currently it is not optimized.
1464  // This step could be used to detect POPCNT instruction:
1465  // cnt.next = cnt + (x.next & 1)
1466  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1467  IterE = LoopEntry->end();
1468  Iter != IterE; Iter++) {
1469  Instruction *Inst = &*Iter;
1470  if (Inst->getOpcode() != Instruction::Add)
1471  continue;
1472 
1473  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1474  if (!Inc || !Inc->isOne())
1475  continue;
1476 
1477  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1478  if (!Phi)
1479  continue;
1480 
1481  CntInst = Inst;
1482  CntPhi = Phi;
1483  break;
1484  }
1485  if (!CntInst)
1486  return false;
1487 
1488  return true;
1489 }
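
// A worked example of the shift-until-zero idiom above: with an 8-bit x0 of
// 0b00010110 (22) and a logical shift right by one each iteration, the loop
// runs 5 times (22 -> 11 -> 5 -> 2 -> 1 -> 0). That is exactly
// bitwidth - ctlz(x0) = 8 - 3 = 5, which is what makes the loop countable once
// the ctlz intrinsic is inserted (cttz plays the same role for the shl form).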
1490 
1491 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1492 /// to countable (with CTLZ / CTTZ trip count). If a CTLZ / CTTZ trip count is
1493 /// inserted, returns true; otherwise, returns false.
1494 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1495  // Give up if the loop has multiple blocks or multiple backedges.
1496  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1497  return false;
1498 
1499  Intrinsic::ID IntrinID;
1500  Value *InitX;
1501  Instruction *DefX = nullptr;
1502  PHINode *CntPhi = nullptr;
1503  Instruction *CntInst = nullptr;
1504  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1505  // this is always 6.
1506  size_t IdiomCanonicalSize = 6;
1507 
1508  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1509  CntInst, CntPhi, DefX))
1510  return false;
1511 
1512  bool IsCntPhiUsedOutsideLoop = false;
1513  for (User *U : CntPhi->users())
1514  if (!CurLoop->contains(cast<Instruction>(U))) {
1515  IsCntPhiUsedOutsideLoop = true;
1516  break;
1517  }
1518  bool IsCntInstUsedOutsideLoop = false;
1519  for (User *U : CntInst->users())
1520  if (!CurLoop->contains(cast<Instruction>(U))) {
1521  IsCntInstUsedOutsideLoop = true;
1522  break;
1523  }
1524  // If both CntInst and CntPhi are used outside the loop the profitability
1525  // is questionable.
1526  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1527  return false;
1528 
1529  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1530  // when X is 0. If we cannot guarantee X != 0, we need to check this
1531  // when expanding.
1532  bool ZeroCheck = false;
1533  // It is safe to assume the preheader exists, as it was checked in
1534  // the parent function runOnLoop.
1535  BasicBlock *PH = CurLoop->getLoopPreheader();
1536 
1537  // If we are using the count instruction outside the loop, make sure we
1538  // have a zero check as a precondition. Without the check the loop would run
1539  // one iteration before any check of the input value. This means 0 and 1
1540  // would behave identically in the original loop, so the transform needs the zero check to tell them apart.
1541  if (!IsCntPhiUsedOutsideLoop) {
1542  auto *PreCondBB = PH->getSinglePredecessor();
1543  if (!PreCondBB)
1544  return false;
1545  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1546  if (!PreCondBI)
1547  return false;
1548  if (matchCondition(PreCondBI, PH) != InitX)
1549  return false;
1550  ZeroCheck = true;
1551  }
1552 
1553  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1554  // profitable if we delete the loop.
1555 
1556  // The loop has only 6 instructions:
1557  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1558  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1559  // %shr = ashr %n.addr.0, 1
1560  // %tobool = icmp eq %shr, 0
1561  // %inc = add nsw %i.0, 1
1562  // br i1 %tobool
1563 
1564  const Value *Args[] =
1565  {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1566  : ConstantInt::getFalse(InitX->getContext())};
1567 
1568  // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1569  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1570  uint32_t HeaderSize =
1571  std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1572 
1573  if (HeaderSize != IdiomCanonicalSize &&
1574  TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
1575  TargetTransformInfo::TCC_Basic)
1576  return false;
1577 
1578  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1579  DefX->getDebugLoc(), ZeroCheck,
1580  IsCntPhiUsedOutsideLoop);
1581  return true;
1582 }
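// Conceptually (an illustrative sketch, not the code the pass emits), once the
// idiom is recognized the count becomes computable up front. For the
// right-shift case this is roughly:
//
//   unsigned count = CHAR_BIT * sizeof(n) - __builtin_clzll(n);
//
// and for the shl/cttz case the analogous __builtin_ctzll(n) is used. Note
// that these builtins (like llvm.ctlz/llvm.cttz with is_zero_undef set) are
// undefined for n == 0, which is exactly why ZeroCheck is tracked above.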
1583 
1584 /// Recognizes a population count idiom in a non-countable loop.
1585 ///
1586 /// If detected, transforms the relevant code to issue the popcount intrinsic
1587 /// function call, and returns true; otherwise, returns false.
1588 bool LoopIdiomRecognize::recognizePopcount() {
1589  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1590  return false;
1591 
1592  // Counting population is usually done with a few arithmetic instructions.
1593  // Such instructions can be easily "absorbed" by vacant slots in a
1594  // non-compact loop. Therefore, recognizing the popcount idiom only makes sense
1595  // in a compact loop.
1596 
1597  // Give up if the loop has multiple blocks or multiple backedges.
1598  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1599  return false;
1600 
1601  BasicBlock *LoopBody = *(CurLoop->block_begin());
1602  if (LoopBody->size() >= 20) {
1603  // The loop is too big, bail out.
1604  return false;
1605  }
1606 
1607  // It should have a preheader containing nothing but an unconditional branch.
1608  BasicBlock *PH = CurLoop->getLoopPreheader();
1609  if (!PH || &PH->front() != PH->getTerminator())
1610  return false;
1611  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1612  if (!EntryBI || EntryBI->isConditional())
1613  return false;
1614 
1615  // It should have a precondition block where the generated popcount intrinsic
1616  // function can be inserted.
1617  auto *PreCondBB = PH->getSinglePredecessor();
1618  if (!PreCondBB)
1619  return false;
1620  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1621  if (!PreCondBI || PreCondBI->isUnconditional())
1622  return false;
1623 
1624  Instruction *CntInst;
1625  PHINode *CntPhi;
1626  Value *Val;
1627  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1628  return false;
1629 
1630  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1631  return true;
1632 }
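// Illustrative sketch (not part of the pass): at the C level the guarded
// counting loop
//
//   if (x)
//     do { cnt++; x &= x - 1; } while (x);
//
// is conceptually rewritten into
//
//   cnt += __builtin_popcountll(x);
//
// The pass itself emits the llvm.ctpop intrinsic and, as shown below in
// transformLoopToPopcount(), keeps a now-countable loop around in case the
// body does more than just counting.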
1633 
1634  static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1635  const DebugLoc &DL) {
1636  Value *Ops[] = {Val};
1637  Type *Tys[] = {Val->getType()};
1638 
1639  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1640  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1641  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1642  CI->setDebugLoc(DL);
1643 
1644  return CI;
1645 }
1646 
1647  static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1648  const DebugLoc &DL, bool ZeroCheck,
1649  Intrinsic::ID IID) {
1650  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1651  Type *Tys[] = {Val->getType()};
1652 
1653  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1654  Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1655  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1656  CI->setDebugLoc(DL);
1657 
1658  return CI;
1659 }
1660 
1661 /// Transform the following loop (Using CTLZ, CTTZ is similar):
1662 /// loop:
1663 /// CntPhi = PHI [Cnt0, CntInst]
1664 /// PhiX = PHI [InitX, DefX]
1665 /// CntInst = CntPhi + 1
1666 /// DefX = PhiX >> 1
1667 /// LOOP_BODY
1668 /// Br: loop if (DefX != 0)
1669 /// Use(CntPhi) or Use(CntInst)
1670 ///
1671 /// Into:
1672 /// If CntPhi used outside the loop:
1673 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1674 /// Count = CountPrev + 1
1675 /// else
1676 /// Count = BitWidth(InitX) - CTLZ(InitX)
1677 /// loop:
1678 /// CntPhi = PHI [Cnt0, CntInst]
1679 /// PhiX = PHI [InitX, DefX]
1680 /// PhiCount = PHI [Count, Dec]
1681 /// CntInst = CntPhi + 1
1682 /// DefX = PhiX >> 1
1683 /// Dec = PhiCount - 1
1684 /// LOOP_BODY
1685 /// Br: loop if (Dec != 0)
1686 /// Use(CountPrev + Cnt0) // Use(CntPhi)
1687 /// or
1688 /// Use(Count + Cnt0) // Use(CntInst)
1689 ///
1690 /// If LOOP_BODY is empty the loop will be deleted.
1691 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1692 void LoopIdiomRecognize::transformLoopToCountable(
1693  Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1694  PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1695  bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1696  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1697 
1698  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1699  IRBuilder<> Builder(PreheaderBr);
1700  Builder.SetCurrentDebugLocation(DL);
1701  Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
1702 
1703  // Count = BitWidth - CTLZ(InitX);
1704  // If there are uses of CntPhi create:
1705  // CountPrev = BitWidth - CTLZ(InitX >> 1);
1706  if (IsCntPhiUsedOutsideLoop) {
1707  if (DefX->getOpcode() == Instruction::AShr)
1708  InitXNext =
1709  Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1710  else if (DefX->getOpcode() == Instruction::LShr)
1711  InitXNext =
1712  Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1713  else if (DefX->getOpcode() == Instruction::Shl) // cttz
1714  InitXNext =
1715  Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1716  else
1717  llvm_unreachable("Unexpected opcode!");
1718  } else
1719  InitXNext = InitX;
1720  FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1721  Count = Builder.CreateSub(
1722  ConstantInt::get(FFS->getType(),
1723  FFS->getType()->getIntegerBitWidth()),
1724  FFS);
1725  if (IsCntPhiUsedOutsideLoop) {
1726  CountPrev = Count;
1727  Count = Builder.CreateAdd(
1728  CountPrev,
1729  ConstantInt::get(CountPrev->getType(), 1));
1730  }
1731 
1732  NewCount = Builder.CreateZExtOrTrunc(
1733  IsCntPhiUsedOutsideLoop ? CountPrev : Count,
1734  cast<IntegerType>(CntInst->getType()));
1735 
1736  // If the counter's initial value is not zero, insert Add Inst.
1737  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1738  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1739  if (!InitConst || !InitConst->isZero())
1740  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1741 
1742  // Step 2: Insert new IV and loop condition:
1743  // loop:
1744  // ...
1745  // PhiCount = PHI [Count, Dec]
1746  // ...
1747  // Dec = PhiCount - 1
1748  // ...
1749  // Br: loop if (Dec != 0)
1750  BasicBlock *Body = *(CurLoop->block_begin());
1751  auto *LbBr = cast<BranchInst>(Body->getTerminator());
1752  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1753  Type *Ty = Count->getType();
1754 
1755  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1756 
1757  Builder.SetInsertPoint(LbCond);
1758  Instruction *TcDec = cast<Instruction>(
1759  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1760  "tcdec", false, true));
1761 
1762  TcPhi->addIncoming(Count, Preheader);
1763  TcPhi->addIncoming(TcDec, Body);
1764 
1765  CmpInst::Predicate Pred =
1766  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1767  LbCond->setPredicate(Pred);
1768  LbCond->setOperand(0, TcDec);
1769  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1770 
1771  // Step 3: All the references to the original counter outside
1772  // the loop are replaced with the NewCount
1773  if (IsCntPhiUsedOutsideLoop)
1774  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1775  else
1776  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1777 
1778  // step 4: Forget the "non-computable" trip-count SCEV associated with the
1779  // loop. The loop would otherwise not be deleted even if it becomes empty.
1780  SE->forgetLoop(CurLoop);
1781 }
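// A small worked example of the count computed above (illustrative only): for
// an 8-bit InitX = 0b00010110 (22), CTLZ(InitX) = 3, so Count = 8 - 3 = 5,
// and indeed "x >>= 1" must be applied 5 times before 22 reaches zero
// (22 -> 11 -> 5 -> 2 -> 1 -> 0), matching the original loop's iteration count.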
1782 
1783 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1784  Instruction *CntInst,
1785  PHINode *CntPhi, Value *Var) {
1786  BasicBlock *PreHead = CurLoop->getLoopPreheader();
1787  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1788  const DebugLoc &DL = CntInst->getDebugLoc();
1789 
1790  // Assuming before transformation, the loop is following:
1791  // if (x) // the precondition
1792  // do { cnt++; x &= x - 1; } while(x);
1793 
1794  // Step 1: Insert the ctpop instruction at the end of the precondition block
1795  IRBuilder<> Builder(PreCondBr);
1796  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1797  {
1798  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1799  NewCount = PopCntZext =
1800  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1801 
1802  if (NewCount != PopCnt)
1803  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1804 
1805  // TripCnt is exactly the number of iterations the loop has
1806  TripCnt = NewCount;
1807 
1808  // If the population counter's initial value is not zero, insert Add Inst.
1809  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1810  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1811  if (!InitConst || !InitConst->isZero()) {
1812  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1813  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1814  }
1815  }
1816 
1817  // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
1818  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1819  // function would be partially dead code, and downstream passes would drag
1820  // it back from the precondition block to the preheader.
1821  {
1822  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1823 
1824  Value *Opnd0 = PopCntZext;
1825  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1826  if (PreCond->getOperand(0) != Var)
1827  std::swap(Opnd0, Opnd1);
1828 
1829  ICmpInst *NewPreCond = cast<ICmpInst>(
1830  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1831  PreCondBr->setCondition(NewPreCond);
1832 
1833  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1834  }
1835 
1836  // Step 3: Note that the population count is exactly the trip count of the
1837  // loop in question, which enables us to convert the loop from a noncountable
1838  // loop into a countable one. The benefit is twofold:
1839  //
1840  // - If the loop only counts population, the entire loop becomes dead after
1841  // the transformation. It is a lot easier to prove a countable loop dead
1842  // than to prove a noncountable one. (In some C dialects, an infinite loop
1843  // isn't dead even if it computes nothing useful. In general, DCE needs
1844  // to prove a noncountable loop finite before safely deleting it.)
1845  //
1846  // - If the loop also performs something else, it remains alive.
1847  // Since it is transformed to countable form, it can be aggressively
1848  // optimized by some optimizations which are in general not applicable
1849  // to a noncountable loop.
1850  //
1851  // After this step, this loop (conceptually) would look like following:
1852  // newcnt = __builtin_ctpop(x);
1853  // t = newcnt;
1854  // if (x)
1855  // do { cnt++; x &= x-1; t--; } while (t > 0);
1856  BasicBlock *Body = *(CurLoop->block_begin());
1857  {
1858  auto *LbBr = cast<BranchInst>(Body->getTerminator());
1859  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1860  Type *Ty = TripCnt->getType();
1861 
1862  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1863 
1864  Builder.SetInsertPoint(LbCond);
1865  Instruction *TcDec = cast<Instruction>(
1866  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1867  "tcdec", false, true));
1868 
1869  TcPhi->addIncoming(TripCnt, PreHead);
1870  TcPhi->addIncoming(TcDec, Body);
1871 
1872  CmpInst::Predicate Pred =
1873  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1874  LbCond->setPredicate(Pred);
1875  LbCond->setOperand(0, TcDec);
1876  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1877  }
1878 
1879  // Step 4: All the references to the original population counter outside
1880  // the loop are replaced with the NewCount -- the value returned from
1881  // __builtin_ctpop().
1882  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1883 
1884  // step 5: Forget the "non-computable" trip-count SCEV associated with the
1885  // loop. The loop would otherwise not be deleted even if it becomes empty.
1886  SE->forgetLoop(CurLoop);
1887 }
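// A small worked example (illustrative only): for Var = 0b10110 (22), the
// original loop clears one set bit per iteration (22 -> 20 -> 16 -> 0), i.e.
// it runs three times, and ctpop(22) = 3, so the TripCnt computed above
// matches the original iteration count exactly.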
1888 
1889 bool LoopIdiomRecognize::matchBCmpLoopStructure(
1890  CmpLoopStructure &CmpLoop) const {
1891  ICmpInst::Predicate BCmpPred;
1892 
1893  // We are looking for the following basic layout:
1894  // PreheaderBB: <preheader> ; preds = ???
1895  // <...>
1896  // br label %LoopHeaderBB
1897  // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
1898  // <...>
1899  // %BCmpValue = icmp <...>
1900  // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0
1901  // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
1902  // <...>
1903  // %LatchCmpValue = <are we done, or do next iteration?>
1904  // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB
1905  // Successor0: <exit> ; preds = %LoopHeaderBB
1906  // <...>
1907  // Successor1: <exit> ; preds = %LoopLatchBB
1908  // <...>
1909  //
1910  // Successor0 and Successor1 may or may not be the same basic block.
1911 
1912  // Match the basic framework of this supposed comparison loop.
1913  using namespace PatternMatch;
1914  if (!match(CurLoop->getHeader()->getTerminator(),
1915  m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()),
1916  m_Value(CmpLoop.BCmpValue)),
1917  CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) ||
1918  !match(CurLoop->getLoopLatch()->getTerminator(),
1919  m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)),
1920  CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) {
1921  LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n");
1922  return false;
1923  }
1924  LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n");
1925  return true;
1926 }
1927 
1928 bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue,
1929  CmpOfLoads &CmpOfLoads) const {
1930  using namespace PatternMatch;
1931  LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue
1932  << " as bcmp pattern.\n");
1933 
1934  // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example:
1935  // %v0 = load <...>, <...>* %LoadSrcA
1936  // %v1 = load <...>, <...>* %LoadSrcB
1937  // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1
1938  // There won't be any no-op bitcasts between load and icmp,
1939  // they would have been transformed into a load of bitcast.
1940  // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too.
1941  if (!match(BCmpValue,
1942  m_ICmp(CmpOfLoads.BCmpPred,
1943  m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)),
1944  m_Value(CmpOfLoads.LoadA)),
1945  m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)),
1946  m_Value(CmpOfLoads.LoadB)))) ||
1947  !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) {
1948  LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n");
1949  return false;
1950  }
1951  LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t"
1952  << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB
1953  << "\n");
1954  // FIXME: handle memcmp pattern?
1955  return true;
1956 }
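// Illustrative sketch (not part of the pass): the header icmp matched above
// typically originates from an element-by-element equality loop such as the
// following (names made up for illustration):
//
//   bool equal = true;
//   for (size_t i = 0; i != n; ++i)
//     if (a[i] != b[i]) { equal = false; break; }
//
// where "a[i] != b[i]" is the eq/ne icmp of the two loads matched here.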
1957 
1958 bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow(
1959  const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const {
1960  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
1961  BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
1962 
1963  // Be wary, comparisons can be inverted, canonicalize order.
1964  // If this 'element' comparison passed, we expect to proceed to the next elt.
1965  if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ)
1966  std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB);
1967  // The predicate on loop latch does not matter, just canonicalize some order.
1968  if (CmpLoop.LatchBrContinueBB != LoopHeaderBB)
1969  std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB);
1970 
1971  SmallVector<BasicBlock *, 2> ExitBlocks;
1972 
1973  CurLoop->getUniqueExitBlocks(ExitBlocks);
1974  assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks.");
1975 
1976  // Check that control-flow between blocks is as expected.
1977  if (CmpLoop.HeaderBrEqualBB != LoopLatchBB ||
1978  CmpLoop.LatchBrContinueBB != LoopHeaderBB ||
1979  !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) ||
1980  !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) {
1981  LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n");
1982  return false;
1983  }
1984 
1985  assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) &&
1986  !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) &&
1987  "Unexpected exit edges.");
1988 
1989  LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n");
1990 
1991  LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n");
1992  assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here.");
1993  // No loop instruction may be used outside of the loop. Since we are in
1994  // LCSSA form, we only need to check the successor blocks' PHI nodes'
1995  // incoming values for incoming blocks that are the loop basic blocks.
1996  for (const BasicBlock *ExitBB : ExitBlocks) {
1997  for (const PHINode &PHI : ExitBB->phis()) {
1998  for (const BasicBlock *LoopBB :
1999  make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) {
2000  return CurLoop->contains(PredecessorBB);
2001  })) {
2002  const auto *I =
2003  dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB));
2004  if (I && CurLoop->contains(I)) {
2005  LLVM_DEBUG(dbgs()
2006  << "Loop contains instruction " << *I
2007  << " which is used outside of the loop in basic block "
2008  << ExitBB->getName() << " in phi node " << PHI << "\n");
2009  return false;
2010  }
2011  }
2012  }
2013  }
2014  // Similarly, the loop should not have any observable side-effects
2015  // other than the final comparison result.
2016  for (BasicBlock *LoopBB : CurLoop->blocks()) {
2017  for (Instruction &I : *LoopBB) {
2018  if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo.
2019  continue; // FIXME: anything else? lifetime info?
2020  if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) &&
2021  &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) {
2022  LLVM_DEBUG(
2023  dbgs() << "Loop contains instruction with potential side-effects: "
2024  << I << "\n");
2025  return false;
2026  }
2027  }
2028  }
2029  LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n");
2030  return true;
2031 }
2032 
2033 bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes,
2034  CmpOfLoads &CmpOfLoads,
2035  const SCEV *&SrcA,
2036  const SCEV *&SrcB,
2037  const SCEV *&Iterations) const {
2038  // Try to compute SCEV of the loads, for this loop's scope.
2039  const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>(
2040  SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop));
2041  const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>(
2042  SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop));
2043  if (!ScevForSrcA || !ScevForSrcB) {
2044  LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n");
2045  return false;
2046  }
2047 
2048  LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t"
2049  << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n");
2050 
2051  // Loads must have the following SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB>
2052  const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE);
2053  const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE);
2054  if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() ||
2055  ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop ||
2056  RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) ||
2057  cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) {
2058  LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support "
2059  "affine SCEV expressions originating in the loop we "
2060  "are analysing with identical constant positive step, "
2061  "equal to the count of bytes compared. Got:\n\t"
2062  << *RecStepForA << "\n\t" << *RecStepForB << "\n");
2063  return false;
2064  // FIXME: can support BCmpTyBytes > Step.
2065  // But will need to account for the extra bytes compared at the end.
2066  }
2067 
2068  SrcA = ScevForSrcA->getStart();
2069  SrcB = ScevForSrcB->getStart();
2070  LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA
2071  << "\n\t" << *SrcB << "\n");
2072 
2073  // The load sources must be loop-invariants that dominate the loop header.
2074  if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() ||
2075  !SE->isAvailableAtLoopEntry(SrcA, CurLoop) ||
2076  !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) {
2077  LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavailable "
2078  "prior to loop header.\n");
2079  return false;
2080  }
2081 
2082  LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n");
2083 
2084  // bcmp / memcmp take the length argument as size_t, so let's conservatively
2085  // assume that the iteration count should not be wider than that.
2086  Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext());
2087 
2088  // For how many iterations is the loop guaranteed not to exit via LoopLatch?
2089  // This is one less than the maximal number of comparisons, and is: n + -1
2090  const SCEV *LoopExitCount =
2091  SE->getExitCount(CurLoop, CurLoop->getLoopLatch());
2092  LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: "
2093  << *LoopExitCount << "\n");
2094  // The exit count, similarly, must be a loop-invariant that dominates the loop header.
2095  if (LoopExitCount == SE->getCouldNotCompute() ||
2096  !LoopExitCount->getType()->isIntOrPtrTy() ||
2097  LoopExitCount->getType()->getScalarSizeInBits() >
2098  CmpFuncSizeTy->getScalarSizeInBits() ||
2099  !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) {
2100  LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n");
2101  return false;
2102  }
2103 
2104  // LoopExitCount is always one less than the actual count of iterations.
2105  // Do this before cast, else we will be stuck with 1 + zext(-1 + n)
2106  Iterations = SE->getAddExpr(
2107  LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW);
2108  assert(Iterations != SE->getCouldNotCompute() &&
2109  "Shouldn't fail to increment by one.");
2110 
2111  LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n");
2112  return true;
2113 }
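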
2114 
2115 /// Return true iff the bcmp idiom is detected in the loop.
2116 ///
2117 /// Additionally:
2118 /// 1) \p BCmpInst is set to the root byte-comparison instruction.
2119 /// 2) \p LatchCmpInst is set to the comparison that controls the latch.
2120 /// 3) \p LoadA is set to the first LoadInst.
2121 /// 4) \p LoadB is set to the second LoadInst.
2122 /// 5) \p SrcA is set to the first source location that is being compared.
2123 /// 6) \p SrcB is set to the second source location that is being compared.
2124 /// 7) \p NBytes is set to the number of bytes to compare.
2125 bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst,
2126  CmpInst *&LatchCmpInst,
2127  LoadInst *&LoadA, LoadInst *&LoadB,
2128  const SCEV *&SrcA, const SCEV *&SrcB,
2129  const SCEV *&NBytes) const {
2130  LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n");
2131 
2132  // Give up if the loop is not in normal form, or has more than 2 blocks.
2133  if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) {
2134  LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n");
2135  return false;
2136  }
2137  LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n");
2138 
2139  CmpLoopStructure CmpLoop;
2140  if (!matchBCmpLoopStructure(CmpLoop))
2141  return false;
2142 
2143  CmpOfLoads CmpOfLoads;
2144  if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads))
2145  return false;
2146 
2147  if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop))
2148  return false;
2149 
2150  BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue); // FIXME: is there no
2151  LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine
2152  LoadA = cast<LoadInst>(CmpOfLoads.LoadA); // these casts with
2153  LoadB = cast<LoadInst>(CmpOfLoads.LoadB); // m_Value() matcher?
2154 
2155  Type *BCmpValTy = BCmpInst->getOperand(0)->getType();
2156  LLVMContext &Context = BCmpValTy->getContext();
2157  uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy);
2158  static constexpr uint64_t ByteTyBits = 8;
2159 
2160  LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy
2161  << " of size " << BCmpTyBits
2162  << " bits (while byte = " << ByteTyBits << " bits).\n");
2163  // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check
2164  // that we are dealing with a multiple of a byte here.
2165  if (BCmpTyBits % ByteTyBits != 0) {
2166  LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n");
2167  return false;
2168  // FIXME: could still be done under a run-time check that the total bit
2169  // count is a multiple of a byte, I guess? Or handle the remainder separately?
2170  }
2171 
2172  // Each comparison is done on this many bytes.
2173  uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits;
2174  LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes
2175  << " bytes, eligible for bcmp conversion.\n");
2176 
2177  const SCEV *Iterations;
2178  if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations))
2179  return false;
2180 
2181  // bcmp / memcmp take the length argument as size_t, so do the promotion now.
2182  Type *CmpFuncSizeTy = DL->getIntPtrType(Context);
2183  Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy);
2184  assert(Iterations != SE->getCouldNotCompute() && "Promotion failed.");
2185  // Note that it didn't do a ptrtoint cast; we will need to do that manually.
2186 
2187  // We will be comparing *bytes*, not BCmpTy, so we need to recalculate the size.
2188  // It's a multiplication, and it *could* overflow. But for it to overflow
2189  // we'd have to compare more bytes than could be represented by size_t, yet
2190  // allocation functions also take size_t. So how would you produce such a buffer?
2191  // FIXME: we likely need to actually check that we know this won't overflow,
2192  // via llvm::computeOverflowForUnsignedMul().
2193  NBytes = SE->getMulExpr(
2194  Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW);
2195  assert(NBytes != SE->getCouldNotCompute() &&
2196  "Shouldn't fail to compute the byte count.");
2197 
2198  LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n");
2199 
2200  if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() ||
2201  LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() ||
2202  !LoadB->isSimple()) {
2203  StringLiteral L("Unsupported loads in idiom - only support identical, "
2204  "simple loads from address space 0.\n");
2205  LLVM_DEBUG(dbgs() << L);
2206  ORE.emit([&]() {
2207  return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads",
2208  BCmpInst->getDebugLoc(),
2209  CurLoop->getHeader())
2210  << L;
2211  });
2212  return false; // FIXME: support non-simple loads.
2213  }
2214 
2215  LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n");
2216  ORE.emit([&]() {
2217  return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom",
2218  CurLoop->getStartLoc(),
2219  CurLoop->getHeader())
2220  << "Loop recognized as a bcmp idiom";
2221  });
2222 
2223  return true;
2224 }
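// A small worked example of the size computation above (illustrative only):
// comparing arrays of i32 elements (BCmpTyBytes = 4) in a loop whose latch
// exit count is n - 1 yields Iterations = (n - 1) + 1 = n and
// NBytes = 4 * n, which is exactly the length argument bcmp()/memcmp() expects.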
2225 
2226 BasicBlock *
2227 LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) {
2228  LLVM_DEBUG(dbgs() << "Transforming control-flow.\n");
2229  SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
2230 
2231  BasicBlock *PreheaderBB = CurLoop->getLoopPreheader();
2232  BasicBlock *HeaderBB = CurLoop->getHeader();
2233  BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
2234  SmallString<32> LoopName = CurLoop->getName();
2235  Function *Func = PreheaderBB->getParent();
2236  LLVMContext &Context = Func->getContext();
2237 
2238  // Before doing anything, drop SCEV info.
2239  SE->forgetLoop(CurLoop);
2240 
2241  // Here we start with: (0/6)
2242  // PreheaderBB: <preheader> ; preds = ???
2243  // <...>
2244  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2245  // %ComparedEqual = icmp eq <...> %memcmp, 0
2246  // br label %LoopHeaderBB
2247  // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
2248  // <...>
2249  // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
2250  // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
2251  // <...>
2252  // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
2253  // Successor0BB: <exit> ; preds = %LoopHeaderBB
2254  // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
2255  // <...>
2256  // Successor1BB: <exit> ; preds = %LoopLatchBB
2257  // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
2258  // <...>
2259  //
2260  // Successor0 and Successor1 may or may not be the same basic block.
2261 
2262  // Decouple the edge between loop preheader basic block and loop header basic
2263  // block. Thus the loop has become unreachable.
2264  assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() &&
2265  PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB &&
2266  "Preheader bb must end with an unconditional branch to header bb.");
2267  PreheaderBB->getTerminator()->eraseFromParent();
2268  DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB});
2269 
2270  // Create a new preheader basic block before loop header basic block.
2271  auto *PhonyPreheaderBB = BasicBlock::Create(
2272  Context, LoopName + ".phonypreheaderbb", Func, HeaderBB);
2273  // And insert an unconditional branch from phony preheader basic block to
2274  // loop header basic block.
2275  IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB);
2276  DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
2277 
2278  // Create a *single* new empty block that we will substitute as a
2279  // successor basic block for the loop's exits. This one is temporary.
2280  // Much like phony preheader basic block, it is not connected.
2281  auto *PhonySuccessorBB =
2282  BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func,
2283  LoopLatchBB->getNextNode());
2284  // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
2285  // will mess up cleanup of dbginfo, and verifier will complain.
2286  IRBuilder<>(PhonySuccessorBB).CreateUnreachable();
2287 
2288  // Create two new empty blocks that we will use to preserve the original
2289  // loop exit control-flow, and preserve the incoming values in the PHI nodes
2290  // in the loop's successor exit blocks. These will live on.
2291  auto *ComparedUnequalBB =
2292  BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func,
2293  PhonySuccessorBB->getNextNode());
2294  auto *ComparedEqualBB =
2295  BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func,
2296  PhonySuccessorBB->getNextNode());
2297 
2298  // By now we have: (1/6)
2299  // PreheaderBB: ; preds = ???
2300  // <...>
2301  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2302  // %ComparedEqual = icmp eq <...> %memcmp, 0
2303  // [no terminator instruction!]
2304  // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
2305  // br label %LoopHeaderBB
2306  // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
2307  // <...>
2308  // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
2309  // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
2310  // <...>
2311  // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
2312  // PhonySuccessorBB: ; No preds, UNREACHABLE!
2313  // unreachable
2314  // EqualBB: ; No preds, UNREACHABLE!
2315  // [no terminator instruction!]
2316  // UnequalBB: ; No preds, UNREACHABLE!
2317  // [no terminator instruction!]
2318  // Successor0BB: <exit> ; preds = %LoopHeaderBB
2319  // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
2320  // <...>
2321  // Successor1BB: <exit> ; preds = %LoopLatchBB
2322  // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
2323  // <...>
2324 
2325  // What is the mapping/replacement basic block for exiting out of the loop
2326  // from either of the old loop's basic blocks?
2327  auto GetReplacementBB = [this, ComparedEqualBB,
2328  ComparedUnequalBB](const BasicBlock *OldBB) {
2329  assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks.");
2330  if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal".
2331  return ComparedEqualBB;
2332  if (OldBB == CurLoop->getHeader()) // "element compared unequal".
2333  return ComparedUnequalBB;
2334  llvm_unreachable("Only had two basic blocks in loop.");
2335  };
2336 
2337  // What are the exits out of this loop?
2338  SmallVector<Loop::Edge, 2> LoopExitEdges;
2339  CurLoop->getExitEdges(LoopExitEdges);
2340  assert(LoopExitEdges.size() == 2 && "Should have only two exit edges.");
2341 
2342  // Populate new basic blocks, update the exiting control-flow, PHI nodes.
2343  for (const Loop::Edge &Edge : LoopExitEdges) {
2344  auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first);
2345  auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second);
2346  assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) &&
2347  "Unexpected edge.");
2348 
2349  // If we would exit the loop from this loop's basic block,
2350  // what semantically would that mean? Did comparison succeed or fail?
2351  BasicBlock *NewBB = GetReplacementBB(OldLoopBB);
2352  assert(NewBB->empty() && "Should not get same new basic block here twice.");
2353  IRBuilder<> Builder(NewBB);
2354  Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc());
2355  Builder.CreateBr(SuccessorBB);
2356  DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB});
2357  // Also, be *REALLY* careful with PHI nodes in the successor basic block:
2358  // update them to receive the same input value, but from the new basic block
2359  // instead of the current loop's basic block.
2360  SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB);
2361  // Also, change loop control-flow. This loop's basic block shall no longer
2362  // exit from the loop to its original successor basic block, but to our new
2363  // phony successor basic block. Note that the new successor will be the unique exit.
2364  OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB,
2365  PhonySuccessorBB);
2366  DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB});
2367  DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB});
2368  }
2369 
2370  // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date.
2371  assert(DTUpdates.size() == 8 && "Update count prediction failed.");
2373  DTU.applyUpdates(DTUpdates);
2374  DTUpdates.clear();
2375 
2376  // By now we have: (2/6)
2377  // PreheaderBB: ; preds = ???
2378  // <...>
2379  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2380  // %ComparedEqual = icmp eq <...> %memcmp, 0
2381  // [no terminator instruction!]
2382  // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
2383  // br label %LoopHeaderBB
2384  // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
2385  // <...>
2386  // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB
2387  // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
2388  // <...>
2389  // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB
2390  // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB
2391  // unreachable
2392  // EqualBB: ; No preds, UNREACHABLE!
2393  // br label %Successor1BB
2394  // UnequalBB: ; No preds, UNREACHABLE!
2395  // br label %Successor0BB
2396  // Successor0BB: ; preds = %UnequalBB
2397  // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2398  // <...>
2399  // Successor1BB: ; preds = %EqualBB
2400  // %S0PHI = phi <...> [ <...>, %EqualBB ]
2401  // <...>
2402 
2403  // *Finally*, zap the original loop. Record its parent loop though.
2404  Loop *ParentLoop = CurLoop->getParentLoop();
2405  LLVM_DEBUG(dbgs() << "Deleting old loop.\n");
2406  LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting!
2407  deleteDeadLoop(CurLoop, DT, SE, LI); // And actually delete the loop.
2408  CurLoop = nullptr;
2409 
2410  // By now we have: (3/6)
2411  // PreheaderBB: ; preds = ???
2412  // <...>
2413  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2414  // %ComparedEqual = icmp eq <...> %memcmp, 0
2415  // [no terminator instruction!]
2416  // PhonyPreheaderBB: ; No preds, UNREACHABLE!
2417  // br label %PhonySuccessorBB
2418  // PhonySuccessorBB: ; preds = %PhonyPreheaderBB
2419  // unreachable
2420  // EqualBB: ; No preds, UNREACHABLE!
2421  // br label %Successor1BB
2422  // UnequalBB: ; No preds, UNREACHABLE!
2423  // br label %Successor0BB
2424  // Successor0BB: ; preds = %UnequalBB
2425  // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2426  // <...>
2427  // Successor1BB: ; preds = %EqualBB
2428  // %S0PHI = phi <...> [ <...>, %EqualBB ]
2429  // <...>
2430 
2431  // Now, actually restore the CFG.
2432 
2433  // Insert an unconditional branch from an actual preheader basic block to
2434  // phony preheader basic block.
2435  IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB);
2436  DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
2437  // Insert proper conditional branch from phony successor basic block to the
2438  // "dispatch" basic blocks, which were used to preserve incoming values in
2439  // original loop's successor basic blocks.
2440  assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) &&
2441  "Yep, that's the one we created to keep deleteDeadLoop() happy.");
2442  PhonySuccessorBB->getTerminator()->eraseFromParent();
2443  {
2444  IRBuilder<> Builder(PhonySuccessorBB);
2445  Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc());
2446  Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB);
2447  }
2448  DTUpdates.push_back(
2449  {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB});
2450  DTUpdates.push_back(
2451  {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB});
2452 
2453  BasicBlock *DispatchBB = PhonySuccessorBB;
2454  DispatchBB->setName(LoopName + ".bcmpdispatchbb");
2455 
2456  assert(DTUpdates.size() == 3 && "Update count prediction failed.");
2457  DTU.applyUpdates(DTUpdates);
2458  DTUpdates.clear();
2459 
2460  // By now we have: (4/6)
2461  // PreheaderBB: ; preds = ???
2462  // <...>
2463  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2464  // %ComparedEqual = icmp eq <...> %memcmp, 0
2465  // br label %PhonyPreheaderBB
2466  // PhonyPreheaderBB: ; preds = %PreheaderBB
2467  // br label %DispatchBB
2468  // DispatchBB: ; preds = %PhonyPreheaderBB
2469  // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2470  // EqualBB: ; preds = %DispatchBB
2471  // br label %Successor1BB
2472  // UnequalBB: ; preds = %DispatchBB
2473  // br label %Successor0BB
2474  // Successor0BB: ; preds = %UnequalBB
2475  // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2476  // <...>
2477  // Successor1BB: ; preds = %EqualBB
2478  // %S0PHI = phi <...> [ <...>, %EqualBB ]
2479  // <...>
2480 
2481  // The basic CFG has been restored! Now let's merge redundant basic blocks.
2482 
2483  // Merge the phony successor basic block into its only predecessor, the
2484  // phony preheader basic block. It is completely redundant.
2485  MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);
2486 
2487  // By now we have: (5/6)
2488  // PreheaderBB: ; preds = ???
2489  // <...>
2490  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2491  // %ComparedEqual = icmp eq <...> %memcmp, 0
2492  // br label %DispatchBB
2493  // DispatchBB: ; preds = %PreheaderBB
2494  // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2495  // EqualBB: ; preds = %DispatchBB
2496  // br label %Successor1BB
2497  // UnequalBB: ; preds = %DispatchBB
2498  // br label %Successor0BB
2499  // Successor0BB: ; preds = %UnequalBB
2500  // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2501  // <...>
2502  // Successor1BB: ; preds = %EqualBB
2503  // %S0PHI = phi <...> [ <...>, %EqualBB ]
2504  // <...>
2505 
2506  // Was this loop nested?
2507  if (!ParentLoop) {
2508  // If the loop was *NOT* nested, then let's also merge phony successor
2509  // basic block into its only predecessor, the preheader basic block.
2510  // Also, here we need to update LoopInfo.
2511  LI->removeBlock(PreheaderBB);
2512  MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);
2513 
2514  // By now we have: (6/6)
2515  // DispatchBB: ; preds = ???
2516  // <...>
2517  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2518  // %ComparedEqual = icmp eq <...> %memcmp, 0
2519  // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2520  // EqualBB: ; preds = %DispatchBB
2521  // br label %Successor1BB
2522  // UnequalBB: ; preds = %DispatchBB
2523  // br label %Successor0BB
2524  // Successor0BB: ; preds = %UnequalBB
2525  // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2526  // <...>
2527  // Successor1BB: ; preds = %EqualBB
2528  // %S0PHI = phi <...> [ <...>, %EqualBB ]
2529  // <...>
2530 
2531  return DispatchBB;
2532  }
2533 
2534  // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop.
2535  // To achieve that, we shall keep the preheader basic block (mainly so that
2536  // the loop header block will be guaranteed to have a predecessor outside of
2537  // the loop), and create a phony loop with all three of these new basic blocks.
2538  Loop *PhonyLoop = LI->AllocateLoop();
2539  ParentLoop->addChildLoop(PhonyLoop);
2540  PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI);
2541  PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI);
2542  PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI);
2543 
2544  // But we only have a preheader basic block, a header basic block and
2545  // two exiting basic blocks. For a proper loop we also need a backedge from
2546  // a non-header basic block to the header bb.
2547  // Let's just add a never-taken branch from both of the exiting basic blocks.
2548  for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) {
2549  BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator());
2550  assert(OldTerminator->isUnconditional() && "That's the one we created.");
2551  BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0);
2552 
2553  IRBuilder<> Builder(OldTerminator);
2554  Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc());
2555  Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB,
2556  DispatchBB);
2557  OldTerminator->eraseFromParent();
2558  // Yes, the backedge will never be taken. The control-flow is redundant.
2559  // If it can be simplified further, other passes will take care of it.
2560  DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB});
2561  DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB});
2562  DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB});
2563  }
2564  assert(DTUpdates.size() == 6 && "Update count prediction failed.");
2565  DTU.applyUpdates(DTUpdates);
2566  DTUpdates.clear();
2567 
2568  // By now we have: (6/6)
2569  // PreheaderBB: <preheader> ; preds = ???
2570  // <...>
2571  // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2572  // %ComparedEqual = icmp eq <...> %memcmp, 0
2573  // br label %BCmpDispatchBB
2574  // BCmpDispatchBB: <header> ; preds = %PreheaderBB
2575  // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2576  // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB
2577  // br i1 %true, label %Successor1BB, label %BCmpDispatchBB
2578  // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB
2579  // br i1 %true, label %Successor0BB, label %BCmpDispatchBB
2580  // Successor0BB: ; preds = %UnequalBB
2581  // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2582  // <...>
2583  // Successor1BB: ; preds = %EqualBB
2584  // %S0PHI = phi <...> [ <...>, %EqualBB ]
2585  // <...>
2586 
2587  // Finally fully DONE!
2588  return DispatchBB;
2589 }
2590 
2591 void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst,
2592  CmpInst *LatchCmpInst,
2593  LoadInst *LoadA, LoadInst *LoadB,
2594  const SCEV *SrcA, const SCEV *SrcB,
2595  const SCEV *NBytes) {
2596  // We will be inserting before the terminator instruction of preheader block.
2597  IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator());
2598 
2599  LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n");
2600  LLVM_DEBUG(dbgs() << "Emitting new instructions.\n");
2601 
2602  // Expand the SCEV expressions for both sources to compare, and produce a value
2603  // for the byte length (beware of Iterations potentially being a pointer, and
2604  // account for the element size being BCmpTyBytes bytes, which may not be 1 byte).
2605  Value *PtrA, *PtrB, *Len;
2606  {
2607  SCEVExpander SExp(*SE, *DL, "LoopToBCmp");
2608  SExp.setInsertPoint(&*Builder.GetInsertPoint());
2609 
2610  auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) {
2611  SExp.SetCurrentDebugLocation(DebugLoc());
2612  // If the pointer operand of the original load had a dbgloc - use it.
2613  if (const auto *I = dyn_cast<Instruction>(Load->getPointerOperand()))
2614  SExp.SetCurrentDebugLocation(I->getDebugLoc());
2615  return SExp.expandCodeFor(Src);
2616  };
2617  PtrA = HandlePtr(LoadA, SrcA);
2618  PtrB = HandlePtr(LoadB, SrcB);
2619 
2620  // For len calculation let's use dbgloc for the loop's latch condition.
2621  Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc());
2622  SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc());
2623  Len = SExp.expandCodeFor(NBytes);
2624 
2625  Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext());
2626  assert(SE->getTypeSizeInBits(Len->getType()) ==
2627  DL->getTypeSizeInBits(CmpFuncSizeTy) &&
2628  "Len should already have the correct size.");
2629 
2630  // Make sure that iteration count is a number, insert ptrtoint cast if not.
2631  if (Len->getType()->isPointerTy())
2632  Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy);
2633  assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now.");
2634 
2635  Len->setName(Len->getName() + ".bytecount");
2636 
2637  // There is no legality check needed. We want to compare that the memory
2638  // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal.
2639  // For them to be fully equal, they must match bit-by-bit. And likewise,
2640  // for them to *NOT* be fully equal, they only need to differ by a single bit.
2641  // The step of comparison (bits compared at once) simply does not matter.
2642  }
2643 
2644  // For the rest of new instructions, dbgloc should point at the value cmp.
2645  Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc());
2646 
2647  // Emit the comparison itself.
2648  auto *CmpCall =
2649  cast<CallInst>(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI)
2650  : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI));
2651  // FIXME: add {B,Mem}CmpInst with MemoryCompareInst
2652  // (based on MemIntrinsicBase) as base?
2653  // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...)
2654 
2655  // {b,mem}cmp returns 0 if the regions are equal, and non-zero otherwise.
2656  auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ(
2657  CmpCall, ConstantInt::get(CmpCall->getType(), 0),
2658  PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp"));
2659 
2660  BasicBlock *BB = transformBCmpControlFlow(ComparedEqual);
2661  Builder.ClearInsertionPoint();
2662 
2663  // We're done.
2664  LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n");
2665  ORE.emit([&]() {
2666  return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall",
2667  CmpCall->getDebugLoc(), BB)
2668  << "Transformed bcmp idiom into a call to "
2669  << ore::NV("NewFunction", CmpCall->getCalledFunction())
2670  << "() function";
2671  });
2672  ++NumBCmp;
2673 }
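// Illustrative sketch (not part of the pass): at the C level the overall
// rewrite performed above corresponds to replacing
//
//   bool equal = true;
//   for (size_t i = 0; i != n; ++i)
//     if (a[i] != b[i]) { equal = false; break; }
//
// with something like
//
//   bool equal = memcmp(a, b, n * sizeof(*a)) == 0;   // or bcmp(...) == 0
//
// where transformBCmpControlFlow() re-wires the loop's old exit blocks to the
// result of that single comparison.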
2674 
2675 /// Recognizes a bcmp idiom in a non-countable loop.
2676 ///
2677 /// If detected, transforms the relevant code to issue the bcmp (or memcmp)
2678 /// intrinsic function call, and returns true; otherwise, returns false.
2679 bool LoopIdiomRecognize::recognizeBCmp() {
2680  if (!HasMemCmp && !HasBCmp)
2681  return false;
2682 
2683  ICmpInst *BCmpInst;
2684  CmpInst *LatchCmpInst;
2685  LoadInst *LoadA, *LoadB;
2686  const SCEV *SrcA, *SrcB, *NBytes;
2687  if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB,
2688  NBytes)) {
2689  LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n");
2690  return false;
2691  }
2692 
2693  transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes);
2694  return true;
2695 }
The access may reference and may modify the value stored in memory.
virtual void computeLoopSafetyInfo(const Loop *CurLoop)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:49
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:80
uint64_t CallInst * C
Value * getValueOperand()
Definition: Instructions.h:415
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:67
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:616
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional &#39;br Cond, TrueDest, FalseDest&#39; instruction.
Definition: IRBuilder.h:890
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:70
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2211
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:722
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:78
Diagnostic information for missed-optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
LLVMContext & Context
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:1887
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:288
const SCEV * getConstant(ConstantInt *V)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:777
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:139
This class represents lattice values for constants.
Definition: AllocatorList.h:23
BinaryOps getOpcode() const
Definition: InstrTypes.h:402
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
This header provides classes for managing a pipeline of passes over loops in LLVM IR...
ArrayRef< BasicBlock *>::const_iterator block_iterator
Definition: LoopInfo.h:158
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:66
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:627
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
amdgpu Simplify well known AMD library false FunctionCallee Value const Twine & Name
static constexpr LocationSize unknown()
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
bool isUnordered() const
Definition: Instructions.h:409
void push_back(const T &Elt)
Definition: SmallVector.h:211
The main scalar evolution driver.
This class represents a function call, abstracting a target machine&#39;s calling convention.
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:160
bool isAvailableAtLoopEntry(const SCEV *S, const Loop *L)
Determine if the SCEV can be evaluated at loop&#39;s entry.
Value * getValue() const
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:743
static LocationSize precise(uint64_t Value)
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
This class wraps the llvm.memset intrinsic.
BasicBlock * getSuccessor(unsigned i) const
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
F(f)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:635
An instruction for reading from memory.
Definition: Instructions.h:169
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero...
Value * getCondition() const
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:624
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:144
Value * getLength() const
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:92
static Constant * get(ArrayType *T, ArrayRef< Constant *> V)
Definition: Constants.cpp:1014
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:273
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:104
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:47
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:140
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:233
void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, DomTreeUpdater *DTU=nullptr)
BB is a block with one predecessor and its predecessor is known to have one successor (BB!)...
Definition: Local.cpp:670
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:369
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:928
Diagnostic information for optimization analysis remarks.
LLVMContext & getContext() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:779
This file contains the simple types necessary to represent the attributes associated with functions a...
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1118
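A hedged usage sketch (function and variable names assumed) of building arithmetic with IRBuilder at the end of a block while keeping a debug location:
static Value *emitSumMinusOne(BasicBlock *BB, const DebugLoc &Loc,
                              Value *A, Value *B) {
  IRBuilder<> Builder(BB);                 // insertion point: end of BB
  Builder.SetCurrentDebugLocation(Loc);    // preserve the original location
  Value *Sum = Builder.CreateAdd(A, B, "sum");
  // Assumes 32-bit operands for the constant 1 in this sketch.
  return Builder.CreateSub(Sum, Builder.getInt32(1), "sum.minus.one");
}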
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:285
unsigned getDestAlignment() const
This file implements a class to represent arbitrary precision integral constant values and operations...
const APInt & getAPInt() const
BlockT * getHeader() const
Definition: LoopInfo.h:105
void deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, LoopInfo *LI)
This function deletes dead loops.
Definition: LoopUtils.cpp:506
static bool isSimple(Instruction *I)
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:200
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:424
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:246
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:181
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:235
This node represents a polynomial recurrence on the trip count of the specified loop.
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
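One plausible sketch of what such a helper does (the real body is not shown here): the stride of a strided store is the constant step operand of its pointer's add recurrence.
// Sketch only; assumes the caller already checked the step is a SCEVConstant.
static APInt getConstantStride(const SCEVAddRecExpr *StoreEv) {
  const auto *Step = cast<SCEVConstant>(StoreEv->getOperand(1));
  return Step->getAPInt();
}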
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:126
Class to represent array types.
Definition: DerivedTypes.h:408
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " "with -Os/-Oz"), cl::init(true), cl::Hidden)
bool has(LibFunc F) const
Tests whether a library function is available.
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:82
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
Definition: Type.h:212
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:244
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:125
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1135
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
An instruction for storing to memory.
Definition: Instructions.h:325
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:156
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:208
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
static void deleteDeadInstruction(Instruction *I)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1093
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:132
Value * getOperand(unsigned i) const
Definition: User.h:169
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:210
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS. ...
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:1804
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
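A hedged usage sketch (wrapper name assumed): verify that one strided store writes the memory immediately following another before trying to merge the pair.
static bool isAdjacentPair(StoreInst *First, StoreInst *Second,
                           const DataLayout &DL, ScalarEvolution &SE) {
  // True iff Second accesses the location directly after First.
  return isConsecutiveAccess(First, Second, DL, SE, /*CheckType=*/true);
}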
const SCEV * getCouldNotCompute()
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
const SCEV * getOperand(unsigned i) const
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:852
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:196
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:328
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:240
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:64
void setInsertPoint(Instruction *IP)
Set the current insertion point.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:323
Conditional or Unconditional Branch instruction.
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
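A hedged sketch (not the pass's body, helper name assumed) of emitting a find-first-set style intrinsic such as cttz/ctlz, whose second operand is an i1 flag saying whether a zero input is undefined:
static CallInst *emitFFS(IRBuilder<> &B, Module *M, Intrinsic::ID IID,
                         Value *Val, bool ZeroIsUndef) {
  Function *Decl = Intrinsic::getDeclaration(M, IID, Val->getType());
  Value *Ops[] = {Val, ZeroIsUndef ? B.getTrue() : B.getFalse()};
  return B.CreateCall(Decl, Ops);
}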
This is an important base class in LLVM.
Definition: Constant.h:41
std::pair< BasicBlock *, BasicBlock *> Edge
Edge type.
Definition: LoopInfo.h:289
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:224
const Instruction & front() const
Definition: BasicBlock.h:285
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
const SCEV * getAddExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: BasicBlock.h:283
Diagnostic information for applied optimization remarks.
const SCEV * getSCEVAtScope(const SCEV *S, const Loop *L)
Return a SCEV expression for the specified value at the specified scope in the program.
bool isUnordered() const
Definition: Instructions.h:283
Represent the analysis usage information of a pass.
This instruction compares its operands according to the predicate given to the constructor.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:732
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
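A hypothetical C++ source loop of the shape this helper looks for: a recurrence variable is shifted each iteration until it reaches zero while a counter tracks the iteration count, so the loop can be rewritten in terms of a ctlz/cttz intrinsic.
unsigned countShiftsUntilZero(unsigned X) {
  unsigned Cnt = 0;        // CntPhi / CntInst: the iteration counter
  while (X != 0) {         // condition compares the variable with zero
    X >>= 1;               // DefX: the shift that defines the next value
    ++Cnt;
  }
  return Cnt;
}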
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
Value * getPointerOperand()
Definition: Instructions.h:289
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:99
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
self_iterator getIterator()
Definition: ilist_node.h:81
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:205
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:440
Used in the streaming interface as the general argument type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1446
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
size_t size() const
Definition: SmallVector.h:52
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:224
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
const SCEV * getNoopOrZeroExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isVolatile() const
const SCEV * getMulExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:40
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
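A hedged usage sketch: for example an i32 constant 0x01010101 is bytewise (every byte is 1) and yields an i8 1, while 0x12345678 is not and yields null.
static bool mayBecomeMemSet(Value *StoredVal, const DataLayout &DL) {
  // Non-null means the stored value repeats a single byte in memory.
  return isBytewiseValue(StoredVal, DL) != nullptr;
}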
Representation for a specific memory location.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
typename vector_type::const_iterator iterator
Definition: SetVector.h:48
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
Type * getType() const
Return the LLVM type of this SCEV expression.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
iterator end()
Definition: BasicBlock.h:275
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR uni...
Definition: PassManager.h:1161
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type...
Definition: Type.cpp:134
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
Module.h This file contains the declarations for the Module class.
Provides information about what library functions are available for the current target.
LoopT * AllocateLoop(ArgsTy &&... Args)
Definition: LoopInfo.h:892
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:653
bool isConditional() const
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
FunctionCallee getOrInsertFunction(StringRef Name, FunctionType *T, AttributeList AttributeList)
Look up the specified function in the module symbol table.
Definition: Module.cpp:143
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
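A hypothetical source loop illustrating the case: every element receives the same multi-byte constant, so a plain memset (a single repeated byte) cannot express it, but a 16-byte pattern call such as memset_pattern16 can on targets whose C library provides one.
void fillWithPattern(unsigned *P, unsigned N) {
  for (unsigned I = 0; I != N; ++I)
    P[I] = 0x12345678;   // 4-byte value; a 16-byte pattern repeats it 4 times
}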
void markLoopAsDeleted(Loop &L)
Definition: LoopPass.cpp:143
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, unsigned StoreSize, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
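A hedged sketch of the arithmetic (names assumed): if the backedge is taken BECount times and each iteration stores StoreSize bytes, the written region spans (BECount + 1) * StoreSize bytes.
static const SCEV *computeNumBytes(ScalarEvolution *SE, const SCEV *BECount,
                                   Type *IntPtr, unsigned StoreSize) {
  const SCEV *TripCount =
      SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                     SE->getOne(IntPtr));
  return SE->getMulExpr(TripCount, SE->getConstant(IntPtr, StoreSize));
}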
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
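A hypothetical C++ loop of the shape this helper detects: the classic population-count idiom that clears the lowest set bit each iteration while counting, which can be replaced with a ctpop intrinsic.
int popcount(unsigned long long X) {
  int Cnt = 0;                 // CntPhi / CntInst: the counter
  while (X != 0) {
    X &= X - 1;                // clear the lowest set bit (Var recurrence)
    ++Cnt;
  }
  return Cnt;
}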
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:609
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
The access may modify the value stored in memory.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:940
signed less or equal
Definition: InstrTypes.h:762
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:55
Class for arbitrary precision integers.
Definition: APInt.h:69
iterator_range< user_iterator > users()
Definition: Value.h:420
This class uses information about analyze scalars to rewrite expressions in canonical form...
Pass * createLoopIdiomPass()
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:328
LoopT * getParentLoop() const
Definition: LoopInfo.h:106
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:807
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:356
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:331
void setAlignment(MaybeAlign Align)
Definition: Globals.cpp:116
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:219
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:242
This class represents an analyzed expression in the program.
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
Definition: LoopInfo.h:375
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:102
StringRef getName() const
Definition: LoopInfo.h:827
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
This file provides utility analysis objects describing memory locations.
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:214
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
Definition: InstrTypes.h:1287
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
bool isEquality() const
Return true if this predicate is either EQ or NE.
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:192
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:138
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:587
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
block_iterator block_end() const
Definition: LoopInfo.h:160
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2239
Value * emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI)
Emit a call to the bcmp function.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isUnconditional() const
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition: STLExtras.h:422
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:368
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:295
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, unsigned StoreSize, ScalarEvolution *SE)
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
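A hedged usage sketch: the backedge-taken count is one less than the trip count (a body that runs n times takes the backedge n - 1 times), and an unanalyzable loop yields SCEVCouldNotCompute.
static bool hasKnownTripCount(ScalarEvolution *SE, Loop *CurLoop) {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  return !isa<SCEVCouldNotCompute>(BECount); // false: nothing to transform
}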
Value * emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, const DataLayout &DL, const TargetLibraryInfo *TLI)
Emit a call to the memcmp function.
The cost of a typical 'add' instruction.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
LLVM Value Representation.
Definition: Value.h:74
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
A vector that has set insertion semantics.
Definition: SetVector.h:40
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction *> &IgnoredStores)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:148
unsigned greater than
Definition: InstrTypes.h:755
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
A container for analyses that lazily runs them and caches their results.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:439
const SCEV * getExitCount(const Loop *L, BasicBlock *ExitingBlock)
Return the number of times the backedge executes before the given exit would be taken; if not exactly...
This pass exposes codegen information to IR-level passes.
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
bool isSimple() const
Definition: Instructions.h:407
This header defines various interfaces for pass management in LLVM.
bool isBigEndian() const
Definition: DataLayout.h:233
#define DEBUG_TYPE
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
virtual bool anyBlockMayThrow() const
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw ...
Definition: MustExecute.cpp:45
#define LLVM_DEBUG(X)
Definition: Debug.h:122
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
block_iterator block_begin() const
Definition: LoopInfo.h:159
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
Value * getPointerOperand()
Definition: Instructions.h:418
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
The optimization diagnostic interface.
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
Definition: LoopInfo.h:987
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
This class represents a constant integer value.
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1224