//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases that this kicks in, it can be a significant
// performance win.
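//
// For example (an illustration, not taken from this file), a loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = 0;
//
// becomes a single call to memset(A, 0, n * sizeof(*A)), and a loop copying
// B into A element by element becomes a single memcpy.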
//
// If compiling for code size, we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize: memcmp, etc.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;
using namespace SCEVPatternMatch;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

namespace llvm {

static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPStrlen("disable-loop-idiom-strlen",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to strlen."),
                      cl::location(DisableLIRP::Strlen), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool, true>
    EnableLIRPWcslen("enable-loop-idiom-wcslen",
                     cl::desc("Proceed with loop idiom recognize pass, "
                              "enable conversion of loop(s) to wcslen."),
                     cl::location(DisableLIRP::Wcslen), cl::init(false),
                     cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
                             cl::desc("Proceed with loop idiom recognize pass, "
                                      "but do not optimize CRC loops."),
                             cl::location(DisableLIRP::HashRecognize),
                             cl::init(false), cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceMemsetPatternIntrinsic(
    "loop-idiom-force-memset-pattern-intrinsic",
    cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
    cl::Hidden);

} // namespace llvm

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);
  bool optimizeCRCLoop(const PolynomialInfo &Info);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
                               bool ZeroCheck, size_t CanonicalSize);
  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                             Instruction *DefX, PHINode *CntPhi,
                             Instruction *CntInst);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  bool recognizeShiftUntilLessThan();
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop,
                                bool InsertSub = false);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();
  bool recognizeAndInsertStrLen();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  std::optional<PolynomialInfo> HR;

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy" || Name == "strlen" ||
      Name == "wcslen")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  // TODO: Unconditionally enable use of the memset pattern intrinsic (or at
  // least, opt-in via target hook) once we are confident it will never result
  // in worse codegen than without. For now, use it only when the target
  // supports the memset_pattern16 libcall (or when overridden by the
  // command-line option).
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
      HasMemcpy || !DisableLIRP::HashRecognize)
    if (SE->hasLoopInvariantBackedgeTakenCount(CurLoop))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (BECount->isZero())
    return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }

  // Optimize a CRC loop if HashRecognize found one, provided we're not
  // optimizing for size.
  if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
    if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
      optimizeCRCLoop(*Res);

  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset.pattern intrinsic, return the Constant that should
/// be passed in. Otherwise, return null.
///
/// TODO: this function could allow more constants than it does today (e.g.
/// those over 16 bytes) now that it has transitioned to being used for the
/// memset.pattern intrinsic rather than directly for the memset_pattern16
/// libcall.
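///
/// For example (an illustrative case, not from this file), a loop storing the
/// i32 constant 0x01020304 each iteration is not bytewise-splattable, but this
/// routine allows it to become a memset.pattern call with a 4-byte pattern.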
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // For now, don't handle types that aren't int, floats, or pointers.
  Type *CTy = C->getType();
  if (!CTy->isIntOrPtrTy() && !CTy->isFloatingPointTy())
    return nullptr;

  return C;
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which store
  // integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
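  // For instance (an illustrative case), a store through a pointer that
  // starts at &A[0] and advances by 4 bytes per iteration has the affine
  // AddRec SCEV {&A[0],+,4}<%loop>.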
  const SCEV *StoreEv = SE->getSCEV(StorePtr);
  const SCEVConstant *Stride;
  if (!match(StoreEv, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Stride),
                                          m_SpecificLoop(CurLoop))))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) &&
      !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    APInt StrideAP = Stride->getAPInt();
    if (StoreSize != StrideAP && StoreSize != -StrideAP)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEV *LoadEv = SE->getSCEV(LI->getPointerOperand());

    // The store and load must share the same stride.
    if (!match(LoadEv, m_scev_AffineAddRec(m_SCEV(), m_scev_Specific(Stride),
                                           m_SpecificLoop(CurLoop))))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given stores can be promoted to a memset.
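///
/// For example (an illustrative sketch, not from this file), a hand-unrolled
/// pair of consecutive stores
///   p[2*i] = 0; p[2*i+1] = 0;
/// chains head-to-tail below and can be combined into one wide memset.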
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert_range(AdjacentStores);
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !MCI->isForceInlined()) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(Dest);
  const SCEV *LoadEv = SE->getSCEV(Source);
  const APInt *StoreStrideValue, *LoadStrideValue;
  if (!match(StoreEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StoreStrideValue),
                                 m_SpecificLoop(CurLoop))) ||
      !match(LoadEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(LoadStrideValue),
                                 m_SpecificLoop(CurLoop))))
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Huge stride value - give up.
  if (StoreStrideValue->getBitWidth() > 64 ||
      LoadStrideValue->getBitWidth() > 64)
    return false;

  if (SizeInBytes != *StoreStrideValue && SizeInBytes != -*StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI,
      cast<SCEVAddRecExpr>(StoreEv), cast<SCEVAddRecExpr>(LoadEv), BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *Ev = SE->getSCEV(Pointer);
  const SCEV *PointerStrideSCEV;
  if (!match(Ev, m_scev_AffineAddRec(m_SCEV(), m_SCEV(PointerStrideSCEV),
                                     m_SpecificLoop(CurLoop)))) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const APInt *Stride;
    if (!match(PointerStrideSCEV, m_scev_APInt(Stride)))
      return false;

    if (SizeInBytes != *Stride && SizeInBytes != -*Stride)
      return false;

    IsNegStride = SizeInBytes == -*Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass will not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
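      //
      // E.g. (an illustrative case) with a dominating loop guard such as
      // n == 4, a pointer stride of 4 and a memset length of n both fold to
      // the constant 4 and compare equal.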
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs,
                                 cast<SCEVAddRecExpr>(Ev), BECount, IsNegStride,
                                 /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const APInt *BECst, *ConstSize;
  if (match(BECount, m_scev_APInt(BECst)) &&
      match(StoreSizeSCEV, m_scev_APInt(ConstSize))) {
    std::optional<uint64_t> BEInt = BECst->tryZExtValue();
    std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
    // FIXME: Should this check for overflow?
    if (BEInt && SizeInt)
      AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
  }

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] in a loop of 100 iterations will
  // always be reported as may-alias with a store to &A[100]; we need StoreLoc
  // to be "A" with a size of 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
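//
// For example (illustrative), a loop that stores i32 elements downward from
// Start = &A[n-1] with BECount = n-1 gets the base pointer
// &A[n-1] - (n-1)*4, i.e. &A[0].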
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index * store size
  return SE->getMinusSCEV(Start, Index);
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
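///
/// For example (illustrative), BECount = n-1 for a loop of n iterations of a
/// 4-byte store folds to ((n-1)+1)*4 = 4*n bytes.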
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;
  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  // MemsetArg is the number of bytes for the memset libcall, and the number
  // of pattern repetitions if the memset.pattern intrinsic is being used.
  Value *MemsetArg;
  std::optional<int64_t> BytesWritten;

  if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
    const SCEV *TripCountS =
        SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop);
    if (!Expander.isSafeToExpand(TripCountS))
      return Changed;
    const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
    if (!ConstStoreSize)
      return Changed;
    Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy,
                                              Preheader->getTerminator());
    uint64_t PatternRepsPerTrip =
        (ConstStoreSize->getValue()->getZExtValue() * 8) /
        DL->getTypeSizeInBits(PatternValue->getType());
    // If ConstStoreSize is not equal to the width of PatternValue, then
    // MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else
    // MemsetArg is just TripCount.
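    //
    // E.g. (an illustrative case) an i64 store whose PatternValue is an i32
    // gives PatternRepsPerTrip = (8*8)/32 = 2, so MemsetArg = TripCount * 2.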
    MemsetArg =
        PatternRepsPerTrip == 1
            ? TripCount
            : Builder.CreateMul(TripCount,
                                Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
                                                PatternRepsPerTrip));
    if (auto *CI = dyn_cast<ConstantInt>(TripCount))
      BytesWritten =
          CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();

  } else {
    const SCEV *NumBytesS =
        getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

    // TODO: ideally we should still be able to generate memset if SCEV expander
    // is taught to generate the dependencies at the latest point.
    if (!Expander.isSafeToExpand(NumBytesS))
      return Changed;
    MemsetArg =
        Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
    if (auto *CI = dyn_cast<ConstantInt>(MemsetArg))
      BytesWritten = CI->getZExtValue();
  }
  assert(MemsetArg && "MemsetArg should have been set");

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (BytesWritten)
    AATags = AATags.extendTo(BytesWritten.value());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg,
                                   MaybeAlign(StoreAlignment),
                                   /*isVolatile=*/false, AATags);
  } else if (ForceMemsetPatternIntrinsic ||
             isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");

    NewCall = Builder.CreateIntrinsic(
        Intrinsic::experimental_memset_pattern,
        {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
        {BasePtr, PatternValue, MemsetArg,
         ConstantInt::getFalse(M->getContext())});
    if (StoreAlignment)
      cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment);
    NewCall->setAAMetadata(AATags);
  } else {
    // Neither a memset, nor memset_pattern16.
    return Changed;
  }

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride,
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }
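  // Illustration (not from the source): with i32 elements and a positive
  // stride, "A[i] = A[i+1]" has LoadOff == StoreOff + StoreSize and may form
  // a memmove, while "A[i+1] = A[i]" overlaps within each iteration and is
  // rejected by the non-memcpy check above.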

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (auto *MCI = dyn_cast<MemCpyInst>(TheStore); MCI && MCI->isForceInlined())
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; currently expect a constant store size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // In the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
                                              Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;

  if (IsAtomic) {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;

    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;
  }

  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
  if (!IsAtomic) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
                                      LoadAlign, NumBytes,
                                      /*isVolatile=*/false, AATags);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags);
  } else {
    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

1543bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
1544 // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
1545 // carry-less multiplication instructions, which is more efficient than our
1546 // Sarwate table-lookup optimization. Hence, until we're able to emit
1547 // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
1548 // disable the optimization for Hexagon.
1549 Module &M = *CurLoop->getHeader()->getModule();
1550 Triple TT(M.getTargetTriple());
1551 if (TT.getArch() == Triple::hexagon)
1552 return false;
1553
1554 // First, create a new GlobalVariable corresponding to the
1555 // Sarwate-lookup-table.
1556 Type *CRCTy = Info.LHS->getType();
1557 unsigned CRCBW = CRCTy->getIntegerBitWidth();
1558 std::array<Constant *, 256> CRCConstants;
1559 transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
1560 CRCConstants.begin(),
1561 [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
1562 Constant *ConstArray =
1563 ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
1564 GlobalVariable *GV =
1565 new GlobalVariable(M, ConstArray->getType(), true,
1566 GlobalValue::PrivateLinkage, ConstArray, ".crctable");
1567
1570
1571 // Next, mark all PHIs for removal except IV.
1572 {
1573 for (PHINode &PN : CurLoop->getHeader()->phis()) {
1574 if (&PN == IV)
1575 continue;
1576 PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
1577 Cleanup.push_back(&PN);
1578 }
1579 }
1580
1581 // Next, fix up the trip count.
1582 {
1583 unsigned NewBTC = (Info.TripCount / 8) - 1;
1584 BasicBlock *LoopBlk = CurLoop->getLoopLatch();
1585 BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
1586 CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
1587 ? ICmpInst::Predicate::ICMP_NE
1588 : ICmpInst::Predicate::ICMP_EQ;
1589 Instruction *ExitCond = CurLoop->getLatchCmpInst();
1590 Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
1591 IRBuilder<> Builder(ExitCond);
1592 Value *NewExitCond =
1593 Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
1594 ExitCond->replaceAllUsesWith(NewExitCond);
1595 deleteDeadInstruction(ExitCond);
1596 }
1597
1598 // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
1599 // uses of ComputedValue.
1600 //
1601 // Little-endian:
1602 // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
1603 // Big-Endian:
1604 // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
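// For reference, a minimal C sketch of the byte-at-a-time table lookup this
// block materializes (little-endian flavor; the names init, nbytes, tbl and
// msg_byte are illustrative, not from this pass):
//
//   uint32_t crc = init;
//   for (size_t i = 0; i < nbytes; ++i)
//     crc = (crc >> 8) ^ tbl[(uint8_t)crc ^ msg_byte(i)];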
1605 {
1606 auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
1607 return Builder.CreateZExtOrTrunc(
1608 Op, IntegerType::getInt8Ty(Op->getContext()), Name);
1609 };
1610 auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
1611 const Twine &Name) {
1612 Type *OpTy = Op->getType();
1613
1614 // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
1615 // use the CRC's bitwidth as the reference for shifting right.
1616 return LoByte(Builder,
1617 CRCBW > 8 ? Builder.CreateLShr(
1618 Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
1619 : Op,
1620 Name + ".lo.byte");
1621 };
1622
1623 IRBuilder<> Builder(CurLoop->getHeader(),
1624 CurLoop->getHeader()->getFirstNonPHIIt());
1625
1626 // Create the CRC PHI, and initialize its incoming value to the initial
1627 // value of CRC.
1628 PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
1629 CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
1630
1631 // CRC is now an evolving variable, initialized to the PHI.
1632 Value *CRC = CRCPhi;
1633
1634 // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
1635 // of LHSAux), if LHSAux is non-nullptr.
1636 Value *Indexer = CRC;
1637 if (Value *Data = Info.LHSAux) {
1638 Type *DataTy = Data->getType();
1639
1640 // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
1641 // shift right by that amount, and take the lo-byte (in the little-endian
1642 // case), or shift left by that amount, and take the hi-idx (in the
1643 // big-endian case).
1644 Value *IVBits = Builder.CreateZExtOrTrunc(
1645 Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
1646 Value *DataIndexer =
1647 Info.ByteOrderSwapped
1648 ? Builder.CreateShl(Data, IVBits, "data.indexer")
1649 : Builder.CreateLShr(Data, IVBits, "data.indexer");
1650 Indexer = Builder.CreateXor(
1651 DataIndexer,
1652 Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
1653 "crc.data.indexer");
1654 }
1655
1656 Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
1657 : LoByte(Builder, Indexer, "indexer.lo");
1658
1659 // Always index into a GEP using the index type.
1660 Indexer = Builder.CreateZExt(
1661 Indexer, SE->getDataLayout().getIndexType(GV->getType()),
1662 "indexer.ext");
1663
1664 // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
1665 Value *CRCTableGEP =
1666 Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
1667 Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
1668
1669 // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
1670 // CRC-8.
1671 Value *CRCNext = CRCTableLd;
1672 if (CRCBW > 8) {
1673 Value *CRCShift = Info.ByteOrderSwapped
1674 ? Builder.CreateShl(CRC, 8, "crc.be.shift")
1675 : Builder.CreateLShr(CRC, 8, "crc.le.shift");
1676 CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
1677 }
1678
1679 // Connect the back-edge for the loop, and RAUW the ComputedValue.
1680 CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
1681 Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
1682 CurLoop->getLoopLatch());
1683 }
1684
1685 // Cleanup.
1686 {
1687 for (PHINode *PN : Cleanup)
1688 RecursivelyDeleteDeadPHINode(PN);
1689 SE->forgetLoop(CurLoop);
1690 }
1691 return true;
1692}
1693
1694bool LoopIdiomRecognize::runOnNoncountableLoop() {
1695 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1696 << CurLoop->getHeader()->getParent()->getName()
1697 << "] Noncountable Loop %"
1698 << CurLoop->getHeader()->getName() << "\n");
1699
1700 return recognizePopcount() || recognizeAndInsertFFS() ||
1701 recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1702 recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
1703}
1704
1705 /// Check if the given conditional branch is based on a comparison between
1706 /// a variable and zero, and if the variable is non-zero (or zero, when
1707 /// JmpOnZero is true), control yields to the loop entry. If the branch matches
1708 /// this behavior, the variable involved in the comparison is returned. This
1709 /// function is called to check whether the precondition and postcondition of
1710 /// the loop are in desirable form.
1711 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1712 bool JmpOnZero = false) {
1713 if (!BI || !BI->isConditional())
1714 return nullptr;
1715
1716 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1717 if (!Cond)
1718 return nullptr;
1719
1720 auto *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1721 if (!CmpZero || !CmpZero->isZero())
1722 return nullptr;
1723
1724 BasicBlock *TrueSucc = BI->getSuccessor(0);
1725 BasicBlock *FalseSucc = BI->getSuccessor(1);
1726 if (JmpOnZero)
1727 std::swap(TrueSucc, FalseSucc);
1728
1729 ICmpInst::Predicate Pred = Cond->getPredicate();
1730 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1731 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1732 return Cond->getOperand(0);
1733
1734 return nullptr;
1735}
1736
1737namespace {
1738
1739class StrlenVerifier {
1740public:
1741 explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
1742 const TargetLibraryInfo *TLI)
1743 : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1744
1745 bool isValidStrlenIdiom() {
1746 // Give up if the loop has multiple blocks, multiple backedges, or
1747 // multiple exit blocks
1748 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
1749 !CurLoop->getUniqueExitBlock())
1750 return false;
1751
1752 // It should have a preheader and a branch instruction.
1753 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1754 if (!Preheader)
1755 return false;
1756
1757 BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
1758 if (!EntryBI)
1759 return false;
1760
1761 // The loop exit must be conditioned on an icmp against 0, the null
1762 // terminator. The icmp operand has to be a load through an SSA pointer
1763 // that is incremented by one character each iteration of the loop.
1764 BasicBlock *LoopBody = *CurLoop->block_begin();
1765
1766 // Skip if the body is too big as it most likely is not a strlen idiom.
1767 if (!LoopBody || LoopBody->size() >= 15)
1768 return false;
1769
1770 BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
1771 Value *LoopCond = matchCondition(LoopTerm, LoopBody);
1772 if (!LoopCond)
1773 return false;
1774
1775 LoadInst *LoopLoad = dyn_cast<LoadInst>(LoopCond);
1776 if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
1777 return false;
1778
1779 OperandType = LoopLoad->getType();
1780 if (!OperandType || !OperandType->isIntegerTy())
1781 return false;
1782
1783 // See if the pointer expression is an AddRec with constant step a of form
1784 // ({n,+,a}) where a is the width of the char type.
1785 Value *IncPtr = LoopLoad->getPointerOperand();
1786 const SCEV *LoadEv = SE->getSCEV(IncPtr);
1787 const APInt *Step;
1788 if (!match(LoadEv,
1789 m_scev_AffineAddRec(m_SCEV(LoadBaseEv), m_scev_APInt(Step))))
1790 return false;
1791
1792 LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1793
1794 unsigned StepSize = Step->getZExtValue();
1795
1796 // Verify that StepSize is consistent with platform char width.
1797 OpWidth = OperandType->getIntegerBitWidth();
1798 unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
1799 if (OpWidth != StepSize * 8)
1800 return false;
1801 if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
1802 return false;
1803 if (OpWidth >= 16)
1804 if (OpWidth != WcharSize * 8)
1805 return false;
1806
1807 // Scan every instruction in the loop to ensure there are no side effects.
1808 for (Instruction &I : *LoopBody)
1809 if (I.mayHaveSideEffects())
1810 return false;
1811
1812 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1813 if (!LoopExitBB)
1814 return false;
1815
1816 for (PHINode &PN : LoopExitBB->phis()) {
1817 if (!SE->isSCEVable(PN.getType()))
1818 return false;
1819
1820 const SCEV *Ev = SE->getSCEV(&PN);
1821 if (!Ev)
1822 return false;
1823
1824 LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1825
1826 // Since we verified that the loop is a valid strlen idiom, we can expand
1827 // every lcssa phi with SCEV {n,+,1} as (n + strlen) and use SCEVExpander
1828 // to materialize the loop output.
1829 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1830 if (!AddRecEv || !AddRecEv->isAffine())
1831 return false;
1832
1833 // We only want AddRec expressions whose recurrence step is constant. This
1834 // is good enough for all the idioms we want to recognize. Later we expand
1835 // and materialize the recurrence as {base,+,a} -> (base + a * strlen).
1836 if (!isa<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
1837 return false;
1838 }
1839
1840 return true;
1841 }
1842
1843public:
1844 const Loop *CurLoop;
1845 ScalarEvolution *SE;
1846 const TargetLibraryInfo *TLI;
1847
1848 unsigned OpWidth;
1849 ConstantInt *StepSizeCI;
1850 const SCEV *LoadBaseEv;
1852};
1853
1854} // namespace
1855
1856/// The Strlen Idiom we are trying to detect has the following structure
1857///
1858/// preheader:
1859/// ...
1860/// br label %body, ...
1861///
1862/// body:
1863/// ... ; %0 is incremented by a gep
1864/// %1 = load i8, ptr %0, align 1
1865/// %2 = icmp eq i8 %1, 0
1866/// br i1 %2, label %exit, label %body
1867///
1868/// exit:
1869/// %lcssa = phi [%0, %body], ...
1870///
1871 /// We expect the strlen idiom to have a load of a character type that
1872 /// is compared against '\0', where the load's pointer operand has a scev
1873 /// expression of the form {%str,+,c}, with c a ConstantInt of the
1874 /// appropriate character width for the idiom and %str the base of the string.
1875 /// Additionally, all lcssa phis must have the form {...,+,n} with n constant.
1876 ///
1877 /// When transforming the output of the strlen idiom, the lcssa phis are
1878 /// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
1879/// and all subsequent uses are replaced. For example,
1880///
1881/// \code{.c}
1882/// const char* base = str;
1883/// while (*str != '\0')
1884/// ++str;
1885/// size_t result = str - base;
1886/// \endcode
1887///
1888/// will be transformed as follows: The idiom will be replaced by a strlen
1889/// computation to compute the address of the null terminator of the string.
1890///
1891/// \code{.c}
1892/// const char* base = str;
1893/// const char* end = base + strlen(str);
1894/// size_t result = end - base;
1895/// \endcode
1896///
1897/// In the case we index by an induction variable, as long as the induction
1898/// variable has a constant int increment, we can replace all such indvars
1899/// with the closed form computation of strlen
1900///
1901/// \code{.c}
1902/// size_t i = 0;
1903/// while (str[i] != '\0')
1904/// ++i;
1905/// size_t result = i;
1906/// \endcode
1907///
1908/// Will be replaced by
1909///
1910/// \code{.c}
1911/// size_t i = 0 + strlen(str);
1912/// size_t result = i;
1913/// \endcode
1914///
1915bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1916 if (DisableLIRP::All)
1917 return false;
1918
1919 StrlenVerifier Verifier(CurLoop, SE, TLI);
1920
1921 if (!Verifier.isValidStrlenIdiom())
1922 return false;
1923
1924 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1925 BasicBlock *LoopBody = *CurLoop->block_begin();
1926 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1927 BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
1928 assert(Preheader && LoopBody && LoopExitBB && LoopTerm &&
1929 "Should be verified to be valid by StrlenVerifier");
1930
1931 if (Verifier.OpWidth == 8) {
1932 if (DisableLIRP::Strlen)
1933 return false;
1934 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
1935 return false;
1936 } else {
1937 if (DisableLIRP::Wcslen)
1938 return false;
1939 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
1940 return false;
1941 }
1942
1943 IRBuilder<> Builder(Preheader->getTerminator());
1944 Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1945 SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(),
1946 "strlen_idiom");
1947 Value *MaterializedBase = Expander.expandCodeFor(
1948 Verifier.LoadBaseEv, Verifier.LoadBaseEv->getType(),
1949 Builder.GetInsertPoint());
1950 
1951 Value *StrLenFunc = nullptr;
1952 if (Verifier.OpWidth == 8) {
1953 StrLenFunc = emitStrLen(MaterializedBase, Builder, *DL, TLI);
1954 } else {
1955 StrLenFunc = emitWcsLen(MaterializedBase, Builder, *DL, TLI);
1956 }
1957 assert(StrLenFunc && "Failed to emit strlen function.");
1958
1959 const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
1960 SmallVector<PHINode *, 4> Cleanup;
1961 for (PHINode &PN : LoopExitBB->phis()) {
1962 // We can now materialize the loop output as all phi have scev {base,+,a}.
1963 // We expand the phi as:
1964 // %strlen = call i64 @strlen(%str)
1965 // %phi.new = base expression + step * %strlen
1966 const SCEV *Ev = SE->getSCEV(&PN);
1967 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1968 const SCEVConstant *Step =
1969 dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE));
1970 const SCEV *Base = AddRecEv->getStart();
1971
1972 // It is safe to truncate to the base's type since, if the base is narrower
1973 // than size_t, the equivalent user code would have to truncate anyway.
1974 const SCEV *NewEv = SE->getAddExpr(
1975 Base, SE->getMulExpr(Step, SE->getTruncateOrSignExtend(
1976 StrlenEv, Base->getType())));
1977
1978 Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
1979 Builder.GetInsertPoint());
1980 Expander.clear();
1981 PN.replaceAllUsesWith(MaterializedPHI);
1982 Cleanup.push_back(&PN);
1983 }
1984
1985 // All LCSSA loop phis are now dead; the leftover dead loop body can be
1986 // cleaned up by later passes.
1987 for (PHINode *PN : Cleanup)
1988 RecursivelyDeleteDeadPHINode(PN);
1989 
1990 // LoopDeletion only deletes invariant loops with a known trip count. Update
1991 // the condition so it will reliably delete the invariant loop.
1992 assert(LoopTerm->getNumSuccessors() == 2 &&
1993 (LoopTerm->getSuccessor(0) == LoopBody ||
1994 LoopTerm->getSuccessor(1) == LoopBody) &&
1995 "loop body must have a successor that is it self");
1996 ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
1997 ? Builder.getFalse()
1998 : Builder.getTrue();
1999 LoopTerm->setCondition(NewLoopCond);
2000 SE->forgetLoop(CurLoop);
2001
2002 ++NumStrLen;
2003 LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
2004 ORE.emit([&]() {
2005 return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
2006 CurLoop->getStartLoc(), Preheader)
2007 << "Transformed " << StrLenFunc->getName() << " loop idiom";
2008 });
2009
2010 return true;
2011}
2012
2013/// Check if the given conditional branch is based on an unsigned less-than
2014/// comparison between a variable and a constant, and if the comparison is false
2015 /// the control yields to the loop entry. If the branch matches this behavior,
2016 /// the variable involved in the comparison is returned.
2017 static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
2018 APInt &Threshold) {
2019 if (!BI || !BI->isConditional())
2020 return nullptr;
2021
2022 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
2023 if (!Cond)
2024 return nullptr;
2025
2026 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
2027 if (!CmpConst)
2028 return nullptr;
2029
2030 BasicBlock *FalseSucc = BI->getSuccessor(1);
2031 ICmpInst::Predicate Pred = Cond->getPredicate();
2032
2033 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
2034 Threshold = CmpConst->getValue();
2035 return Cond->getOperand(0);
2036 }
2037
2038 return nullptr;
2039}
2040
2041// Check if the recurrence variable `VarX` is in the right form to create
2042// the idiom. Returns the value coerced to a PHINode if so.
2043 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
2044 BasicBlock *LoopEntry) {
2045 auto *PhiX = dyn_cast<PHINode>(VarX);
2046 if (PhiX && PhiX->getParent() == LoopEntry &&
2047 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
2048 return PhiX;
2049 return nullptr;
2050}
2051
2052/// Return true if the idiom is detected in the loop.
2053///
2054/// Additionally:
2055/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2056/// or nullptr if there is no such.
2057/// 2) \p CntPhi is set to the corresponding phi node
2058/// or nullptr if there is no such.
2059/// 3) \p InitX is set to the value whose CTLZ could be used.
2060/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2061/// 5) \p Threshold is set to the constant involved in the unsigned less-than
2062/// comparison.
2063///
2064/// The core idiom we are trying to detect is:
2065/// \code
2066/// if (x0 < 2)
2067/// goto loop-exit // the precondition of the loop
2068/// cnt0 = init-val
2069/// do {
2070/// x = phi (x0, x.next); //PhiX
2071/// cnt = phi (cnt0, cnt.next)
2072///
2073/// cnt.next = cnt + 1;
2074/// ...
2075/// x.next = x >> 1; // DefX
2076/// } while (x >= 4)
2077/// loop-exit:
2078/// \endcode
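/// A rough source-level equivalent of the above (an illustrative sketch; x0
/// and cnt0 stand for arbitrary inputs):
/// \code
///   if (x0 < 2) goto loop_exit;
///   unsigned cnt = cnt0, x = x0;
///   do { ++cnt; x >>= 1; } while (x >= 4);
/// \endcode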
2079 static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
2080 Intrinsic::ID &IntrinID,
2081 Value *&InitX, Instruction *&CntInst,
2082 PHINode *&CntPhi, Instruction *&DefX,
2083 APInt &Threshold) {
2084 BasicBlock *LoopEntry;
2085
2086 DefX = nullptr;
2087 CntInst = nullptr;
2088 CntPhi = nullptr;
2089 LoopEntry = *(CurLoop->block_begin());
2090
2091 // step 1: Check if the loop-back branch is in desirable form.
2092 if (Value *T = matchShiftULTCondition(
2093 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
2094 Threshold))
2095 DefX = dyn_cast<Instruction>(T);
2096 else
2097 return false;
2098
2099 // step 2: Check the recurrence of variable X
2100 if (!DefX || !isa<PHINode>(DefX))
2101 return false;
2102
2103 PHINode *VarPhi = cast<PHINode>(DefX);
2104 int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
2105 if (Idx == -1)
2106 return false;
2107
2108 DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
2109 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
2110 return false;
2111
2112 // step 3: detect instructions corresponding to "x.next = x >> 1"
2113 if (DefX->getOpcode() != Instruction::LShr)
2114 return false;
2115
2116 IntrinID = Intrinsic::ctlz;
2117 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
2118 if (!Shft || !Shft->isOne())
2119 return false;
2120
2121 InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2122
2123 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
2124 // or cnt.next = cnt + -1.
2125 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2126 // then all uses of "cnt.next" could be optimized to the trip count
2127 // plus "cnt0". Currently it is not optimized.
2128 // This step could be used to detect POPCNT instruction:
2129 // cnt.next = cnt + (x.next & 1)
2130 for (Instruction &Inst :
2131 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2132 if (Inst.getOpcode() != Instruction::Add)
2133 continue;
2134
2135 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2136 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2137 continue;
2138
2139 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2140 if (!Phi)
2141 continue;
2142
2143 CntInst = &Inst;
2144 CntPhi = Phi;
2145 break;
2146 }
2147 if (!CntInst)
2148 return false;
2149
2150 return true;
2151}
2152
2153/// Return true iff the idiom is detected in the loop.
2154///
2155/// Additionally:
2156/// 1) \p CntInst is set to the instruction counting the population bit.
2157/// 2) \p CntPhi is set to the corresponding phi node.
2158/// 3) \p Var is set to the value whose population bits are being counted.
2159///
2160/// The core idiom we are trying to detect is:
2161/// \code
2162/// if (x0 != 0)
2163/// goto loop-exit // the precondition of the loop
2164/// cnt0 = init-val;
2165/// do {
2166/// x1 = phi (x0, x2);
2167/// cnt1 = phi(cnt0, cnt2);
2168///
2169/// cnt2 = cnt1 + 1;
2170/// ...
2171/// x2 = x1 & (x1 - 1);
2172/// ...
2173/// } while(x != 0);
2174///
2175/// loop-exit:
2176/// \endcode
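/// In C, this is the classic Kernighan bit-count (an illustrative sketch, not
/// code from this pass):
/// \code
///   int cnt = 0;
///   while (x) { x &= x - 1; ++cnt; }  // each step clears the lowest set bit
/// \endcode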
2177static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
2178 Instruction *&CntInst, PHINode *&CntPhi,
2179 Value *&Var) {
2180 // step 1: Check to see if the loop-back branch matches this pattern:
2181 // "if (a!=0) goto loop-entry".
2182 BasicBlock *LoopEntry;
2183 Instruction *DefX2, *CountInst;
2184 Value *VarX1, *VarX0;
2185 PHINode *PhiX, *CountPhi;
2186
2187 DefX2 = CountInst = nullptr;
2188 VarX1 = VarX0 = nullptr;
2189 PhiX = CountPhi = nullptr;
2190 LoopEntry = *(CurLoop->block_begin());
2191
2192 // step 1: Check if the loop-back branch is in desirable form.
2193 {
2194 if (Value *T = matchCondition(
2195 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
2196 DefX2 = dyn_cast<Instruction>(T);
2197 else
2198 return false;
2199 }
2200
2201 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
2202 {
2203 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
2204 return false;
2205
2206 BinaryOperator *SubOneOp;
2207
2208 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
2209 VarX1 = DefX2->getOperand(1);
2210 else {
2211 VarX1 = DefX2->getOperand(0);
2212 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
2213 }
2214 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
2215 return false;
2216
2217 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
2218 if (!Dec ||
2219 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
2220 (SubOneOp->getOpcode() == Instruction::Add &&
2221 Dec->isMinusOne()))) {
2222 return false;
2223 }
2224 }
2225
2226 // step 3: Check the recurrence of variable X
2227 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
2228 if (!PhiX)
2229 return false;
2230
2231 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
2232 {
2233 CountInst = nullptr;
2234 for (Instruction &Inst :
2235 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2236 if (Inst.getOpcode() != Instruction::Add)
2237 continue;
2238
2239 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2240 if (!Inc || !Inc->isOne())
2241 continue;
2242
2243 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2244 if (!Phi)
2245 continue;
2246
2247 // Check if the result of the instruction is live out of the loop.
2248 bool LiveOutLoop = false;
2249 for (User *U : Inst.users()) {
2250 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
2251 LiveOutLoop = true;
2252 break;
2253 }
2254 }
2255
2256 if (LiveOutLoop) {
2257 CountInst = &Inst;
2258 CountPhi = Phi;
2259 break;
2260 }
2261 }
2262
2263 if (!CountInst)
2264 return false;
2265 }
2266
2267 // step 5: check if the precondition is in this form:
2268 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2269 {
2270 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2271 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
2272 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
2273 return false;
2274
2275 CntInst = CountInst;
2276 CntPhi = CountPhi;
2277 Var = T;
2278 }
2279
2280 return true;
2281}
2282
2283/// Return true if the idiom is detected in the loop.
2284///
2285/// Additionally:
2286/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2287/// or nullptr if there is no such.
2288/// 2) \p CntPhi is set to the corresponding phi node
2289/// or nullptr if there is no such.
2290/// 3) \p Var is set to the value whose CTLZ could be used.
2291/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2292///
2293/// The core idiom we are trying to detect is:
2294/// \code
2295/// if (x0 == 0)
2296/// goto loop-exit // the precondition of the loop
2297/// cnt0 = init-val;
2298/// do {
2299/// x = phi (x0, x.next); //PhiX
2300/// cnt = phi(cnt0, cnt.next);
2301///
2302/// cnt.next = cnt + 1;
2303/// ...
2304/// x.next = x >> 1; // DefX
2305/// ...
2306/// } while(x.next != 0);
2307///
2308/// loop-exit:
2309/// \endcode
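/// For the right-shift (ctlz) flavor, a source-level sketch of the loop above
/// (illustrative; the precondition guarantees x0 != 0):
/// \code
///   unsigned cnt = cnt0, x = x0;
///   do { ++cnt; x >>= 1; } while (x != 0);
/// \endcode
/// The body runs exactly bitwidth(x0) - ctlz(x0) times, which is the closed
/// form the transformation materializes.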
2310static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
2311 Intrinsic::ID &IntrinID, Value *&InitX,
2312 Instruction *&CntInst, PHINode *&CntPhi,
2313 Instruction *&DefX) {
2314 BasicBlock *LoopEntry;
2315 Value *VarX = nullptr;
2316
2317 DefX = nullptr;
2318 CntInst = nullptr;
2319 CntPhi = nullptr;
2320 LoopEntry = *(CurLoop->block_begin());
2321
2322 // step 1: Check if the loop-back branch is in desirable form.
2323 if (Value *T = matchCondition(
2324 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
2325 DefX = dyn_cast<Instruction>(T);
2326 else
2327 return false;
2328
2329 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2330 if (!DefX || !DefX->isShift())
2331 return false;
2332 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2333 Intrinsic::ctlz;
2334 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
2335 if (!Shft || !Shft->isOne())
2336 return false;
2337 VarX = DefX->getOperand(0);
2338
2339 // step 3: Check the recurrence of variable X
2340 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2341 if (!PhiX)
2342 return false;
2343
2344 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2345
2346 // Make sure the initial value can't be negative; otherwise the ashr in the
2347 // loop might never reach zero, which would make the loop infinite.
2348 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
2349 return false;
2350
2351 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
2352 // or cnt.next = cnt + -1.
2353 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2354 // then all uses of "cnt.next" could be optimized to the trip count
2355 // plus "cnt0". Currently it is not optimized.
2356 // This step could be used to detect POPCNT instruction:
2357 // cnt.next = cnt + (x.next & 1)
2358 for (Instruction &Inst :
2359 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2360 if (Inst.getOpcode() != Instruction::Add)
2361 continue;
2362
2363 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2364 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2365 continue;
2366
2367 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2368 if (!Phi)
2369 continue;
2370
2371 CntInst = &Inst;
2372 CntPhi = Phi;
2373 break;
2374 }
2375 if (!CntInst)
2376 return false;
2377
2378 return true;
2379}
2380
2381// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
2382// profitable if we delete the loop.
2383bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2384 Value *InitX, bool ZeroCheck,
2385 size_t CanonicalSize) {
2386 const Value *Args[] = {InitX,
2387 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
2388
2389 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
2390 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
2391 uint32_t HeaderSize =
2392 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
2393
2394 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2395 InstructionCost Cost = TTI->getIntrinsicInstrCost(
2396 Attrs, TargetTransformInfo::TCK_SizeAndLatency);
2397 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2398 return false;
2399
2400 return true;
2401}
2402
2403 /// Convert a CTLZ / CTTZ idiom loop into a countable loop.
2404 /// Returns true if a CTLZ / CTTZ intrinsic is inserted as the new trip count;
2405 /// otherwise, returns false.
2406bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2407 Value *InitX, Instruction *DefX,
2408 PHINode *CntPhi,
2409 Instruction *CntInst) {
2410 bool IsCntPhiUsedOutsideLoop = false;
2411 for (User *U : CntPhi->users())
2412 if (!CurLoop->contains(cast<Instruction>(U))) {
2413 IsCntPhiUsedOutsideLoop = true;
2414 break;
2415 }
2416 bool IsCntInstUsedOutsideLoop = false;
2417 for (User *U : CntInst->users())
2418 if (!CurLoop->contains(cast<Instruction>(U))) {
2419 IsCntInstUsedOutsideLoop = true;
2420 break;
2421 }
2422 // If both CntInst and CntPhi are used outside the loop the profitability
2423 // is questionable.
2424 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2425 return false;
2426
2427 // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X
2428 // is 0. If we cannot guarantee X != 0, we need to check this when
2429 // expanding the intrinsic.
2430 bool ZeroCheck = false;
2431 // It is safe to assume the Preheader exists, as it was checked in the
2432 // parent function runOnLoop.
2433 BasicBlock *PH = CurLoop->getLoopPreheader();
2434
2435 // If we are using the count instruction outside the loop, make sure we
2436 // have a zero check as a precondition. Without the check, the loop would
2437 // run one iteration before any check of the input value; 0 and 1 would
2438 // then behave identically, so CntInst could not soundly be used outside.
2439 if (!IsCntPhiUsedOutsideLoop) {
2440 auto *PreCondBB = PH->getSinglePredecessor();
2441 if (!PreCondBB)
2442 return false;
2443 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2444 if (!PreCondBI)
2445 return false;
2446 if (matchCondition(PreCondBI, PH) != InitX)
2447 return false;
2448 ZeroCheck = true;
2449 }
2450
2451 // FFS idiom loop has only 6 instructions:
2452 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2453 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2454 // %shr = ashr %n.addr.0, 1
2455 // %tobool = icmp eq %shr, 0
2456 // %inc = add nsw %i.0, 1
2457 // br i1 %tobool
2458 size_t IdiomCanonicalSize = 6;
2459 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2460 return false;
2461
2462 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2463 DefX->getDebugLoc(), ZeroCheck,
2464 IsCntPhiUsedOutsideLoop);
2465 return true;
2466}
2467
2468/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
2469/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
2470/// trip count returns true; otherwise, returns false.
2471bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2472 // Give up if the loop has multiple blocks or multiple backedges.
2473 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2474 return false;
2475
2476 Intrinsic::ID IntrinID;
2477 Value *InitX;
2478 Instruction *DefX = nullptr;
2479 PHINode *CntPhi = nullptr;
2480 Instruction *CntInst = nullptr;
2481
2482 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
2483 DefX))
2484 return false;
2485
2486 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2487}
2488
2489bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2490 // Give up if the loop has multiple blocks or multiple backedges.
2491 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2492 return false;
2493
2494 Intrinsic::ID IntrinID;
2495 Value *InitX;
2496 Instruction *DefX = nullptr;
2497 PHINode *CntPhi = nullptr;
2498 Instruction *CntInst = nullptr;
2499
2500 APInt LoopThreshold;
2501 if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2502 CntPhi, DefX, LoopThreshold))
2503 return false;
2504
2505 if (LoopThreshold == 2) {
2506 // Treat as regular FFS.
2507 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2508 }
2509
2510 // Look for Floor Log2 Idiom.
2511 if (LoopThreshold != 4)
2512 return false;
2513
2514 // Abort if CntPhi is used outside of the loop.
2515 for (User *U : CntPhi->users())
2516 if (!CurLoop->contains(cast<Instruction>(U)))
2517 return false;
2518
2519 // It is safe to assume Preheader exist as it was checked in
2520 // parent function RunOnLoop.
2521 BasicBlock *PH = CurLoop->getLoopPreheader();
2522 auto *PreCondBB = PH->getSinglePredecessor();
2523 if (!PreCondBB)
2524 return false;
2525 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2526 if (!PreCondBI)
2527 return false;
2528
2529 APInt PreLoopThreshold;
2530 if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2531 PreLoopThreshold != 2)
2532 return false;
2533
2534 bool ZeroCheck = true;
2535
2536 // The loop has only 6 instructions:
2537 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2538 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2539 // %shr = ashr %n.addr.0, 1
2540 // %tobool = icmp ult %n.addr.0, C
2541 // %inc = add nsw %i.0, 1
2542 // br i1 %tobool
2543 size_t IdiomCanonicalSize = 6;
2544 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2545 return false;
2546
2547 // log2(x) = w − 1 − clz(x)
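// For example (a worked instance, not from the source): with w = 32 and
// x = 16, clz(16) = 27, so log2(16) = 32 - 1 - 27 = 4.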
2548 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2549 DefX->getDebugLoc(), ZeroCheck,
2550 /*IsCntPhiUsedOutsideLoop=*/false,
2551 /*InsertSub=*/true);
2552 return true;
2553}
2554
2555/// Recognizes a population count idiom in a non-countable loop.
2556///
2557/// If detected, transforms the relevant code to issue the popcount intrinsic
2558/// function call, and returns true; otherwise, returns false.
2559bool LoopIdiomRecognize::recognizePopcount() {
2560 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
2561 return false;
2562
2563 // Counting population is usually done with a few arithmetic instructions.
2564 // Such instructions can be easily "absorbed" by vacant slots in a
2565 // non-compact loop. Therefore, recognizing a popcount idiom only makes
2566 // sense in a compact loop.
2567
2568 // Give up if the loop has multiple blocks or multiple backedges.
2569 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2570 return false;
2571
2572 BasicBlock *LoopBody = *(CurLoop->block_begin());
2573 if (LoopBody->size() >= 20) {
2574 // The loop is too big, bail out.
2575 return false;
2576 }
2577
2578 // It should have a preheader containing nothing but an unconditional branch.
2579 BasicBlock *PH = CurLoop->getLoopPreheader();
2580 if (!PH || &PH->front() != PH->getTerminator())
2581 return false;
2582 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
2583 if (!EntryBI || EntryBI->isConditional())
2584 return false;
2585
2586 // It should have a precondition block where the generated popcount intrinsic
2587 // function can be inserted.
2588 auto *PreCondBB = PH->getSinglePredecessor();
2589 if (!PreCondBB)
2590 return false;
2591 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2592 if (!PreCondBI || PreCondBI->isUnconditional())
2593 return false;
2594
2595 Instruction *CntInst;
2596 PHINode *CntPhi;
2597 Value *Val;
2598 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
2599 return false;
2600
2601 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
2602 return true;
2603}
2604
2605 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2606 const DebugLoc &DL) {
2607 Value *Ops[] = {Val};
2608 Type *Tys[] = {Val->getType()};
2609
2610 CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
2611 CI->setDebugLoc(DL);
2612
2613 return CI;
2614}
2615
2616 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2617 const DebugLoc &DL, bool ZeroCheck,
2618 Intrinsic::ID IID) {
2619 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2620 Type *Tys[] = {Val->getType()};
2621
2622 CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
2623 CI->setDebugLoc(DL);
2624
2625 return CI;
2626}
2627
2628/// Transform the following loop (Using CTLZ, CTTZ is similar):
2629/// loop:
2630/// CntPhi = PHI [Cnt0, CntInst]
2631/// PhiX = PHI [InitX, DefX]
2632/// CntInst = CntPhi + 1
2633/// DefX = PhiX >> 1
2634/// LOOP_BODY
2635/// Br: loop if (DefX != 0)
2636/// Use(CntPhi) or Use(CntInst)
2637///
2638/// Into:
2639/// If CntPhi used outside the loop:
2640/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2641/// Count = CountPrev + 1
2642/// else
2643/// Count = BitWidth(InitX) - CTLZ(InitX)
2644/// loop:
2645/// CntPhi = PHI [Cnt0, CntInst]
2646/// PhiX = PHI [InitX, DefX]
2647/// PhiCount = PHI [Count, Dec]
2648/// CntInst = CntPhi + 1
2649/// DefX = PhiX >> 1
2650/// Dec = PhiCount - 1
2651/// LOOP_BODY
2652/// Br: loop if (Dec != 0)
2653/// Use(CountPrev + Cnt0) // Use(CntPhi)
2654/// or
2655/// Use(Count + Cnt0) // Use(CntInst)
2656///
2657/// If LOOP_BODY is empty the loop will be deleted.
2658/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
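/// For example (an illustrative instance, not from the source): with an i8
/// InitX = 0b00010110, CTLZ(InitX) = 3, so Count = 8 - 3 = 5, exactly the
/// number of shifts before DefX reaches zero.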
2659void LoopIdiomRecognize::transformLoopToCountable(
2660 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2661 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2662 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2663 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2664
2665 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2666 IRBuilder<> Builder(PreheaderBr);
2667 Builder.SetCurrentDebugLocation(DL);
2668
2669 // If there are no uses of CntPhi, create:
2670 // Count = BitWidth - CTLZ(InitX);
2671 // NewCount = Count;
2672 // If there are uses of CntPhi create:
2673 // NewCount = BitWidth - CTLZ(InitX >> 1);
2674 // Count = NewCount + 1;
2675 Value *InitXNext;
2676 if (IsCntPhiUsedOutsideLoop) {
2677 if (DefX->getOpcode() == Instruction::AShr)
2678 InitXNext = Builder.CreateAShr(InitX, 1);
2679 else if (DefX->getOpcode() == Instruction::LShr)
2680 InitXNext = Builder.CreateLShr(InitX, 1);
2681 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2682 InitXNext = Builder.CreateShl(InitX, 1);
2683 else
2684 llvm_unreachable("Unexpected opcode!");
2685 } else
2686 InitXNext = InitX;
2687 Value *Count =
2688 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2689 Type *CountTy = Count->getType();
2690 Count = Builder.CreateSub(
2691 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2692 if (InsertSub)
2693 Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
2694 Value *NewCount = Count;
2695 if (IsCntPhiUsedOutsideLoop)
2696 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2697
2698 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2699
2700 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2701 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2702 // If the counter was being incremented in the loop, add NewCount to the
2703 // counter's initial value, but only if the initial value is not zero.
2704 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2705 if (!InitConst || !InitConst->isZero())
2706 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2707 } else {
2708 // If the count was being decremented in the loop, subtract NewCount from
2709 // the counter's initial value.
2710 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2711 }
2712
2713 // Step 2: Insert new IV and loop condition:
2714 // loop:
2715 // ...
2716 // PhiCount = PHI [Count, Dec]
2717 // ...
2718 // Dec = PhiCount - 1
2719 // ...
2720 // Br: loop if (Dec != 0)
2721 BasicBlock *Body = *(CurLoop->block_begin());
2722 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2723 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2724
2725 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2726 TcPhi->insertBefore(Body->begin());
2727
2728 Builder.SetInsertPoint(LbCond);
2729 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2730 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2731
2732 TcPhi->addIncoming(Count, Preheader);
2733 TcPhi->addIncoming(TcDec, Body);
2734
2735 CmpInst::Predicate Pred =
2736 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2737 LbCond->setPredicate(Pred);
2738 LbCond->setOperand(0, TcDec);
2739 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2740
2741 // Step 3: All the references to the original counter outside
2742 // the loop are replaced with the NewCount
2743 if (IsCntPhiUsedOutsideLoop)
2744 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2745 else
2746 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2747
2748 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2749 // loop. The loop would otherwise not be deleted even if it becomes empty.
2750 SE->forgetLoop(CurLoop);
2751}
2752
2753void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2754 Instruction *CntInst,
2755 PHINode *CntPhi, Value *Var) {
2756 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2757 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2758 const DebugLoc &DL = CntInst->getDebugLoc();
2759
2760 // Assuming before transformation, the loop is following:
2761 // if (x) // the precondition
2762 // do { cnt++; x &= x - 1; } while(x);
2763
2764 // Step 1: Insert the ctpop instruction at the end of the precondition block
2765 IRBuilder<> Builder(PreCondBr);
2766 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2767 {
2768 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2769 NewCount = PopCntZext =
2770 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2771
2772 if (NewCount != PopCnt)
2773 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2774
2775 // TripCnt is exactly the number of iterations the loop has
2776 TripCnt = NewCount;
2777
2778 // If the population counter's initial value is not zero, insert Add Inst.
2779 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2780 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2781 if (!InitConst || !InitConst->isZero()) {
2782 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2783 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2784 }
2785 }
2786
2787 // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
2788 // "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
2789 // would be partially dead code, and downstream passes would drag it back
2790 // from the precondition block to the preheader.
2791 {
2792 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2793
2794 Value *Opnd0 = PopCntZext;
2795 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2796 if (PreCond->getOperand(0) != Var)
2797 std::swap(Opnd0, Opnd1);
2798
2799 ICmpInst *NewPreCond = cast<ICmpInst>(
2800 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2801 PreCondBr->setCondition(NewPreCond);
2802 
2803 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2804 }
2805
2806 // Step 3: Note that the population count is exactly the trip count of the
2807 // loop in question, which enables us to convert the loop from a
2808 // noncountable loop into a countable one. The benefit is twofold:
2809 //
2810 // - If the loop only counts population, the entire loop becomes dead after
2811 // the transformation. It is a lot easier to prove a countable loop dead
2812 // than to prove a noncountable one. (In some C dialects, an infinite loop
2813 // isn't dead even if it computes nothing useful. In general, DCE needs
2814 // to prove a noncountable loop finite before safely deleting it.)
2815 //
2816 // - If the loop also performs something else, it remains alive.
2817 // Since it is transformed to countable form, it can be aggressively
2818 // optimized by some optimizations which are in general not applicable
2819 // to a noncountable loop.
2820 //
2821 // After this step, this loop (conceptually) would look like following:
2822 // newcnt = __builtin_ctpop(x);
2823 // t = newcnt;
2824 // if (x)
2825 // do { cnt++; x &= x - 1; t--; } while (t > 0);
2826 BasicBlock *Body = *(CurLoop->block_begin());
2827 {
2828 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2829 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2830 Type *Ty = TripCnt->getType();
2831
2832 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2833 TcPhi->insertBefore(Body->begin());
2834
2835 Builder.SetInsertPoint(LbCond);
2836 Instruction *TcDec = cast<Instruction>(
2837 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2838 "tcdec", false, true));
2839
2840 TcPhi->addIncoming(TripCnt, PreHead);
2841 TcPhi->addIncoming(TcDec, Body);
2842
2843 CmpInst::Predicate Pred =
2844 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2845 LbCond->setPredicate(Pred);
2846 LbCond->setOperand(0, TcDec);
2847 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2848 }
2849
2850 // Step 4: All the references to the original population counter outside
2851 // the loop are replaced with the NewCount -- the value returned from
2852 // __builtin_ctpop().
2853 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2854
2855 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2856 // loop. The loop would otherwise not be deleted even if it becomes empty.
2857 SE->forgetLoop(CurLoop);
2858}
2859
2860/// Match loop-invariant value.
2861template <typename SubPattern_t> struct match_LoopInvariant {
2862 SubPattern_t SubPattern;
2863 const Loop *L;
2864
2865 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2866 : SubPattern(SP), L(L) {}
2867
2868 template <typename ITy> bool match(ITy *V) const {
2869 return L->isLoopInvariant(V) && SubPattern.match(V);
2870 }
2871};
2872
2873/// Matches if the value is loop-invariant.
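/// For example (usage drawn from the shift-until-bittest matcher later in
/// this file):
/// \code
///   match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop))
/// \endcode
/// succeeds only when V is a `1 << bitpos` computation that is invariant in
/// CurLoop.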
2874template <typename Ty>
2875inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2876 return match_LoopInvariant<Ty>(M, L);
2877}
2878
2879/// Return true if the idiom is detected in the loop.
2880///
2881/// The core idiom we are trying to detect is:
2882/// \code
2883/// entry:
2884/// <...>
2885/// %bitmask = shl i32 1, %bitpos
2886/// br label %loop
2887///
2888/// loop:
2889/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2890/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2891/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2892/// %x.next = shl i32 %x.curr, 1
2893/// <...>
2894/// br i1 %x.curr.isbitunset, label %loop, label %end
2895///
2896/// end:
2897/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2898/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2899/// <...>
2900/// \endcode
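/// In C, the loop above is roughly (an illustrative sketch):
/// \code
///   while ((x & (1u << bitpos)) == 0)
///     x <<= 1;
/// \endcode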
2901static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2902 Value *&BitMask, Value *&BitPos,
2903 Value *&CurrX, Instruction *&NextX) {
2905 " Performing shift-until-bittest idiom detection.\n");
2906
2907 // Give up if the loop has multiple blocks or multiple backedges.
2908 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2909 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2910 return false;
2911 }
2912
2913 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2914 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2915 assert(LoopPreheaderBB && "There is always a loop preheader.");
2916
2917 using namespace PatternMatch;
2918
2919 // Step 1: Check if the loop backedge is in desirable form.
2920
2921 CmpPredicate Pred;
2922 Value *CmpLHS, *CmpRHS;
2923 BasicBlock *TrueBB, *FalseBB;
2924 if (!match(LoopHeaderBB->getTerminator(),
2925 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2926 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2927 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2928 return false;
2929 }
2930
2931 // Step 2: Check if the backedge's condition is in desirable form.
2932
2933 auto MatchVariableBitMask = [&]() {
2934 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2935 match(CmpLHS,
2936 m_c_And(m_Value(CurrX),
2937 m_CombineAnd(
2938 m_Value(BitMask),
2939 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2940 CurLoop))));
2941 };
2942
2943 auto MatchDecomposableConstantBitMask = [&]() {
2944 auto Res = llvm::decomposeBitTestICmp(
2945 CmpLHS, CmpRHS, Pred, /*LookThroughTrunc=*/true,
2946 /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
2947 if (Res && Res->Mask.isPowerOf2()) {
2948 assert(ICmpInst::isEquality(Res->Pred));
2949 Pred = Res->Pred;
2950 CurrX = Res->X;
2951 BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
2952 BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
2953 return true;
2954 }
2955 return false;
2956 };
2957
2958 if (!MatchVariableBitMask() && !MatchDecomposableConstantBitMask()) {
2959 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2960 return false;
2961 }
2962
2963 // Step 3: Check if the recurrence is in desirable form.
2964 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2965 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2966 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2967 return false;
2968 }
2969
2970 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2971 NextX =
2972 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2973
2974 assert(CurLoop->isLoopInvariant(BaseX) &&
2975 "Expected BaseX to be available in the preheader!");
2976
2977 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2978 // FIXME: support right-shift?
2979 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2980 return false;
2981 }
2982
2983 // Step 4: Check if the backedge's destinations are in desirable form.
2984
2986 "Should only get equality predicates here.");
2987
2988 // cmp-br is commutative, so canonicalize to a single variant.
2989 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2990 Pred = ICmpInst::getInversePredicate(Pred);
2991 std::swap(TrueBB, FalseBB);
2992 }
2993
2994 // We expect to exit loop when comparison yields false,
2995 // so when it yields true we should branch back to loop header.
2996 if (TrueBB != LoopHeaderBB) {
2997 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2998 return false;
2999 }
3000
3001 // Okay, idiom checks out.
3002 return true;
3003}
3004
3005/// Look for the following loop:
3006/// \code
3007/// entry:
3008/// <...>
3009/// %bitmask = shl i32 1, %bitpos
3010/// br label %loop
3011///
3012/// loop:
3013/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
3014/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
3015/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
3016/// %x.next = shl i32 %x.curr, 1
3017/// <...>
3018/// br i1 %x.curr.isbitunset, label %loop, label %end
3019///
3020/// end:
3021/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3022/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3023/// <...>
3024/// \endcode
3025///
3026/// And transform it into:
3027/// \code
3028/// entry:
3029/// %bitmask = shl i32 1, %bitpos
3030/// %lowbitmask = add i32 %bitmask, -1
3031/// %mask = or i32 %lowbitmask, %bitmask
3032/// %x.masked = and i32 %x, %mask
3033/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
3034/// i1 true)
3035/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
3036/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
3037/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
3038/// %tripcount = add i32 %backedgetakencount, 1
3039/// %x.curr = shl i32 %x, %backedgetakencount
3040/// %x.next = shl i32 %x, %tripcount
3041/// br label %loop
3042///
3043/// loop:
3044/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
3045/// %loop.iv.next = add nuw i32 %loop.iv, 1
3046/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
3047/// <...>
3048/// br i1 %loop.ivcheck, label %end, label %loop
3049///
3050/// end:
3051/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3052/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3053/// <...>
3054/// \endcode
3055bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
3056 bool MadeChange = false;
3057
3058 Value *X, *BitMask, *BitPos, *XCurr;
3059 Instruction *XNext;
3060 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
3061 XNext)) {
3063 " shift-until-bittest idiom detection failed.\n");
3064 return MadeChange;
3065 }
3066 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
3067
3068 // Ok, it is the idiom we were looking for, we *could* transform this loop,
3069 // but is it profitable to transform?
3070
3071 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3072 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3073 assert(LoopPreheaderBB && "There is always a loop preheader.");
3074
3075 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3076 assert(SuccessorBB && "There is only a single successor.");
3077
3078 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3079 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
3080
3081 Intrinsic::ID IntrID = Intrinsic::ctlz;
3082 Type *Ty = X->getType();
3083 unsigned Bitwidth = Ty->getScalarSizeInBits();
3084
3085 TargetTransformInfo::TargetCostKind CostKind =
3086 TargetTransformInfo::TCK_SizeAndLatency;
3087 
3088 // The rewrite is considered unprofitable if and only if the intrinsic/shift
3089 // we'll use are not cheap. Note that we are okay with *just* making the
3090 // loop countable, even if nothing else changes.
3091 IntrinsicCostAttributes Attrs(
3092 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
3093 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3096 " Intrinsic is too costly, not beneficial\n");
3097 return MadeChange;
3098 }
3099 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
3101 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
3102 return MadeChange;
3103 }
3104
3105 // Ok, transform appears worthwhile.
3106 MadeChange = true;
3107
3108 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
3109 // BitMask may be computed from BitPos; freeze BitPos so we can increase
3110 // its use count.
3111 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
3112 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
3113 InsertPt = BitPosI->getInsertionPointAfterDef();
3114 else
3115 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
3116 if (!InsertPt)
3117 return false;
3118 FreezeInst *BitPosFrozen =
3119 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
3120 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
3121 return U.getUser() != BitPosFrozen;
3122 });
3123 BitPos = BitPosFrozen;
3124 }
3125
3126 // Step 1: Compute the loop trip count.
3127
3128 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
3129 BitPos->getName() + ".lowbitmask");
3130 Value *Mask =
3131 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
3132 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
3133 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
3134 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
3135 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
3136 Value *XMaskedNumActiveBits = Builder.CreateSub(
3137 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
3138 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
3139 /*HasNSW=*/Bitwidth != 2);
3140 Value *XMaskedLeadingOnePos =
3141 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
3142 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
3143 /*HasNSW=*/Bitwidth > 2);
3144
3145 Value *LoopBackedgeTakenCount = Builder.CreateSub(
3146 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
3147 /*HasNUW=*/true, /*HasNSW=*/true);
3148 // We know the loop's backedge-taken count, but what's the loop's trip count?
3149 // Note that NUW is always safe, while NSW only holds for bitwidths != 2.
3150 Value *LoopTripCount =
3151 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3152 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3153 /*HasNSW=*/Bitwidth != 2);
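// A worked instance (illustrative, not from the source): for i32 X = 1 and
// BitPos = 5, XMasked = 1, so NumLeadingZeros = 31, NumActiveBits = 1,
// LeadingOnePos = 0, BackedgeTakenCount = 5 - 0 = 5, and TripCount = 6: the
// body runs six times, shifting X left five times before bit 5 is set.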
3154
3155 // Step 2: Compute the recurrence's final value without a loop.
3156
3157 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
3158 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
3159 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
3160 NewX->takeName(XCurr);
3161 if (auto *I = dyn_cast<Instruction>(NewX))
3162 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3163
3164 Value *NewXNext;
3165 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
3166 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
3167 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
3168 // that isn't the case, we'll need to emit an alternative, safe IR.
3169 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
3170 PatternMatch::match(
3171 BitPos, PatternMatch::m_SpecificInt_ICMP(
3172 ICmpInst::Predicate::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
3173 Ty->getScalarSizeInBits() - 1))))
3174 NewXNext = Builder.CreateShl(X, LoopTripCount);
3175 else {
3176 // Otherwise, just additionally shift by one. It's the smallest solution;
3177 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
3178 // bitwidth(x) - 1) and select 0 instead.
3179 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
3180 }
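// Editorial example of the corner case (i8): X = 1, BitPos = 7 gives
// tripcount = 8, and `shl i8 1, 8` would be poison; shifting NewX (already
// `1 << 7`) by one more bit instead yields a well-defined 0.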
3181
3182 NewXNext->takeName(XNext);
3183 if (auto *I = dyn_cast<Instruction>(NewXNext))
3184 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3185
3186 // Step 3: Adjust the successor basic block to receive the computed
3187 // recurrence's final value instead of the recurrence itself.
3188
3189 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
3190 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
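// Editorial note: concretely, an exit-block PHI such as
//   %x.curr.res = phi i8 [ %x.curr, %loop ]
// now receives the closed-form NewX; uses inside the header itself are
// deliberately left alone, to be removed with the old recurrence later.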
3191
3192 // Step 4: Rewrite the loop into a countable form, with canonical IV.
3193
3194 // The new canonical induction variable.
3195 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3196 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3197
3198 // The induction itself.
3199 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
3200 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3201 auto *IVNext =
3202 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
3203 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3204
3205 // The loop trip count check.
3206 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
3207 CurLoop->getName() + ".ivcheck");
3208 SmallVector<uint32_t> BranchWeights;
3209 const bool HasBranchWeights =
3210 !ProfcheckDisableMetadataFixes &&
3211 extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
3212
3213 auto *BI = Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
3214 if (HasBranchWeights) {
3215 if (SuccessorBB == LoopHeaderBB->getTerminator()->getSuccessor(1))
3216 std::swap(BranchWeights[0], BranchWeights[1]);
3217 // We're not changing the loop profile, so we can reuse the original loop's
3218 // profile.
3219 setBranchWeights(*BI, BranchWeights,
3220 /*IsExpected=*/false);
3221 }
3222
3223 LoopHeaderBB->getTerminator()->eraseFromParent();
3224
3225 // Populate the IV PHI.
3226 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3227 IV->addIncoming(IVNext, LoopHeaderBB);
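// Editorial sketch of the rewritten header at this point (names approximate):
//   %loop.iv = phi i8 [ 0, %preheader ], [ %loop.iv.next, %loop ]
//   %loop.iv.next = add nuw i8 %loop.iv, 1
//   %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
//   br i1 %loop.ivcheck, label %end, label %loop
// SCEV can now compute the trip count of this loop directly.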
3228
3229 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
3230 // loop. The loop would otherwise not be deleted even if it becomes empty.
3231
3232 SE->forgetLoop(CurLoop);
3233
3234 // Other passes will take care of actually deleting the loop if possible.
3235
3236 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
3237
3238 ++NumShiftUntilBitTest;
3239 return MadeChange;
3240}
3241
3242/// Return true if the idiom is detected in the loop.
3243///
3244/// The core idiom we are trying to detect is:
3245/// \code
3246/// entry:
3247/// <...>
3248/// %start = <...>
3249/// %extraoffset = <...>
3250/// <...>
3251 /// br label %loop
3252///
3253/// loop:
3254 /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
3255/// %nbits = add nsw i8 %iv, %extraoffset
3256/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3257/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3258/// %iv.next = add i8 %iv, 1
3259/// <...>
3260/// br i1 %val.shifted.iszero, label %end, label %loop
3261///
3262/// end:
3263/// %iv.res = phi i8 [ %iv, %loop ] <...>
3264/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3265/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3266/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3267/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3268/// <...>
3269/// \endcode
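///
/// As a rough source-level illustration (editorial; the pass matches IR,
/// not C), a loop of the following shape lowers to the idiom above:
/// \code
/// int iv = start;
/// while ((val >> (iv + extraoffset)) != 0) // or <<, or a signed >>
///   ++iv;
/// \endcode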
3270 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
3271 Instruction *&ValShiftedIsZero,
3272 Intrinsic::ID &IntrinID, Instruction *&IV,
3273 Value *&Start, Value *&Val,
3274 const SCEV *&ExtraOffsetExpr,
3275 bool &InvertedCond) {
3277 " Performing shift-until-zero idiom detection.\n");
3278
3279 // Give up if the loop has multiple blocks or multiple backedges.
3280 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
3281 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
3282 return false;
3283 }
3284
3285 Instruction *ValShifted, *NBits, *IVNext;
3286 Value *ExtraOffset;
3287
3288 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3289 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3290 assert(LoopPreheaderBB && "There is always a loop preheader.");
3291
3292 using namespace PatternMatch;
3293
3294 // Step 1: Check if the loop backedge and its condition are in desirable form.
3295
3296 CmpPredicate Pred;
3297 BasicBlock *TrueBB, *FalseBB;
3298 if (!match(LoopHeaderBB->getTerminator(),
3299 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
3300 m_BasicBlock(FalseBB))) ||
3301 !match(ValShiftedIsZero,
3302 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
3303 !ICmpInst::isEquality(Pred)) {
3304 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
3305 return false;
3306 }
3307
3308 // Step 2: Check if the comparison's operand is in desirable form.
3309 // FIXME: Val could be a one-input PHI node, which we should look past.
3310 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
3311 m_Instruction(NBits)))) {
3312 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
3313 return false;
3314 }
3315 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
3316 : Intrinsic::ctlz;
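// Editorial rationale: `shl` zeroes the value once the shift amount reaches
// bitwidth - cttz(val) (the lowest set bit falls off the top), while
// `lshr`/`ashr` zero it at bitwidth - ctlz(val); hence cttz for left shifts
// and ctlz for right shifts.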
3317
3318 // Step 3: Check if the shift amount is in desirable form.
3319
3320 if (match(NBits, m_c_Add(m_Instruction(IV),
3321 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3322 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
3323 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
3324 else if (match(NBits,
3325 m_Sub(m_Instruction(IV),
3326 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3327 NBits->hasNoSignedWrap())
3328 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
3329 else {
3330 IV = NBits;
3331 ExtraOffsetExpr = SE->getZero(NBits->getType());
3332 }
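// Editorial summary (illustrative) of the three accepted forms of %nbits:
//   %nbits = add nsw i8 %iv, %extraoffset  -> ExtraOffsetExpr = -%extraoffset
//   %nbits = sub nsw i8 %iv, %extraoffset  -> ExtraOffsetExpr = %extraoffset
//   %nbits = %iv                           -> ExtraOffsetExpr = 0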
3333
3334 // Step 4: Check if the recurrence is in desirable form.
3335 auto *IVPN = dyn_cast<PHINode>(IV);
3336 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
3337 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
3338 return false;
3339 }
3340
3341 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
3342 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
3343
3344 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
3345 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
3346 return false;
3347 }
3348
3349 // Step 5: Check if the backedge's destinations are in desirable form.
3350
3351 assert(ICmpInst::isEquality(Pred) &&
3352 "Should only get equality predicates here.");
3353
3354 // cmp-br is commutative, so canonicalize to a single variant.
3355 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
3356 if (InvertedCond) {
3357 Pred = ICmpInst::getInversePredicate(Pred);
3358 std::swap(TrueBB, FalseBB);
3359 }
3360
3361 // We expect to exit the loop when the comparison yields true,
3362 // so when it yields false we should branch back to the loop header.
3363 if (FalseBB != LoopHeaderBB) {
3364 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
3365 return false;
3366 }
3367
3368 // The new, countable loop will certainly only run a known number of
3369 // iterations; it won't be infinite. But the old loop might be infinite
3370 // under certain conditions. For logical shifts, the value will become zero
3371 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
3372 // right-shift, if the sign bit is set, the value will never become zero,
3373 // and the loop may never finish.
3374 if (ValShifted->getOpcode() == Instruction::AShr &&
3375 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
3376 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Cannot prove the loop is finite.\n");
3377 return false;
3378 }
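// Editorial example (i8): for %val = -1, `ashr i8 -1, %nbits` stays -1 for
// every in-range %nbits, so the original loop never terminates; giving the
// rewritten loop a finite trip count would be unsound in that case.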
3379
3380 // Okay, idiom checks out.
3381 return true;
3382}
3383
3384/// Look for the following loop:
3385/// \code
3386/// entry:
3387/// <...>
3388/// %start = <...>
3389/// %extraoffset = <...>
3390/// <...>
3391/// br label %loop
3392///
3393/// loop:
3394/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
3395/// %nbits = add nsw i8 %iv, %extraoffset
3396/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3397/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3398/// %iv.next = add i8 %iv, 1
3399/// <...>
3400/// br i1 %val.shifted.iszero, label %end, label %loop
3401///
3402/// end:
3403/// %iv.res = phi i8 [ %iv, %loop ] <...>
3404/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3405/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3406/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3407/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3408/// <...>
3409/// \endcode
3410///
3411/// And transform it into:
3412/// \code
3413/// entry:
3414/// <...>
3415/// %start = <...>
3416/// %extraoffset = <...>
3417/// <...>
3418/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
3419/// %val.numactivebits = sub i8 8, %val.numleadingzeros
3420/// %extraoffset.neg = sub i8 0, %extraoffset
3421/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
3422/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
3423/// %loop.tripcount = sub i8 %iv.final, %start
3424/// br label %loop
3425///
3426/// loop:
3427/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
3428/// %loop.iv.next = add i8 %loop.iv, 1
3429/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
3430/// %iv = add i8 %loop.iv, %start
3431/// <...>
3432/// br i1 %loop.ivcheck, label %end, label %loop
3433///
3434/// end:
3435/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
3436/// <...>
3437/// \endcode
3438bool LoopIdiomRecognize::recognizeShiftUntilZero() {
3439 bool MadeChange = false;
3440
3441 Instruction *ValShiftedIsZero;
3442 Intrinsic::ID IntrID;
3443 Instruction *IV;
3444 Value *Start, *Val;
3445 const SCEV *ExtraOffsetExpr;
3446 bool InvertedCond;
3447 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
3448 Start, Val, ExtraOffsetExpr, InvertedCond)) {
3450 " shift-until-zero idiom detection failed.\n");
3451 return MadeChange;
3452 }
3453 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
3454
3455 // Ok, it is the idiom we were looking for; we *could* transform this loop,
3456 // but is it profitable to transform?
3457
3458 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3459 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3460 assert(LoopPreheaderBB && "There is always a loop preheader.");
3461
3462 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3463 assert(SuccessorBB && "There is only a single successor.");
3464
3465 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3466 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
3467
3468 Type *Ty = Val->getType();
3469 unsigned Bitwidth = Ty->getScalarSizeInBits();
3470
3471 TargetTransformInfo::TargetCostKind CostKind =
3472 TargetTransformInfo::TCK_SizeAndLatency;
3473
3474 // The rewrite is considered to be unprofitable if and only if the
3475 // intrinsic we'll use is not cheap. Note that we are okay with *just*
3476 // making the loop countable, even if nothing else changes.
3477 IntrinsicCostAttributes Attrs(
3478 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
3479 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3482 " Intrinsic is too costly, not beneficial\n");
3483 return MadeChange;
3484 }
3485
3486 // Ok, transform appears worthwhile.
3487 MadeChange = true;
3488
3489 bool OffsetIsZero = ExtraOffsetExpr->isZero();
3490
3491 // Step 1: Compute the loop's final IV value / trip count.
3492
3493 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3494 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
3495 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
3496 Value *ValNumActiveBits = Builder.CreateSub(
3497 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
3498 Val->getName() + ".numactivebits", /*HasNUW=*/true,
3499 /*HasNSW=*/Bitwidth != 2);
3500
3501 SCEVExpander Expander(*SE, *DL, "loop-idiom");
3502 Expander.setInsertPoint(&*Builder.GetInsertPoint());
3503 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
3504
3505 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3506 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
3507 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
3508 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
3509 {ValNumActiveBitsOffset, Start},
3510 /*FMFSource=*/nullptr, "iv.final");
3511
3512 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
3513 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
3514 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
3515 // FIXME: or when the offset was `add nuw`
3516
3517 // We know the loop's backedge-taken count, but what is its trip count?
3518 Value *LoopTripCount =
3519 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3520 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3521 /*HasNSW=*/Bitwidth != 2);
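// Editorial worked example (i8, lshr, values chosen for illustration): val =
// 0b0010'0000, start = 0, extraoffset = 0: ctlz(val) = 2, numactivebits = 6,
// iv.final = smax(6, 0) = 6, so backedgetakencount = 6 and tripcount = 7 --
// the original loop tests nbits = 0..6 and exits once `lshr i8 32, 6` == 0.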
3522
3523 // Step 2: Adjust the successor basic block to receive the original
3524 // induction variable's final value instead of the IV itself.
3525
3526 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
3527
3528 // Step 3: Rewrite the loop into a countable form, with canonical IV.
3529
3530 // The new canonical induction variable.
3531 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3532 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3533
3534 // The induction itself.
3535 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
3536 auto *CIVNext =
3537 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
3538 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3539
3540 // The loop trip count check.
3541 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
3542 CurLoop->getName() + ".ivcheck");
3543 auto *NewIVCheck = CIVCheck;
3544 if (InvertedCond) {
3545 NewIVCheck = Builder.CreateNot(CIVCheck);
3546 NewIVCheck->takeName(ValShiftedIsZero);
3547 }
3548
3549 // The original IV, but rebased to be an offset to the CIV.
3550 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
3551 /*HasNSW=*/true); // FIXME: what about NUW?
3552 IVDePHId->takeName(IV);
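// Editorial note: the original IV is now just an offset of the canonical
// one, `%iv = add i8 %loop.iv, %start`, so its PHI can be replaced and
// erased in the cleanup below.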
3553
3554 // The loop terminator.
3555 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3556 SmallVector<uint32_t> BranchWeights;
3557 const bool HasBranchWeights =
3558 !ProfcheckDisableMetadataFixes &&
3559 extractBranchWeights(*LoopHeaderBB->getTerminator(), BranchWeights);
3560
3561 auto *BI = Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
3562 if (HasBranchWeights) {
3563 if (InvertedCond)
3564 std::swap(BranchWeights[0], BranchWeights[1]);
3565 // We're not changing the loop profile, so we can reuse the original loop's
3566 // profile.
3567 setBranchWeights(*BI, BranchWeights, /*IsExpected=*/false);
3568 }
3569 LoopHeaderBB->getTerminator()->eraseFromParent();
3570
3571 // Populate the IV PHI.
3572 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3573 CIV->addIncoming(CIVNext, LoopHeaderBB);
3574
3575 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3576 // loop. The loop would otherwise not be deleted even if it becomes empty.
3577
3578 SE->forgetLoop(CurLoop);
3579
3580 // Step 5: Try to cleanup the loop's body somewhat.
3581 IV->replaceAllUsesWith(IVDePHId);
3582 IV->eraseFromParent();
3583
3584 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
3585 ValShiftedIsZero->eraseFromParent();
3586
3587 // Other passes will take care of actually deleting the loop if possible.
3588
3589 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3590
3591 ++NumShiftUntilZero;
3592 return MadeChange;
3593}