//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
//
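// As an illustrative sketch (not a transcript of the pass's output): a loop
// like
//
//   for (unsigned i = 0; i != n; ++i)
//     p[i] = 0;
//
// is recognized as a memset idiom and collapsed to a single call:
//
//   memset(p, 0, n);
//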
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize: memcmp, etc.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;
using namespace SCEVPatternMatch;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPStrlen("disable-loop-idiom-strlen",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to strlen."),
                      cl::location(DisableLIRP::Strlen), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool, true>
    EnableLIRPWcslen("enable-loop-idiom-wcslen",
                     cl::desc("Proceed with loop idiom recognize pass, "
                              "enable conversion of loop(s) to wcslen."),
                     cl::location(DisableLIRP::Wcslen), cl::init(false),
                     cl::ReallyHidden);

static cl::opt<bool, true>
    DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
                             cl::desc("Proceed with loop idiom recognize pass, "
                                      "but do not optimize CRC loops."),
                             cl::location(DisableLIRP::HashRecognize),
                             cl::init(false), cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceMemsetPatternIntrinsic(
    "loop-idiom-force-memset-pattern-intrinsic",
    cl::desc("Use memset.pattern intrinsic whenever possible"), cl::init(false),
    cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);
  bool optimizeCRCLoop(const PolynomialInfo &Info);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
                               bool ZeroCheck, size_t CanonicalSize);
  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                             Instruction *DefX, PHINode *CntPhi,
                             Instruction *CntInst);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  bool recognizeShiftUntilLessThan();
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop,
                                bool InsertSub = false);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();
  bool recognizeAndInsertStrLen();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  std::optional<PolynomialInfo> HR;

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it; just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy" || Name == "strlen" ||
      Name == "wcslen")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  // TODO: Unconditionally enable use of the memset pattern intrinsic (or at
  // least, opt-in via target hook) once we are confident it will never result
  // in worse codegen than without. For now, use it only when the target
  // supports the memset_pattern16 libcall (or when this is overridden by the
  // command line option).
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
      HasMemcpy || !DisableLIRP::HashRecognize)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (BECount->isZero())
    return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }

  // Optimize a CRC loop if HashRecognize found one, provided we're not
  // optimizing for size.
  if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
    if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
      optimizeCRCLoop(*Res);

  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset.pattern intrinsic, return the Constant that should
/// be passed in. Otherwise, return null.
///
/// TODO: this function could allow more constants than it does today (e.g.
/// those over 16 bytes), now that it has transitioned to being used for the
/// memset.pattern intrinsic rather than directly for the memset_pattern16
/// libcall.
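///
/// Illustrative example (not from the original source): a strided store of the
/// i32 constant 0x01020304 has no single splat byte, so it cannot become a
/// plain memset; this function returns the i32 constant so the loop can
/// instead become a memset.pattern whose 4-byte pattern is repeated over the
/// region.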
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // For now, don't handle types that aren't ints, floats, or pointers.
  Type *CTy = C->getType();
  if (!CTy->isIntOrPtrTy() && !CTy->isFloatingPointTy())
    return nullptr;

  return C;
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which store
  // integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(StorePtr);
  const SCEVConstant *Stride;
  if (!match(StoreEv, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Stride),
                                          m_SpecificLoop(CurLoop))))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
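  // Illustrative example: i32 -1 is 0xFFFFFFFF, all four bytes 0xFF, so it
  // splats to the single byte 0xFF; i32 0x01020304 has four distinct bytes
  // and no splat byte at all.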
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic stores are not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) &&
      !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    APInt StrideAP = Stride->getAPInt();
    if (StoreSize != StrideAP && StoreSize != -StrideAP)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEV *LoadEv = SE->getSCEV(LI->getPointerOperand());

    // The store and load must share the same stride.
    if (!match(LoadEv, m_scev_AffineAddRec(m_SCEV(), m_scev_Specific(Stride),
                                           m_SpecificLoop(CurLoop))))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if these stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
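  // Illustrative example: a hand-unrolled loop such as
  //
  //   for (i) { p[2*i] = 0; p[2*i+1] = 0; }
  //
  // has two i32 stores per iteration whose stride (8 bytes) is twice their
  // individual size (4 bytes); only the pair together covers every byte,
  // which the chains built below detect.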
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the stores
    // in the order i+1 to e, then i-1 down to 0, because pairing with the
    // immediately succeeding or preceding candidate usually creates the best
    // chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert_range(AdjacentStores);
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !MCI->isForceInlined()) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEV *StoreEv = SE->getSCEV(Dest);
  const SCEV *LoadEv = SE->getSCEV(Source);
  const APInt *StoreStrideValue, *LoadStrideValue;
  if (!match(StoreEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StoreStrideValue),
                                 m_SpecificLoop(CurLoop))) ||
      !match(LoadEv,
             m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(LoadStrideValue),
                                 m_SpecificLoop(CurLoop))))
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Huge stride value - give up.
  if (StoreStrideValue->getBitWidth() > 64 ||
      LoadStrideValue->getBitWidth() > 64)
    return false;

  if (SizeInBytes != *StoreStrideValue && SizeInBytes != -*StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI,
      cast<SCEVAddRecExpr>(StoreEv), cast<SCEVAddRecExpr>(LoadEv), BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEV *Ev = SE->getSCEV(Pointer);
  const SCEV *PointerStrideSCEV;
  if (!match(Ev, m_scev_AffineAddRec(m_SCEV(), m_SCEV(PointerStrideSCEV),
                                     m_SpecificLoop(CurLoop)))) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const APInt *Stride;
    if (!match(PointerStrideSCEV, m_scev_APInt(Stride)))
      return false;

    if (SizeInBytes != *Stride && SizeInBytes != -*Stride)
      return false;

    IsNegStride = SizeInBytes == -*Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
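    //
    // Illustrative example: in
    //
    //   for (i = 0; i < n; ++i)
    //     memset(p + i * m, 0, m);
    //
    // the pointer stride and the memset length are the same non-constant
    // value m, so the loop can still be flattened into one memset of m * n
    // bytes.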
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs,
                                 cast<SCEVAddRecExpr>(Ev), BECount, IsNegStride,
                                 /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access
  // is strided positively through memory, we say that the modified location
  // starts at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const APInt *BECst, *ConstSize;
  if (match(BECount, m_scev_APInt(BECst)) &&
      match(StoreSizeSCEV, m_scev_APInt(ConstSize))) {
    std::optional<uint64_t> BEInt = BECst->tryZExtValue();
    std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
    // FIXME: Should this check for overflow?
    if (BEInt && SizeInt)
      AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
  }
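
  // Illustrative example: with a backedge-taken count of 99 and a 4-byte
  // store size, the checked region shrinks from "everything after Ptr" to
  // exactly (99 + 1) * 4 = 400 bytes.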
991
992 // TODO: For this to be really effective, we have to dive into the pointer
993 // operand in the store. Store to &A[i] of 100 will always return may alias
994 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
995 // which will then no-alias a store to &A[100].
996 MemoryLocation StoreLoc(Ptr, AccessSize);
997
998 for (BasicBlock *B : L->blocks())
999 for (Instruction &I : *B)
1000 if (!IgnoredInsts.contains(&I) &&
1001 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
1002 return true;
1003 return false;
1004}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index
  return SE->getMinusSCEV(Start, Index);
}
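
// Illustrative example: for a negative-stride loop like
//
//   for (i = n - 1; i >= 0; --i)
//     p[i] = 0;
//
// the store's addrec starts at &p[n-1], so with BECount = n - 1 and a 4-byte
// store size the base pointer is recomputed as &p[n-1] - (n-1)*4 = &p[0].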

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
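
// Illustrative example: a loop with backedge-taken count n - 1 that stores 4
// bytes per iteration yields NumBytes = ((n - 1) + 1) * 4 = 4 * n.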

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;
  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  // MemsetArg is the number of bytes for the memset libcall, and the number
  // of pattern repetitions if the memset.pattern intrinsic is being used.
  Value *MemsetArg;
  std::optional<int64_t> BytesWritten;

  if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
    const SCEV *TripCountS =
        SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop);
    if (!Expander.isSafeToExpand(TripCountS))
      return Changed;
    const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
    if (!ConstStoreSize)
      return Changed;
    Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy,
                                              Preheader->getTerminator());
    uint64_t PatternRepsPerTrip =
        (ConstStoreSize->getValue()->getZExtValue() * 8) /
        DL->getTypeSizeInBits(PatternValue->getType());
    // If ConstStoreSize is not equal to the width of PatternValue, then
    // MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else
    // MemsetArg is just TripCount.
    MemsetArg =
        PatternRepsPerTrip == 1
            ? TripCount
            : Builder.CreateMul(TripCount,
                                Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
                                                PatternRepsPerTrip));
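    // Illustrative example: if each iteration stores 8 bytes but the pattern
    // value is a 4-byte i32, PatternRepsPerTrip is 2 and the repetition count
    // passed to the intrinsic is 2 * TripCount.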
    if (auto *CI = dyn_cast<ConstantInt>(TripCount))
      BytesWritten =
          CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();

  } else {
    const SCEV *NumBytesS =
        getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

    // TODO: ideally we should still be able to generate memset if SCEV expander
    // is taught to generate the dependencies at the latest point.
    if (!Expander.isSafeToExpand(NumBytesS))
      return Changed;
    MemsetArg =
        Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
    if (auto *CI = dyn_cast<ConstantInt>(MemsetArg))
      BytesWritten = CI->getZExtValue();
  }
  assert(MemsetArg && "MemsetArg should have been set");

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (BytesWritten)
    AATags = AATags.extendTo(BytesWritten.value());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr, SplatValue, MemsetArg,
                                   MaybeAlign(StoreAlignment),
                                   /*isVolatile=*/false, AATags);
  } else if (ForceMemsetPatternIntrinsic ||
             isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");

    NewCall = Builder.CreateIntrinsic(
        Intrinsic::experimental_memset_pattern,
        {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
        {BasePtr, PatternValue, MemsetArg,
         ConstantInt::getFalse(M->getContext())});
    if (StoreAlignment)
      cast<MemSetPatternInst>(NewCall)->setDestAlignment(*StoreAlignment);
    NewCall->setAAMetadata(AATags);
  } else {
    // Neither a memset, nor memset_pattern16.
    return Changed;
  }

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }
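
  // Illustrative example: with a positive stride, copying p[i+4] into p[i]
  // (LoadOff ahead of StoreOff by a full element) reads each element before
  // the stores sweep past it, so the checks above allow a memmove; copying
  // p[i] into p[i+4] would re-read freshly overwritten bytes and is rejected.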

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (auto *MCI = dyn_cast<MemCpyInst>(TheStore); MCI && MCI->isForceInlined())
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; currently expect constant store size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For memmove case it's not enough to guarantee that loop doesn't access
    // TheStore and TheLoad. Additionally we need to make sure that TheStore is
    // the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
                                              Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool IsAtomic = TheStore->isAtomic() || TheLoad->isAtomic();
  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;

  if (IsAtomic) {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;

    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;
  }

  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  // If the load or store are atomic, then they must necessarily be unordered
  // by previous checks.
  if (!IsAtomic) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
                                      LoadAlign, NumBytes,
                                      /*isVolatile=*/false, AATags);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags);
  } else {
    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << "  " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
  // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
  // carry-less multiplication instructions, which is more efficient than our
  // Sarwate table-lookup optimization. Hence, until we're able to emit
  // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
  // disable the optimization for Hexagon.
  Module &M = *CurLoop->getHeader()->getModule();
  Triple TT(M.getTargetTriple());
  if (TT.getArch() == Triple::hexagon)
    return false;

  // First, create a new GlobalVariable corresponding to the
  // Sarwate-lookup-table.
  Type *CRCTy = Info.LHS->getType();
  unsigned CRCBW = CRCTy->getIntegerBitWidth();
  std::array<Constant *, 256> CRCConstants;
  transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
            CRCConstants.begin(),
            [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
  Constant *ConstArray =
      ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
  GlobalVariable *GV =
      new GlobalVariable(M, ConstArray->getType(), true,
                         GlobalValue::PrivateLinkage, ConstArray, ".crctable");
1561
1562 PHINode *IV = CurLoop->getCanonicalInductionVariable();
1563 SmallVector<PHINode *, 2> Cleanup;
1564
1565 // Next, mark all PHIs for removal except IV.
1566 {
1567 for (PHINode &PN : CurLoop->getHeader()->phis()) {
1568 if (&PN == IV)
1569 continue;
1570 PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
1571 Cleanup.push_back(&PN);
1572 }
1573 }
1574
1575 // Next, fix up the trip count.
1576 {
1577 unsigned NewBTC = (Info.TripCount / 8) - 1;
1578 BasicBlock *LoopBlk = CurLoop->getLoopLatch();
1579 BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
1580 CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
1581 ? ICmpInst::ICMP_NE
1582 : ICmpInst::ICMP_EQ;
1583 Instruction *ExitCond = CurLoop->getLatchCmpInst();
1584 Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
1585 IRBuilder<> Builder(ExitCond);
1586 Value *NewExitCond =
1587 Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
1588 ExitCond->replaceAllUsesWith(NewExitCond);
1589 deleteDeadInstruction(ExitCond);
1590 }
1591
1592 // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
1593 // uses of ComputedValue.
1594 //
1595 // Little-endian:
1596 // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
1597 // Big-Endian:
1598 // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
1599 {
1600 auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
1601 return Builder.CreateZExtOrTrunc(
1602 Op, IntegerType::getInt8Ty(Op->getContext()), Name);
1603 };
1604 auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
1605 const Twine &Name) {
1606 Type *OpTy = Op->getType();
1607
1608 // When the bitwidth of the CRC differs from the Op's bitwidth, we need to
1609 // use the CRC's bitwidth as the reference for shifting right.
1610 return LoByte(Builder,
1611 CRCBW > 8 ? Builder.CreateLShr(
1612 Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
1613 : Op,
1614 Name + ".lo.byte");
1615 };
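    // E.g., for CRC-32 (CRCBW = 32), HiIdx extracts (Op >> 24) truncated to
    // i8, i.e. the top byte of the CRC; for CRC-8 it is just Op's low byte.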
1616
1617 IRBuilder<> Builder(CurLoop->getHeader(),
1618 CurLoop->getHeader()->getFirstNonPHIIt());
1619
1620 // Create the CRC PHI, and initialize its incoming value to the initial
1621 // value of CRC.
1622 PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
1623 CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
1624
1625 // CRC is now an evolving variable, initialized to the PHI.
1626 Value *CRC = CRCPhi;
1627
1628 // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
1629 // of LHSAux), if LHSAux is non-null.
1630 Value *Indexer = CRC;
1631 if (Value *Data = Info.LHSAux) {
1632 Type *DataTy = Data->getType();
1633
1634 // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
1635 // shift right by that amount, and take the lo-byte (in the little-endian
1636 // case), or shift left by that amount, and take the hi-idx (in the
1637 // big-endian case).
1638 Value *IVBits = Builder.CreateZExtOrTrunc(
1639 Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
1640 Value *DataIndexer =
1641 Info.ByteOrderSwapped
1642 ? Builder.CreateShl(Data, IVBits, "data.indexer")
1643 : Builder.CreateLShr(Data, IVBits, "data.indexer");
1644 Indexer = Builder.CreateXor(
1645 DataIndexer,
1646 Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
1647 "crc.data.indexer");
1648 }
1649
1650 Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
1651 : LoByte(Builder, Indexer, "indexer.lo");
1652
1653 // Always index into a GEP using the index type.
1654 Indexer = Builder.CreateZExt(
1655 Indexer, SE->getDataLayout().getIndexType(GV->getType()),
1656 "indexer.ext");
1657
1658 // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
1659 Value *CRCTableGEP =
1660 Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
1661 Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
1662
1663 // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
1664 // CRC-8.
1665 Value *CRCNext = CRCTableLd;
1666 if (CRCBW > 8) {
1667 Value *CRCShift = Info.ByteOrderSwapped
1668 ? Builder.CreateShl(CRC, 8, "crc.be.shift")
1669 : Builder.CreateLShr(CRC, 8, "crc.le.shift");
1670 CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
1671 }
1672
1673 // Connect the back-edge for the loop, and RAUW the ComputedValue.
1674 CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
1675 Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
1676 CurLoop->getLoopLatch());
1677 }
1678
1679 // Cleanup.
1680 {
1681 for (PHINode *PN : Cleanup)
1682 RecursivelyDeleteDeadPHINode(PN);
1683 SE->forgetLoop(CurLoop);
1684 }
1685 return true;
1686}
1687
1688bool LoopIdiomRecognize::runOnNoncountableLoop() {
1689 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1690 << CurLoop->getHeader()->getParent()->getName()
1691 << "] Noncountable Loop %"
1692 << CurLoop->getHeader()->getName() << "\n");
1693
1694 return recognizePopcount() || recognizeAndInsertFFS() ||
1695 recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1696 recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
1697}
1698
1699/// Check if the given conditional branch is based on the comparison between
1700/// a variable and zero: if the variable is non-zero (or zero, when JmpOnZero
1701/// is true), the control yields to the loop entry. If the branch matches the
1702/// behavior, the variable involved in the comparison is returned. This function
1703/// will be called to see if the precondition and postcondition of the loop are
1704/// in desirable form.
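/// For example, with JmpOnZero == false, the following is matched and %x is
/// returned (illustrative IR):
/// \code
///   %cmp = icmp ne i32 %x, 0
///   br i1 %cmp, label %loop.entry, label %exit
/// \endcode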
1705static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1706 bool JmpOnZero = false) {
1707 if (!BI || !BI->isConditional())
1708 return nullptr;
1709
1710 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1711 if (!Cond)
1712 return nullptr;
1713
1714 auto *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1715 if (!CmpZero || !CmpZero->isZero())
1716 return nullptr;
1717
1718 BasicBlock *TrueSucc = BI->getSuccessor(0);
1719 BasicBlock *FalseSucc = BI->getSuccessor(1);
1720 if (JmpOnZero)
1721 std::swap(TrueSucc, FalseSucc);
1722
1723 ICmpInst::Predicate Pred = Cond->getPredicate();
1724 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1725 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1726 return Cond->getOperand(0);
1727
1728 return nullptr;
1729}
1730
1731namespace {
1732
1733class StrlenVerifier {
1734public:
1735 explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
1736 const TargetLibraryInfo *TLI)
1737 : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1738
1739 bool isValidStrlenIdiom() {
1740 // Give up if the loop has multiple blocks, multiple backedges, or
1741 // multiple exit blocks
1742 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
1743 !CurLoop->getUniqueExitBlock())
1744 return false;
1745
1746 // It should have a preheader and a branch instruction.
1747 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1748 if (!Preheader)
1749 return false;
1750
1751 BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
1752 if (!EntryBI)
1753 return false;
1754
1755 // The loop exit must be conditioned on an icmp against 0, the null terminator.
1756 // The icmp operand has to be a load on some SSA reg that increments
1757 // by 1 in the loop.
1758 BasicBlock *LoopBody = *CurLoop->block_begin();
1759
1760 // Skip if the body is too big as it most likely is not a strlen idiom.
1761 if (!LoopBody || LoopBody->size() >= 15)
1762 return false;
1763
1764 BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
1765 Value *LoopCond = matchCondition(LoopTerm, LoopBody);
1766 if (!LoopCond)
1767 return false;
1768
1769 LoadInst *LoopLoad = dyn_cast<LoadInst>(LoopCond);
1770 if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
1771 return false;
1772
1773 OperandType = LoopLoad->getType();
1774 if (!OperandType || !OperandType->isIntegerTy())
1775 return false;
1776
1777 // See if the pointer expression is an AddRec with constant step a of form
1778 // ({n,+,a}) where a is the width of the char type.
1779 Value *IncPtr = LoopLoad->getPointerOperand();
1780 const SCEV *LoadEv = SE->getSCEV(IncPtr);
1781 const APInt *Step;
1782 if (!match(LoadEv,
1783 m_scev_AffineAddRec(m_SCEV(LoadBaseEv), m_scev_APInt(Step))))
1784 return false;
1785
1786 LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1787
1788 unsigned StepSize = Step->getZExtValue();
1789
1790 // Verify that StepSize is consistent with platform char width.
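    // E.g., an i8 load with step 1 maps to strlen, while an i32 load with
    // step 4 maps to wcslen on targets where wchar_t is 4 bytes wide.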
1791 OpWidth = OperandType->getIntegerBitWidth();
1792 unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
1793 if (OpWidth != StepSize * 8)
1794 return false;
1795 if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
1796 return false;
1797 if (OpWidth >= 16)
1798 if (OpWidth != WcharSize * 8)
1799 return false;
1800
1801 // Scan every instruction in the loop to ensure there are no side effects.
1802 for (Instruction &I : *LoopBody)
1803 if (I.mayHaveSideEffects())
1804 return false;
1805
1806 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1807 if (!LoopExitBB)
1808 return false;
1809
1810 for (PHINode &PN : LoopExitBB->phis()) {
1811 if (!SE->isSCEVable(PN.getType()))
1812 return false;
1813
1814 const SCEV *Ev = SE->getSCEV(&PN);
1815 if (!Ev)
1816 return false;
1817
1818 LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1819
1820 // Since we verified that the loop trip count will be a valid strlen
1821 // idiom, we can expand all LCSSA phis with {n,+,1} as (n + strlen) and
1822 // use SCEVExpander to materialize the loop output.
1823 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1824 if (!AddRecEv || !AddRecEv->isAffine())
1825 return false;
1826
1827 // We only want AddRecExprs with a constant recurrence step. This is
1828 // good enough for all the idioms we want to recognize. Later we expand
1829 // and materialize the recurrence as {base,+,a} -> (base + a * strlen)
1830 if (!isa<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
1831 return false;
1832 }
1833
1834 return true;
1835 }
1836
1837public:
1838 const Loop *CurLoop;
1839 ScalarEvolution *SE;
1840 const TargetLibraryInfo *TLI;
1841
1842 unsigned OpWidth;
1843 ConstantInt *StepSizeCI;
1844 const SCEV *LoadBaseEv;
1845 Type *OperandType;
1846};
1847
1848} // namespace
1849
1850/// The Strlen Idiom we are trying to detect has the following structure
1851///
1852/// preheader:
1853/// ...
1854/// br label %body, ...
1855///
1856/// body:
1857/// ... ; %0 is incremented by a gep
1858/// %1 = load i8, ptr %0, align 1
1859/// %2 = icmp eq i8 %1, 0
1860/// br i1 %2, label %exit, label %body
1861///
1862/// exit:
1863/// %lcssa = phi [%0, %body], ...
1864///
1865/// We expect the strlen idiom to have a load of a character type that
1866/// is compared against '\0'; such a load's pointer operand must have a SCEV
1867/// expression of the form {%str,+,c} where c is a ConstantInt of the
1868/// appropriate character width for the idiom, and %str is the base of the
1869/// string. In addition, all LCSSA phis must have the form {...,+,n} where n is a constant.
1870///
1871/// When transforming the output of the strlen idiom, the LCSSA phis are
1872/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
1873/// and all subsequent uses are replaced. For example,
1874///
1875/// \code{.c}
1876/// const char* base = str;
1877/// while (*str != '\0')
1878/// ++str;
1879/// size_t result = str - base;
1880/// \endcode
1881///
1882/// will be transformed as follows: The idiom will be replaced by a strlen
1883/// computation to compute the address of the null terminator of the string.
1884///
1885/// \code{.c}
1886/// const char* base = str;
1887/// const char* end = base + strlen(str);
1888/// size_t result = end - base;
1889/// \endcode
1890///
1891/// In the case we index by an induction variable, as long as the induction
1892/// variable has a constant int increment, we can replace all such indvars
1893/// with the closed form computation of strlen
1894///
1895/// \code{.c}
1896/// size_t i = 0;
1897/// while (str[i] != '\0')
1898/// ++i;
1899/// size_t result = i;
1900/// \endcode
1901///
1902/// Will be replaced by
1903///
1904/// \code{.c}
1905/// size_t i = 0 + strlen(str);
1906/// size_t result = i;
1907/// \endcode
1908///
1909bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1910 if (DisableLIRP::All)
1911 return false;
1912
1913 StrlenVerifier Verifier(CurLoop, SE, TLI);
1914
1915 if (!Verifier.isValidStrlenIdiom())
1916 return false;
1917
1918 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1919 BasicBlock *LoopBody = *CurLoop->block_begin();
1920 BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1921 BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
1922 assert(Preheader && LoopBody && LoopExitBB && LoopTerm &&
1923 "Should be verified to be valid by StrlenVerifier");
1924
1925 if (Verifier.OpWidth == 8) {
1926 if (DisableLIRP::Strlen)
1927 return false;
1928 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
1929 return false;
1930 } else {
1931 if (DisableLIRP::Wcslen)
1932 return false;
1933 if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
1934 return false;
1935 }
1936
1937 IRBuilder<> Builder(Preheader->getTerminator());
1938 Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1939 SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(),
1940 "strlen_idiom");
1941 Value *MaterialzedBase = Expander.expandCodeFor(
1942 Verifier.LoadBaseEv, Verifier.LoadBaseEv->getType(),
1943 Builder.GetInsertPoint());
1944
1945 Value *StrLenFunc = nullptr;
1946 if (Verifier.OpWidth == 8) {
1947 StrLenFunc = emitStrLen(MaterialzedBase, Builder, *DL, TLI);
1948 } else {
1949 StrLenFunc = emitWcsLen(MaterialzedBase, Builder, *DL, TLI);
1950 }
1951 assert(StrLenFunc && "Failed to emit strlen function.");
1952
1953 const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
1954 SmallVector<PHINode *, 4> Cleanup;
1955 for (PHINode &PN : LoopExitBB->phis()) {
1956 // We can now materialize the loop output as all phi have scev {base,+,a}.
1957 // We expand the phi as:
1958 // %strlen = call i64 @strlen(%str)
1959 // %phi.new = base expression + step * %strlen
1960 const SCEV *Ev = SE->getSCEV(&PN);
1961 const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
1962 const SCEVConstant *Step =
1963 dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE));
1964 const SCEV *Base = AddRecEv->getStart();
1965
1966 // It is safe to truncate to the base type since, if the base is narrower
1967 // than size_t, the equivalent user code will have to truncate anyway.
1968 const SCEV *NewEv = SE->getAddExpr(
1969 Base, SE->getMulExpr(Step, SE->getTruncateOrSignExtend(
1970 StrlenEv, Base->getType())));
1971
1972 Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
1973 Builder.GetInsertPoint());
1974 Expander.clear();
1975 PN.replaceAllUsesWith(MaterializedPHI);
1976 Cleanup.push_back(&PN);
1977 }
1978
1979 // All LCSSA loop phis are dead; the leftover dead loop body can be cleaned
1980 // up by later passes.
1981 for (PHINode *PN : Cleanup)
1982 RecursivelyDeleteDeadPHINode(PN);
1983
1984 // LoopDeletion only deletes invariant loops with a known trip count. We can
1985 // update the condition so it will reliably delete the invariant loop.
1986 assert(LoopTerm->getNumSuccessors() == 2 &&
1987 (LoopTerm->getSuccessor(0) == LoopBody ||
1988 LoopTerm->getSuccessor(1) == LoopBody) &&
1989 "loop body must have a successor that is itself");
1990 ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
1991 ? Builder.getFalse()
1992 : Builder.getTrue();
1993 LoopTerm->setCondition(NewLoopCond);
1994 SE->forgetLoop(CurLoop);
1995
1996 ++NumStrLen;
1997 LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
1998 ORE.emit([&]() {
1999 return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
2000 CurLoop->getStartLoc(), Preheader)
2001 << "Transformed " << StrLenFunc->getName() << " loop idiom";
2002 });
2003
2004 return true;
2005}
2006
2007/// Check if the given conditional branch is based on an unsigned less-than
2008/// comparison between a variable and a constant, and if the comparison is false
2009/// the control yields to the loop entry. If the branch matches the behavior,
2010/// the variable involved in the comparison is returned.
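/// For example, the following is matched, returning %x with Threshold = 4
/// (illustrative IR):
/// \code
///   %cmp = icmp ult i32 %x, 4
///   br i1 %cmp, label %exit, label %loop.entry
/// \endcode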
2011static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
2012 APInt &Threshold) {
2013 if (!BI || !BI->isConditional())
2014 return nullptr;
2015
2016 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
2017 if (!Cond)
2018 return nullptr;
2019
2020 ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
2021 if (!CmpConst)
2022 return nullptr;
2023
2024 BasicBlock *FalseSucc = BI->getSuccessor(1);
2025 ICmpInst::Predicate Pred = Cond->getPredicate();
2026
2027 if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
2028 Threshold = CmpConst->getValue();
2029 return Cond->getOperand(0);
2030 }
2031
2032 return nullptr;
2033}
2034
2035// Check if the recurrence variable `VarX` is in the right form to create
2036// the idiom. Returns the value coerced to a PHINode if so.
2037static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
2038 BasicBlock *LoopEntry) {
2039 auto *PhiX = dyn_cast<PHINode>(VarX);
2040 if (PhiX && PhiX->getParent() == LoopEntry &&
2041 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
2042 return PhiX;
2043 return nullptr;
2044}
2045
2046/// Return true if the idiom is detected in the loop.
2047///
2048/// Additionally:
2049/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2050/// or nullptr if there is no such.
2051/// 2) \p CntPhi is set to the corresponding phi node
2052/// or nullptr if there is no such.
2053/// 3) \p InitX is set to the value whose CTLZ could be used.
2054/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2055/// 5) \p Threshold is set to the constant involved in the unsigned less-than
2056/// comparison.
2057///
2058/// The core idiom we are trying to detect is:
2059/// \code
2060/// if (x0 < 2)
2061/// goto loop-exit // the precondition of the loop
2062/// cnt0 = init-val
2063/// do {
2064/// x = phi (x0, x.next); //PhiX
2065/// cnt = phi (cnt0, cnt.next)
2066///
2067/// cnt.next = cnt + 1;
2068/// ...
2069/// x.next = x >> 1; // DefX
2070/// } while (x >= 4)
2071/// loop-exit:
2072/// \endcode
2073static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
2074 Intrinsic::ID &IntrinID,
2075 Value *&InitX, Instruction *&CntInst,
2076 PHINode *&CntPhi, Instruction *&DefX,
2077 APInt &Threshold) {
2078 BasicBlock *LoopEntry;
2079
2080 DefX = nullptr;
2081 CntInst = nullptr;
2082 CntPhi = nullptr;
2083 LoopEntry = *(CurLoop->block_begin());
2084
2085 // step 1: Check if the loop-back branch is in desirable form.
2086 if (Value *T = matchShiftULTCondition(
2087 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
2088 Threshold))
2089 DefX = dyn_cast<Instruction>(T);
2090 else
2091 return false;
2092
2093 // step 2: Check the recurrence of variable X
2094 if (!DefX || !isa<PHINode>(DefX))
2095 return false;
2096
2097 PHINode *VarPhi = cast<PHINode>(DefX);
2098 int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
2099 if (Idx == -1)
2100 return false;
2101
2102 DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
2103 if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
2104 return false;
2105
2106 // step 3: detect instructions corresponding to "x.next = x >> 1"
2107 if (DefX->getOpcode() != Instruction::LShr)
2108 return false;
2109
2110 IntrinID = Intrinsic::ctlz;
2111 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
2112 if (!Shft || !Shft->isOne())
2113 return false;
2114
2115 InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2116
2117 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
2118 // or cnt.next = cnt + -1.
2119 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2120 // then all uses of "cnt.next" could be optimized to the trip count
2121 // plus "cnt0". Currently it is not optimized.
2122 // This step could be used to detect POPCNT instruction:
2123 // cnt.next = cnt + (x.next & 1)
2124 for (Instruction &Inst :
2125 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2126 if (Inst.getOpcode() != Instruction::Add)
2127 continue;
2128
2129 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2130 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2131 continue;
2132
2133 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2134 if (!Phi)
2135 continue;
2136
2137 CntInst = &Inst;
2138 CntPhi = Phi;
2139 break;
2140 }
2141 if (!CntInst)
2142 return false;
2143
2144 return true;
2145}
2146
2147/// Return true iff the idiom is detected in the loop.
2148///
2149/// Additionally:
2150/// 1) \p CntInst is set to the instruction counting the population bit.
2151/// 2) \p CntPhi is set to the corresponding phi node.
2152/// 3) \p Var is set to the value whose population bits are being counted.
2153///
2154/// The core idiom we are trying to detect is:
2155/// \code
2156/// if (x0 == 0)
2157/// goto loop-exit // the precondition of the loop
2158/// cnt0 = init-val;
2159/// do {
2160/// x1 = phi (x0, x2);
2161/// cnt1 = phi(cnt0, cnt2);
2162///
2163/// cnt2 = cnt1 + 1;
2164/// ...
2165/// x2 = x1 & (x1 - 1);
2166/// ...
2167/// } while(x != 0);
2168///
2169/// loop-exit:
2170/// \endcode
2171static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
2172 Instruction *&CntInst, PHINode *&CntPhi,
2173 Value *&Var) {
2174 // step 1: Check to see if the loop-back branch matches this pattern:
2175 // "if (a!=0) goto loop-entry".
2176 BasicBlock *LoopEntry;
2177 Instruction *DefX2, *CountInst;
2178 Value *VarX1, *VarX0;
2179 PHINode *PhiX, *CountPhi;
2180
2181 DefX2 = CountInst = nullptr;
2182 VarX1 = VarX0 = nullptr;
2183 PhiX = CountPhi = nullptr;
2184 LoopEntry = *(CurLoop->block_begin());
2185
2186 // step 1: Check if the loop-back branch is in desirable form.
2187 {
2188 if (Value *T = matchCondition(
2189 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
2190 DefX2 = dyn_cast<Instruction>(T);
2191 else
2192 return false;
2193 }
2194
2195 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
2196 {
2197 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
2198 return false;
2199
2200 BinaryOperator *SubOneOp;
2201
2202 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
2203 VarX1 = DefX2->getOperand(1);
2204 else {
2205 VarX1 = DefX2->getOperand(0);
2206 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
2207 }
2208 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
2209 return false;
2210
2211 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
2212 if (!Dec ||
2213 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
2214 (SubOneOp->getOpcode() == Instruction::Add &&
2215 Dec->isMinusOne()))) {
2216 return false;
2217 }
2218 }
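  // E.g., x1 = 0b101100 gives x1 - 1 = 0b101011 and x2 = x1 & (x1 - 1) =
  // 0b101000: each iteration clears the lowest set bit, so the loop runs
  // exactly popcount(x0) times.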
2219
2220 // step 3: Check the recurrence of variable X
2221 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
2222 if (!PhiX)
2223 return false;
2224
2225 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
2226 {
2227 CountInst = nullptr;
2228 for (Instruction &Inst :
2229 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2230 if (Inst.getOpcode() != Instruction::Add)
2231 continue;
2232
2233 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2234 if (!Inc || !Inc->isOne())
2235 continue;
2236
2237 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2238 if (!Phi)
2239 continue;
2240
2241 // Check if the result of the instruction is live out of the loop.
2242 bool LiveOutLoop = false;
2243 for (User *U : Inst.users()) {
2244 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
2245 LiveOutLoop = true;
2246 break;
2247 }
2248 }
2249
2250 if (LiveOutLoop) {
2251 CountInst = &Inst;
2252 CountPhi = Phi;
2253 break;
2254 }
2255 }
2256
2257 if (!CountInst)
2258 return false;
2259 }
2260
2261 // step 5: check if the precondition is in this form:
2262 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2263 {
2264 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2265 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
2266 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
2267 return false;
2268
2269 CntInst = CountInst;
2270 CntPhi = CountPhi;
2271 Var = T;
2272 }
2273
2274 return true;
2275}
2276
2277/// Return true if the idiom is detected in the loop.
2278///
2279/// Additionally:
2280/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2281/// or nullptr if there is no such.
2282/// 2) \p CntPhi is set to the corresponding phi node
2283/// or nullptr if there is no such.
2284/// 3) \p Var is set to the value whose CTLZ could be used.
2285/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2286///
2287/// The core idiom we are trying to detect is:
2288/// \code
2289/// if (x0 == 0)
2290/// goto loop-exit // the precondition of the loop
2291/// cnt0 = init-val;
2292/// do {
2293/// x = phi (x0, x.next); //PhiX
2294/// cnt = phi(cnt0, cnt.next);
2295///
2296/// cnt.next = cnt + 1;
2297/// ...
2298/// x.next = x >> 1; // DefX
2299/// ...
2300/// } while(x.next != 0);
2301///
2302/// loop-exit:
2303/// \endcode
2304static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
2305 Intrinsic::ID &IntrinID, Value *&InitX,
2306 Instruction *&CntInst, PHINode *&CntPhi,
2307 Instruction *&DefX) {
2308 BasicBlock *LoopEntry;
2309 Value *VarX = nullptr;
2310
2311 DefX = nullptr;
2312 CntInst = nullptr;
2313 CntPhi = nullptr;
2314 LoopEntry = *(CurLoop->block_begin());
2315
2316 // step 1: Check if the loop-back branch is in desirable form.
2317 if (Value *T = matchCondition(
2318 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
2319 DefX = dyn_cast<Instruction>(T);
2320 else
2321 return false;
2322
2323 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2324 if (!DefX || !DefX->isShift())
2325 return false;
2326 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2327 Intrinsic::ctlz;
2328 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
2329 if (!Shft || !Shft->isOne())
2330 return false;
2331 VarX = DefX->getOperand(0);
2332
2333 // step 3: Check the recurrence of variable X
2334 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2335 if (!PhiX)
2336 return false;
2337
2338 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
2339
2340 // Make sure the initial value can't be negative otherwise the ashr in the
2341 // loop might never reach zero which would make the loop infinite.
2342 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
2343 return false;
2344
2345 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
2346 // or cnt.next = cnt + -1.
2347 // TODO: We can skip the step. If loop trip count is known (CTLZ),
2348 // then all uses of "cnt.next" could be optimized to the trip count
2349 // plus "cnt0". Currently it is not optimized.
2350 // This step could be used to detect POPCNT instruction:
2351 // cnt.next = cnt + (x.next & 1)
2352 for (Instruction &Inst :
2353 llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) {
2354 if (Inst.getOpcode() != Instruction::Add)
2355 continue;
2356
2357 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
2358 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
2359 continue;
2360
2361 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
2362 if (!Phi)
2363 continue;
2364
2365 CntInst = &Inst;
2366 CntPhi = Phi;
2367 break;
2368 }
2369 if (!CntInst)
2370 return false;
2371
2372 return true;
2373}
2374
2375// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
2376// profitable if we delete the loop.
2377bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2378 Value *InitX, bool ZeroCheck,
2379 size_t CanonicalSize) {
2380 const Value *Args[] = {InitX,
2381 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
2382
2383 // @llvm.dbg doesn't count as they have no semantic effect.
2384 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
2385 uint32_t HeaderSize =
2386 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
2387
2388 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2389 InstructionCost Cost = TTI->getIntrinsicInstrCost(
2390 Attrs, TargetTransformInfo::TCK_SizeAndLatency);
2391 if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2392 return false;
2393
2394 return true;
2395}
2396
2397/// Convert CTLZ / CTTZ idiom loop into countable loop.
2398/// If a CTLZ / CTTZ intrinsic is inserted as the new trip count, returns true;
2399/// otherwise, returns false.
2400bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2401 Value *InitX, Instruction *DefX,
2402 PHINode *CntPhi,
2403 Instruction *CntInst) {
2404 bool IsCntPhiUsedOutsideLoop = false;
2405 for (User *U : CntPhi->users())
2406 if (!CurLoop->contains(cast<Instruction>(U))) {
2407 IsCntPhiUsedOutsideLoop = true;
2408 break;
2409 }
2410 bool IsCntInstUsedOutsideLoop = false;
2411 for (User *U : CntInst->users())
2412 if (!CurLoop->contains(cast<Instruction>(U))) {
2413 IsCntInstUsedOutsideLoop = true;
2414 break;
2415 }
2416 // If both CntInst and CntPhi are used outside the loop the profitability
2417 // is questionable.
2418 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2419 return false;
2420
2421 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
2422 // when X is 0. If we cannot guarantee X != 0, we need to check this
2423 // when expanding the intrinsic.
2424 bool ZeroCheck = false;
2425 // It is safe to assume the Preheader exists, as it was checked in the
2426 // parent function RunOnLoop.
2427 BasicBlock *PH = CurLoop->getLoopPreheader();
2428
2429 // If we are using the count instruction outside the loop, make sure we
2430 // have a zero check as a precondition. Without the check the loop would run
2431 // one iteration before any check of the input value, so 0 and 1 would have
2432 // identical behavior in the original loop; the check lets us tell them apart.
2433 if (!IsCntPhiUsedOutsideLoop) {
2434 auto *PreCondBB = PH->getSinglePredecessor();
2435 if (!PreCondBB)
2436 return false;
2437 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2438 if (!PreCondBI)
2439 return false;
2440 if (matchCondition(PreCondBI, PH) != InitX)
2441 return false;
2442 ZeroCheck = true;
2443 }
2444
2445 // FFS idiom loop has only 6 instructions:
2446 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2447 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2448 // %shr = ashr %n.addr.0, 1
2449 // %tobool = icmp eq %shr, 0
2450 // %inc = add nsw %i.0, 1
2451 // br i1 %tobool
2452 size_t IdiomCanonicalSize = 6;
2453 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2454 return false;
2455
2456 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2457 DefX->getDebugLoc(), ZeroCheck,
2458 IsCntPhiUsedOutsideLoop);
2459 return true;
2460}
2461
2462/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
2463/// to countable (with a CTLZ / CTTZ trip count). If a CTLZ / CTTZ intrinsic is
2464/// inserted as the new trip count, returns true; otherwise, returns false.
2465bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2466 // Give up if the loop has multiple blocks or multiple backedges.
2467 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2468 return false;
2469
2470 Intrinsic::ID IntrinID;
2471 Value *InitX;
2472 Instruction *DefX = nullptr;
2473 PHINode *CntPhi = nullptr;
2474 Instruction *CntInst = nullptr;
2475
2476 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
2477 DefX))
2478 return false;
2479
2480 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2481}
2482
2483bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2484 // Give up if the loop has multiple blocks or multiple backedges.
2485 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2486 return false;
2487
2488 Intrinsic::ID IntrinID;
2489 Value *InitX;
2490 Instruction *DefX = nullptr;
2491 PHINode *CntPhi = nullptr;
2492 Instruction *CntInst = nullptr;
2493
2494 APInt LoopThreshold;
2495 if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2496 CntPhi, DefX, LoopThreshold))
2497 return false;
2498
2499 if (LoopThreshold == 2) {
2500 // Treat as regular FFS.
2501 return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2502 }
2503
2504 // Look for Floor Log2 Idiom.
2505 if (LoopThreshold != 4)
2506 return false;
2507
2508 // Abort if CntPhi is used outside of the loop.
2509 for (User *U : CntPhi->users())
2510 if (!CurLoop->contains(cast<Instruction>(U)))
2511 return false;
2512
2513 // It is safe to assume the Preheader exists, as it was checked in the
2514 // parent function RunOnLoop.
2515 BasicBlock *PH = CurLoop->getLoopPreheader();
2516 auto *PreCondBB = PH->getSinglePredecessor();
2517 if (!PreCondBB)
2518 return false;
2519 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2520 if (!PreCondBI)
2521 return false;
2522
2523 APInt PreLoopThreshold;
2524 if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2525 PreLoopThreshold != 2)
2526 return false;
2527
2528 bool ZeroCheck = true;
2529
2530 // The loop has only 6 instructions:
2531 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2532 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2533 // %shr = ashr %n.addr.0, 1
2534 // %tobool = icmp ult %n.addr.0, C
2535 // %inc = add nsw %i.0, 1
2536 // br i1 %tobool
2537 size_t IdiomCanonicalSize = 6;
2538 if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2539 return false;
2540
2541 // log2(x) = w − 1 − clz(x)
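  // E.g., for 32-bit x = 20 (0b10100): clz(x) = 27, so
  // log2(x) = 32 - 1 - 27 = 4.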
2542 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
2543 DefX->getDebugLoc(), ZeroCheck,
2544 /*IsCntPhiUsedOutsideLoop=*/false,
2545 /*InsertSub=*/true);
2546 return true;
2547}
2548
2549/// Recognizes a population count idiom in a non-countable loop.
2550///
2551/// If detected, transforms the relevant code to issue the popcount intrinsic
2552/// function call, and returns true; otherwise, returns false.
2553bool LoopIdiomRecognize::recognizePopcount() {
2554 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
2555 return false;
2556
2557 // Population counting is usually conducted by a few arithmetic
2558 // instructions. Such instructions can be easily "absorbed" by vacant slots
2559 // in a non-compact loop. Therefore, recognizing the popcount idiom only
2560 // makes sense in a compact loop.
2561
2562 // Give up if the loop has multiple blocks or multiple backedges.
2563 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2564 return false;
2565
2566 BasicBlock *LoopBody = *(CurLoop->block_begin());
2567 if (LoopBody->size() >= 20) {
2568 // The loop is too big, bail out.
2569 return false;
2570 }
2571
2572 // It should have a preheader containing nothing but an unconditional branch.
2573 BasicBlock *PH = CurLoop->getLoopPreheader();
2574 if (!PH || &PH->front() != PH->getTerminator())
2575 return false;
2576 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
2577 if (!EntryBI || EntryBI->isConditional())
2578 return false;
2579
2580 // It should have a precondition block where the generated popcount intrinsic
2581 // function can be inserted.
2582 auto *PreCondBB = PH->getSinglePredecessor();
2583 if (!PreCondBB)
2584 return false;
2585 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2586 if (!PreCondBI || PreCondBI->isUnconditional())
2587 return false;
2588
2589 Instruction *CntInst;
2590 PHINode *CntPhi;
2591 Value *Val;
2592 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
2593 return false;
2594
2595 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
2596 return true;
2597}
2598
2599static CallInst *createPopcntIntrinsic(IRBuilderBase &IRBuilder, Value *Val,
2600 const DebugLoc &DL) {
2601 Value *Ops[] = {Val};
2602 Type *Tys[] = {Val->getType()};
2603
2604 CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
2605 CI->setDebugLoc(DL);
2606
2607 return CI;
2608}
2609
2610static CallInst *createFFSIntrinsic(IRBuilderBase &IRBuilder, Value *Val,
2611 const DebugLoc &DL, bool ZeroCheck,
2612 Intrinsic::ID IID) {
2613 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2614 Type *Tys[] = {Val->getType()};
2615
2616 CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
2617 CI->setDebugLoc(DL);
2618
2619 return CI;
2620}
2621
2622/// Transform the following loop (Using CTLZ, CTTZ is similar):
2623/// loop:
2624/// CntPhi = PHI [Cnt0, CntInst]
2625/// PhiX = PHI [InitX, DefX]
2626/// CntInst = CntPhi + 1
2627/// DefX = PhiX >> 1
2628/// LOOP_BODY
2629/// Br: loop if (DefX != 0)
2630/// Use(CntPhi) or Use(CntInst)
2631///
2632/// Into:
2633/// If CntPhi used outside the loop:
2634/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2635/// Count = CountPrev + 1
2636/// else
2637/// Count = BitWidth(InitX) - CTLZ(InitX)
2638/// loop:
2639/// CntPhi = PHI [Cnt0, CntInst]
2640/// PhiX = PHI [InitX, DefX]
2641/// PhiCount = PHI [Count, Dec]
2642/// CntInst = CntPhi + 1
2643/// DefX = PhiX >> 1
2644/// Dec = PhiCount - 1
2645/// LOOP_BODY
2646/// Br: loop if (Dec != 0)
2647/// Use(CountPrev + Cnt0) // Use(CntPhi)
2648/// or
2649/// Use(Count + Cnt0) // Use(CntInst)
2650///
2651/// If LOOP_BODY is empty the loop will be deleted.
2652/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2653void LoopIdiomRecognize::transformLoopToCountable(
2654 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2655 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2656 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2657 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2658
2659 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2660 IRBuilder<> Builder(PreheaderBr);
2661 Builder.SetCurrentDebugLocation(DL);
2662
2663 // If there are no uses of CntPhi create:
2664 // Count = BitWidth - CTLZ(InitX);
2665 // NewCount = Count;
2666 // If there are uses of CntPhi create:
2667 // NewCount = BitWidth - CTLZ(InitX >> 1);
2668 // Count = NewCount + 1;
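  // E.g., with 8-bit InitX = 0b00010000 and CTLZ: Count = 8 - 3 = 5, matching
  // the five iterations 16 -> 8 -> 4 -> 2 -> 1 of the original loop. If CntPhi
  // is used outside the loop, NewCount is instead computed from
  // InitX >> 1 = 0b00001000 as 8 - 4 = 4, and Count = 4 + 1 = 5.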
2669 Value *InitXNext;
2670 if (IsCntPhiUsedOutsideLoop) {
2671 if (DefX->getOpcode() == Instruction::AShr)
2672 InitXNext = Builder.CreateAShr(InitX, 1);
2673 else if (DefX->getOpcode() == Instruction::LShr)
2674 InitXNext = Builder.CreateLShr(InitX, 1);
2675 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2676 InitXNext = Builder.CreateShl(InitX, 1);
2677 else
2678 llvm_unreachable("Unexpected opcode!");
2679 } else
2680 InitXNext = InitX;
2681 Value *Count =
2682 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2683 Type *CountTy = Count->getType();
2684 Count = Builder.CreateSub(
2685 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2686 if (InsertSub)
2687 Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
2688 Value *NewCount = Count;
2689 if (IsCntPhiUsedOutsideLoop)
2690 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2691
2692 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2693
2694 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2695 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2696 // If the counter was being incremented in the loop, add NewCount to the
2697 // counter's initial value, but only if the initial value is not zero.
2698 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2699 if (!InitConst || !InitConst->isZero())
2700 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2701 } else {
2702 // If the count was being decremented in the loop, subtract NewCount from
2703 // the counter's initial value.
2704 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2705 }
2706
2707 // Step 2: Insert new IV and loop condition:
2708 // loop:
2709 // ...
2710 // PhiCount = PHI [Count, Dec]
2711 // ...
2712 // Dec = PhiCount - 1
2713 // ...
2714 // Br: loop if (Dec != 0)
2715 BasicBlock *Body = *(CurLoop->block_begin());
2716 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2717 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2718
2719 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2720 TcPhi->insertBefore(Body->begin());
2721
2722 Builder.SetInsertPoint(LbCond);
2723 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2724 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2725
2726 TcPhi->addIncoming(Count, Preheader);
2727 TcPhi->addIncoming(TcDec, Body);
2728
2729 CmpInst::Predicate Pred =
2730 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2731 LbCond->setPredicate(Pred);
2732 LbCond->setOperand(0, TcDec);
2733 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2734
2735 // Step 3: All the references to the original counter outside
2736 // the loop are replaced with the NewCount
2737 if (IsCntPhiUsedOutsideLoop)
2738 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2739 else
2740 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2741
2742 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2743 // loop. The loop would otherwise not be deleted even if it becomes empty.
2744 SE->forgetLoop(CurLoop);
2745}
2746
2747void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2748 Instruction *CntInst,
2749 PHINode *CntPhi, Value *Var) {
2750 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2751 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2752 const DebugLoc &DL = CntInst->getDebugLoc();
2753
2754 // Assuming that before the transformation, the loop is as follows:
2755 // if (x) // the precondition
2756 // do { cnt++; x &= x - 1; } while(x);
2757
2758 // Step 1: Insert the ctpop instruction at the end of the precondition block
2759 IRBuilder<> Builder(PreCondBr);
2760 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2761 {
2762 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2763 NewCount = PopCntZext =
2764 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2765
2766 if (NewCount != PopCnt)
2767 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2768
2769 // TripCnt is exactly the number of iterations the loop has
2770 TripCnt = NewCount;
2771
2772 // If the population counter's initial value is not zero, insert Add Inst.
2773 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2774 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2775 if (!InitConst || !InitConst->isZero()) {
2776 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2777 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2778 }
2779 }
2780
2781 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2782 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2783 // function would be partially dead code, and downstream passes would drag
2784 // it back from the precondition block to the preheader.
2785 {
2786 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2787
2788 Value *Opnd0 = PopCntZext;
2789 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2790 if (PreCond->getOperand(0) != Var)
2791 std::swap(Opnd0, Opnd1);
2792
2793 ICmpInst *NewPreCond = cast<ICmpInst>(
2794 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2795 PreCondBr->setCondition(NewPreCond);
2796
2797 RecursivelyDeleteTriviallyDeadInstructions(PreCond);
2798 }
2799
2800 // Step 3: Note that the population count is exactly the trip count of the
2801 // loop in question, which enables us to convert the loop from a
2802 // noncountable loop into a countable one. The benefit is twofold:
2803 //
2804 // - If the loop only counts population, the entire loop becomes dead after
2805 // the transformation. It is a lot easier to prove a countable loop dead
2806 // than to prove a noncountable one. (In some C dialects, an infinite loop
2807 // isn't dead even if it computes nothing useful. In general, DCE needs
2808 // to prove a noncountable loop finite before safely deleting it.)
2809 //
2810 // - If the loop also performs something else, it remains alive.
2811 // Since it is transformed to countable form, it can be aggressively
2812 // optimized by some optimizations which are in general not applicable
2813 // to a noncountable loop.
2814 //
2815 // After this step, this loop (conceptually) would look like the following:
2816 // newcnt = __builtin_ctpop(x);
2817 // t = newcnt;
2818 // if (x)
2819 // do { cnt++; x &= x-1; t--; } while (t > 0);
2820 BasicBlock *Body = *(CurLoop->block_begin());
2821 {
2822 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2823 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2824 Type *Ty = TripCnt->getType();
2825
2826 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2827 TcPhi->insertBefore(Body->begin());
2828
2829 Builder.SetInsertPoint(LbCond);
2830 Instruction *TcDec = cast<Instruction>(
2831 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2832 "tcdec", false, true));
2833
2834 TcPhi->addIncoming(TripCnt, PreHead);
2835 TcPhi->addIncoming(TcDec, Body);
2836
2837 CmpInst::Predicate Pred =
2838 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2839 LbCond->setPredicate(Pred);
2840 LbCond->setOperand(0, TcDec);
2841 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2842 }
2843
2844 // Step 4: All the references to the original population counter outside
2845 // the loop are replaced with the NewCount -- the value returned from
2846 // __builtin_ctpop().
2847 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2848
2849 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2850 // loop. The loop would otherwise not be deleted even if it becomes empty.
2851 SE->forgetLoop(CurLoop);
2852}
2853
2854/// Match loop-invariant value.
2855template <typename SubPattern_t> struct match_LoopInvariant {
2856 SubPattern_t SubPattern;
2857 const Loop *L;
2858
2859 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2860 : SubPattern(SP), L(L) {}
2861
2862 template <typename ITy> bool match(ITy *V) const {
2863 return L->isLoopInvariant(V) && SubPattern.match(V);
2864 }
2865};
2866
2867/// Matches if the value is loop-invariant.
2868template <typename Ty>
2869inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2870 return match_LoopInvariant<Ty>(M, L);
2871}
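// Usage sketch: bind BitPos while requiring the whole "1 << %bitpos"
// expression to be loop-invariant:
//   match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop))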
2872
2873/// Return true if the idiom is detected in the loop.
2874///
2875/// The core idiom we are trying to detect is:
2876/// \code
2877/// entry:
2878/// <...>
2879/// %bitmask = shl i32 1, %bitpos
2880/// br label %loop
2881///
2882/// loop:
2883/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2884/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2885/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2886/// %x.next = shl i32 %x.curr, 1
2887/// <...>
2888/// br i1 %x.curr.isbitunset, label %loop, label %end
2889///
2890/// end:
2891/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2892/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2893/// <...>
2894/// \endcode
2895static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2896 Value *&BitMask, Value *&BitPos,
2897 Value *&CurrX, Instruction *&NextX) {
2899 " Performing shift-until-bittest idiom detection.\n");
2900
2901 // Give up if the loop has multiple blocks or multiple backedges.
2902 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2903 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2904 return false;
2905 }
2906
2907 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2908 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2909 assert(LoopPreheaderBB && "There is always a loop preheader.");
2910
2911 using namespace PatternMatch;
2912
2913 // Step 1: Check if the loop backedge is in desirable form.
2914
2915 CmpPredicate Pred;
2916 Value *CmpLHS, *CmpRHS;
2917 BasicBlock *TrueBB, *FalseBB;
2918 if (!match(LoopHeaderBB->getTerminator(),
2919 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2920 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2921 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2922 return false;
2923 }
2924
2925 // Step 2: Check if the backedge's condition is in desirable form.
2926
2927 auto MatchVariableBitMask = [&]() {
2928 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2929 match(CmpLHS,
2930 m_c_And(m_Value(CurrX),
2931 m_CombineAnd(
2932 m_Value(BitMask),
2933 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2934 CurLoop))));
2935 };
2936
2937 auto MatchDecomposableConstantBitMask = [&]() {
2938 auto Res = llvm::decomposeBitTestICmp(
2939 CmpLHS, CmpRHS, Pred, /*LookThroughTrunc=*/true,
2940 /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
2941 if (Res && Res->Mask.isPowerOf2()) {
2942 assert(ICmpInst::isEquality(Res->Pred));
2943 Pred = Res->Pred;
2944 CurrX = Res->X;
2945 BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
2946 BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
2947 return true;
2948 }
2949 return false;
2950 };
2951
2952 if (!MatchVariableBitMask() && !MatchDecomposableConstantBitMask()) {
2953 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2954 return false;
2955 }
2956
2957 // Step 3: Check if the recurrence is in desirable form.
2958 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2959 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2960 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2961 return false;
2962 }
2963
2964 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2965 NextX =
2966 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2967
2968 assert(CurLoop->isLoopInvariant(BaseX) &&
2969 "Expected BaseX to be available in the preheader!");
2970
2971 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2972 // FIXME: support right-shift?
2973 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2974 return false;
2975 }
2976
2977 // Step 4: Check if the backedge's destinations are in desirable form.
2978
2980 "Should only get equality predicates here.");
2981
2982 // cmp-br is commutative, so canonicalize to a single variant.
2983 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2984 Pred = ICmpInst::getInversePredicate(Pred);
2985 std::swap(TrueBB, FalseBB);
2986 }
2987
2988 // We expect to exit loop when comparison yields false,
2989 // so when it yields true we should branch back to loop header.
2990 if (TrueBB != LoopHeaderBB) {
2991 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2992 return false;
2993 }
2994
2995 // Okay, idiom checks out.
2996 return true;
2997}
2998
2999/// Look for the following loop:
3000/// \code
3001/// entry:
3002/// <...>
3003/// %bitmask = shl i32 1, %bitpos
3004/// br label %loop
3005///
3006/// loop:
3007/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
3008/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
3009/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
3010/// %x.next = shl i32 %x.curr, 1
3011/// <...>
3012/// br i1 %x.curr.isbitunset, label %loop, label %end
3013///
3014/// end:
3015/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3016/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3017/// <...>
3018/// \endcode
3019///
3020/// And transform it into:
3021/// \code
3022/// entry:
3023/// %bitmask = shl i32 1, %bitpos
3024/// %lowbitmask = add i32 %bitmask, -1
3025/// %mask = or i32 %lowbitmask, %bitmask
3026/// %x.masked = and i32 %x, %mask
3027/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
3028/// i1 true)
3029/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
3030/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
3031/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
3032/// %tripcount = add i32 %backedgetakencount, 1
3033/// %x.curr = shl i32 %x, %backedgetakencount
3034/// %x.next = shl i32 %x, %tripcount
3035/// br label %loop
3036///
3037/// loop:
3038/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
3039/// %loop.iv.next = add nuw i32 %loop.iv, 1
3040/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
3041/// <...>
3042/// br i1 %loop.ivcheck, label %end, label %loop
3043///
3044/// end:
3045/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
3046/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
3047/// <...>
3048/// \endcode
3049bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
3050 bool MadeChange = false;
3051
3052 Value *X, *BitMask, *BitPos, *XCurr;
3053 Instruction *XNext;
3054 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
3055 XNext)) {
3057 " shift-until-bittest idiom detection failed.\n");
3058 return MadeChange;
3059 }
3060 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
3061
3062 // Ok, it is the idiom we were looking for, we *could* transform this loop,
3063 // but is it profitable to transform?
3064
3065 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3066 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3067 assert(LoopPreheaderBB && "There is always a loop preheader.");
3068
3069 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3070 assert(SuccessorBB && "There is only a single successor.");
3071
3072 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3073 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
3074
3075 Intrinsic::ID IntrID = Intrinsic::ctlz;
3076 Type *Ty = X->getType();
3077 unsigned Bitwidth = Ty->getScalarSizeInBits();
3078
3078
3079 TargetTransformInfo::TargetCostKind CostKind =
3080 TargetTransformInfo::TCK_SizeAndLatency;
3081
3082 // The rewrite is considered to be unprofitable if and only if the
3083 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
3084 // making the loop countable, even if nothing else changes.
3085 IntrinsicCostAttributes Attrs(
3086 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
3087 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3090 " Intrinsic is too costly, not beneficial\n");
3091 return MadeChange;
3092 }
3093 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
3094 TargetTransformInfo::TCC_Basic) {
3095 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
3096 return MadeChange;
3097 }
3098
3099 // Ok, transform appears worthwhile.
3100 MadeChange = true;
3101
3102 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
3103 // BitMask may be computed from BitPos. Freeze BitPos so we can increase
3104 // its use count.
3105 std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
3106 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
3107 InsertPt = BitPosI->getInsertionPointAfterDef();
3108 else
3109 InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
3110 if (!InsertPt)
3111 return false;
3112 FreezeInst *BitPosFrozen =
3113 new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
3114 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
3115 return U.getUser() != BitPosFrozen;
3116 });
3117 BitPos = BitPosFrozen;
3118 }
3119
3120 // Step 1: Compute the loop trip count.
3121
3122 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
3123 BitPos->getName() + ".lowbitmask");
3124 Value *Mask =
3125 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
3126 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
3127 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
3128 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
3129 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
3130 Value *XMaskedNumActiveBits = Builder.CreateSub(
3131 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
3132 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
3133 /*HasNSW=*/Bitwidth != 2);
3134 Value *XMaskedLeadingOnePos =
3135 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
3136 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
3137 /*HasNSW=*/Bitwidth > 2);
3138
3139 Value *LoopBackedgeTakenCount = Builder.CreateSub(
3140 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
3141 /*HasNUW=*/true, /*HasNSW=*/true);
3142 // We know the loop's backedge-taken count, but what's the loop's trip count?
3143 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
3144 Value *LoopTripCount =
3145 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3146 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3147 /*HasNSW=*/Bitwidth != 2);
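  // E.g., for i32 %x = 1 and %bitpos = 4: %mask = 0b11111, %x.masked = 1,
  // numleadingzeros = 31, numactivebits = 1, leadingonepos = 0, so
  // backedgetakencount = 4 - 0 = 4 and tripcount = 5: the loop shifts %x left
  // four times and observes bit 4 set on the fifth iteration.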
3148
3149 // Step 2: Compute the recurrence's final value without a loop.
3150
3151 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
3152 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
3153 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
3154 NewX->takeName(XCurr);
3155 if (auto *I = dyn_cast<Instruction>(NewX))
3156 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3157
3158 Value *NewXNext;
3159 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
3160 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
3161 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
3162 // that isn't the case, we'll need to emit an alternative, safe IR.
3163 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
3164 PatternMatch::match(
3165 BitPos, PatternMatch::m_SpecificInt_ICMP(
3166 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
3167 Ty->getScalarSizeInBits() - 1))))
3168 NewXNext = Builder.CreateShl(X, LoopTripCount);
3169 else {
3170 // Otherwise, just additionally shift by one. It's the smallest solution;
3171 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
3172 // bitwidth(X) - 1) and select 0 instead.
3173 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
3174 }
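 // E.g. for i8 X = 1 with BitPos = 7, LoopTripCount is 8, so
 // `X << LoopTripCount` would be poison; `(X << 7) << 1` is 0 instead,
 // matching the final value the recurrence takes in the original loop.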
3175
3176 NewXNext->takeName(XNext);
3177 if (auto *I = dyn_cast<Instruction>(NewXNext))
3178 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
3179
3180 // Step 3: Adjust the successor basic block to receive the computed
3181 // recurrence's final value instead of the recurrence itself.
3182
3183 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
3184 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
3185
3186 // Step 4: Rewrite the loop into a countable form, with canonical IV.
3187
3188 // The new canonical induction variable.
3189 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3190 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3191
3192 // The induction itself.
3193 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
3194 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3195 auto *IVNext =
3196 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
3197 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3198
3199 // The loop trip count check.
3200 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
3201 CurLoop->getName() + ".ivcheck");
3202 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
3203 LoopHeaderBB->getTerminator()->eraseFromParent();
3204
3205 // Populate the IV PHI.
3206 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3207 IV->addIncoming(IVNext, LoopHeaderBB);
3208
3209 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
3210 // loop. The loop would otherwise not be deleted even if it becomes empty.
3211
3212 SE->forgetLoop(CurLoop);
3213
3214 // Other passes will take care of actually deleting the loop if possible.
3215
3216 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
3217
3218 ++NumShiftUntilBitTest;
3219 return MadeChange;
3220}
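// Illustrative sketch (not part of the original source): in C terms, the loop
// handled above is roughly
//   while ((x & bitmask) == 0)
//     x <<= 1;
// and the rewrite computes the final x in straight-line code via ctlz,
// leaving behind only a canonical counted loop for any remaining body
// instructions.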
3221
3222/// Return true if the idiom is detected in the loop.
3223///
3224/// The core idiom we are trying to detect is:
3225/// \code
3226/// entry:
3227/// <...>
3228/// %start = <...>
3229/// %extraoffset = <...>
3230/// <...>
3231/// br label %loop
3232///
3233/// loop:
3234/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
3235/// %nbits = add nsw i8 %iv, %extraoffset
3236/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3237/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3238/// %iv.next = add i8 %iv, 1
3239/// <...>
3240/// br i1 %val.shifted.iszero, label %end, label %loop
3241///
3242/// end:
3243/// %iv.res = phi i8 [ %iv, %loop ] <...>
3244/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3245/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3246/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3247/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3248/// <...>
3249/// \endcode
3250static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
3251 Instruction *&ValShiftedIsZero,
3252 Intrinsic::ID &IntrinID, Instruction *&IV,
3253 Value *&Start, Value *&Val,
3254 const SCEV *&ExtraOffsetExpr,
3255 bool &InvertedCond) {
3257 " Performing shift-until-zero idiom detection.\n");
3258
3259 // Give up if the loop has multiple blocks or multiple backedges.
3260 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
3261 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
3262 return false;
3263 }
3264
3265 Instruction *ValShifted, *NBits, *IVNext;
3266 Value *ExtraOffset;
3267
3268 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3269 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3270 assert(LoopPreheaderBB && "There is always a loop preheader.");
3271
3272 using namespace PatternMatch;
3273
3274 // Step 1: Check if the loop backedge condition is in desirable form.
3275
3276 CmpPredicate Pred;
3277 BasicBlock *TrueBB, *FalseBB;
3278 if (!match(LoopHeaderBB->getTerminator(),
3279 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
3280 m_BasicBlock(FalseBB))) ||
3281 !match(ValShiftedIsZero,
3282 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
3283 !ICmpInst::isEquality(Pred)) {
3284 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
3285 return false;
3286 }
3287
3288 // Step 2: Check if the comparison's operand is in desirable form.
3289 // FIXME: Val could be a one-input PHI node, which we should look past.
3290 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
3291 m_Instruction(NBits)))) {
3292 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
3293 return false;
3294 }
3295 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
3296 : Intrinsic::ctlz;
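 // (Under shl, the lowest set bit of Val is the last one shifted out, so the
 // iteration count is governed by cttz; under {l,a}shr it is the highest set
 // bit, hence ctlz.)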
3297
3298 // Step 3: Check if the shift amount is in desirable form.
3299
3300 if (match(NBits, m_c_Add(m_Instruction(IV),
3301 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3302 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
3303 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
3304 else if (match(NBits,
3305 m_Sub(m_Instruction(IV),
3306 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
3307 NBits->hasNoSignedWrap())
3308 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
3309 else {
3310 IV = NBits;
3311 ExtraOffsetExpr = SE->getZero(NBits->getType());
3312 }
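 // In other words, %nbits is %iv + %extraoffset, %iv - %extraoffset, or %iv
 // itself, and ExtraOffsetExpr is correspondingly -%extraoffset,
 // %extraoffset, or 0: the amount to add to %nbits to recover %iv.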
3313
3314 // Step 4: Check if the recurrence is in desirable form.
3315 auto *IVPN = dyn_cast<PHINode>(IV);
3316 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
3317 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
3318 return false;
3319 }
3320
3321 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
3322 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
3323
3324 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
3325 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
3326 return false;
3327 }
3328
3329 // Step 5: Check if the backedge's destinations are in desirable form.
3330
3332 "Should only get equality predicates here.");
3333
3334 // cmp-br is commutative, so canonicalize to a single variant.
3335 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
3336 if (InvertedCond) {
3337 Pred = ICmpInst::getInversePredicate(Pred);
3338 std::swap(TrueBB, FalseBB);
3339 }
3340
3341 // We expect to exit the loop when the comparison yields true,
3342 // so when it yields false we should branch back to the loop header.
3343 if (FalseBB != LoopHeaderBB) {
3344 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
3345 return false;
3346 }
3347
3348 // The new, countable loop will certainly only run a known number of
3349 // iterations; it won't be infinite. But the old loop might be infinite
3350 // under certain conditions. For logical shifts, the value will become zero
3351 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
3352 // right-shift, if the sign bit is set, the value will never become zero
3353 // (e.g. i8 -1 stays -1 under ashr), and the loop may never finish.
3354 if (ValShifted->getOpcode() == Instruction::AShr &&
3355 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
3356 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
3357 return false;
3358 }
3359
3360 // Okay, idiom checks out.
3361 return true;
3362}
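// Illustrative sketch (not part of the original source): in C terms, the
// idiom detected above is roughly
//   int iv = start;
//   while ((val >> (iv + extraoffset)) != 0) // or <<; or arithmetic >>
//     ++iv;
// i.e. a linear search for the smallest shift amount that zeroes out val.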
3363
3364/// Look for the following loop:
3365/// \code
3366/// entry:
3367/// <...>
3368/// %start = <...>
3369/// %extraoffset = <...>
3370/// <...>
3371/// br label %loop
3372///
3373/// loop:
3374/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %loop ]
3375/// %nbits = add nsw i8 %iv, %extraoffset
3376/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3377/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3378/// %iv.next = add i8 %iv, 1
3379/// <...>
3380/// br i1 %val.shifted.iszero, label %end, label %loop
3381///
3382/// end:
3383/// %iv.res = phi i8 [ %iv, %loop ] <...>
3384/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3385/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3386/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3387/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3388/// <...>
3389/// \endcode
3390///
3391/// And transform it into:
3392/// \code
3393/// entry:
3394/// <...>
3395/// %start = <...>
3396/// %extraoffset = <...>
3397/// <...>
3398/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
3399/// %val.numactivebits = sub i8 8, %val.numleadingzeros
3400/// %extraoffset.neg = sub i8 0, %extraoffset
3401/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
3402/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
3403/// %loop.tripcount = sub i8 %iv.final, %start
3404/// br label %loop
3405///
3406/// loop:
3407/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
3408/// %loop.iv.next = add i8 %loop.iv, 1
3409/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
3410/// %iv = add i8 %loop.iv, %start
3411/// <...>
3412/// br i1 %loop.ivcheck, label %end, label %loop
3413///
3414/// end:
3415/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
3416/// <...>
3417/// \endcode
3418bool LoopIdiomRecognize::recognizeShiftUntilZero() {
3419 bool MadeChange = false;
3420
3421 Instruction *ValShiftedIsZero;
3422 Intrinsic::ID IntrID;
3423 Instruction *IV;
3424 Value *Start, *Val;
3425 const SCEV *ExtraOffsetExpr;
3426 bool InvertedCond;
3427 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
3428 Start, Val, ExtraOffsetExpr, InvertedCond)) {
3430 " shift-until-zero idiom detection failed.\n");
3431 return MadeChange;
3432 }
3433 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
3434
3435 // Ok, it is the idiom we were looking for; we *could* transform this loop,
3436 // but is it profitable to transform?
3437
3438 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3439 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3440 assert(LoopPreheaderBB && "There is always a loop preheader.");
3441
3442 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3443 assert(SuccessorBB && "There is only a single successor.");
3444
3445 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3446 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
3447
3448 Type *Ty = Val->getType();
3449 unsigned Bitwidth = Ty->getScalarSizeInBits();
3450
3451 TargetTransformInfo::TargetCostKind CostKind =
3452 TargetTransformInfo::TCK_SizeAndLatency;
3453
3454 // The rewrite is considered to be unprofitable if and only if the
3455 // intrinsic we'll use is not cheap. Note that we are okay with *just*
3456 // making the loop countable, even if nothing else changes.
3457 IntrinsicCostAttributes Attrs(
3458 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
3459 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
3462 " Intrinsic is too costly, not beneficial\n");
3463 return MadeChange;
3464 }
3465
3466 // Ok, transform appears worthwhile.
3467 MadeChange = true;
3468
3469 bool OffsetIsZero = ExtraOffsetExpr->isZero();
3470
3471 // Step 1: Compute the loop's final IV value / trip count.
3472
3473 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3474 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
3475 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
3476 Value *ValNumActiveBits = Builder.CreateSub(
3477 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
3478 Val->getName() + ".numactivebits", /*HasNUW=*/true,
3479 /*HasNSW=*/Bitwidth != 2);
3480
3481 SCEVExpander Expander(*SE, *DL, "loop-idiom");
3482 Expander.setInsertPoint(&*Builder.GetInsertPoint());
3483 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
3484
3485 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3486 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
3487 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
3488 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
3489 {ValNumActiveBitsOffset, Start},
3490 /*FMFSource=*/nullptr, "iv.final");
3491
3492 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
3493 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
3494 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
3495 // FIXME: or when the offset was `add nuw`
3496
3497 // We know the loop's backedge-taken count, but what's the loop's trip count?
3498 Value *LoopTripCount =
3499 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
3500 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
3501 /*HasNSW=*/Bitwidth != 2);
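 // Worked example: for lshr with i8 %val = 0b00010110, %start = 0 and
 // %extraoffset = 0: ctlz(%val) = 3, numactivebits = 5, iv.final =
 // smax(5, 0) = 5, backedgetakencount = 5, and tripcount = 6. The smax
 // covers the case where the exit condition already holds at %iv = %start.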
3502
3503 // Step 2: Adjust the successor basic block to receive the original
3504 // induction variable's final value instead of the IV itself.
3505
3506 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
3507
3508 // Step 3: Rewrite the loop into a countable form, with canonical IV.
3509
3510 // The new canonical induction variable.
3511 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
3512 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
3513
3514 // The induction itself.
3515 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
3516 auto *CIVNext =
3517 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
3518 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
3519
3520 // The loop trip count check.
3521 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
3522 CurLoop->getName() + ".ivcheck");
3523 auto *NewIVCheck = CIVCheck;
3524 if (InvertedCond) {
3525 NewIVCheck = Builder.CreateNot(CIVCheck);
3526 NewIVCheck->takeName(ValShiftedIsZero);
3527 }
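 // (The negation keeps any remaining in-loop users of the original compare
 // seeing the same boolean value they saw before the rewrite.)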
3528
3529 // The original IV, but rebased to be an offset to the CIV.
3530 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
3531 /*HasNSW=*/true); // FIXME: what about NUW?
3532 IVDePHId->takeName(IV);
3533
3534 // The loop terminator.
3535 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3536 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
3537 LoopHeaderBB->getTerminator()->eraseFromParent();
3538
3539 // Populate the IV PHI.
3540 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
3541 CIV->addIncoming(CIVNext, LoopHeaderBB);
3542
3543 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3544 // loop. The loop would otherwise not be deleted even if it becomes empty.
3545
3546 SE->forgetLoop(CurLoop);
3547
3548 // Step 5: Try to cleanup the loop's body somewhat.
3549 IV->replaceAllUsesWith(IVDePHId);
3550 IV->eraseFromParent();
3551
3552 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
3553 ValShiftedIsZero->eraseFromParent();
3554
3555 // Other passes will take care of actually deleting the loop if possible.
3556
3557 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3558
3559 ++NumShiftUntilZero;
3560 return MadeChange;
3561}