//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
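//
// For example, a simple zero-initialization loop over a byte array, such as
//
//   for (size_t i = 0; i != n; ++i)
//     a[i] = 0;
//
// is replaced with a single call equivalent to memset(a, 0, n), emitted in
// the loop preheader.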
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe
/// to turn into a memset_pattern16, return a ConstantArray of 16 bytes that
/// should be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
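///
/// For example, a strided store of the i32 constant 0x01020304 produces a
/// 16-byte pattern containing four copies of that constant, which
/// memset_pattern16 then replicates across the destination.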
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then
    // we know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the
    // current loop, which indicates a strided load. If we have something
    // else, it's a random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
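  // For example, a hand-unrolled loop that stores to both p[2*i] and p[2*i+1]
  // on each iteration forms one consecutive chain that a single memset covers.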
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if the given stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
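    // For example, with stores S0..S3 and i == 1, the candidates are visited
    // in the order S2, S3, S0.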
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instruction that starts a chain. Now follow the chain
    // and try to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRecs like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare the positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access
  // is strided positively through memory, we say that the modified location
  // starts at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize.
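  // For example, BECount = 9 with a 4-byte store size refines the location to
  // exactly (9 + 1) * 4 = 40 bytes.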
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] in a loop of 100 iterations will
  // always return MayAlias with a store to &A[100]; we need StoreLoc to be
  // "A" with a size of 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory
// location we're trying to memset. Therefore, we need to recompute the base
// pointer, which is just Start - BECount * Size.
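//
// For example, a loop that stores 4-byte elements downward from &A[BECount]
// to &A[0] has Start == &A[BECount]; the base is recomputed as
// Start - BECount * 4 == &A[0].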
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index
  return SE->getMinusSCEV(Start, Index);
}

/// Compute trip count from the backedge taken count.
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The # stored bytes is (BECount+1). Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
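  //
  // For example, if BECount is the i32 value 0xFFFFFFFF, BECount + 1 wraps to
  // 0 in 32 bits, so zext(BECount + 1) would be wrong; the guard below rules
  // that value out, letting the +1 happen safely before the zero extend.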
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do
/// so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header. This allows us to insert code for them in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);

    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in the default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to
    // be a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same
/// stride, this may be transformable into a memcpy. This kicks in for stuff
/// like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
                StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
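
// For example, when widening a memcpy with a positive stride and LoadOff = 8,
// StoreOff = 0, each iteration reads ahead of what it writes, so a single
// forward-copying memmove is equivalent; with LoadOff = 0, StoreOff = 8 the
// widened copy would re-read bytes it had already overwritten, so it is
// rejected by the check above.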
} // namespace

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header. This allows us to insert code for them in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant sizes; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size,
    // then we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}
1508
1509/// Check if the given conditional branch is based on the comparison between
1510/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1511/// true), the control yields to the loop entry. If the branch matches the
1512/// behavior, the variable involved in the comparison is returned. This function
1513/// will be called to see if the precondition and postcondition of the loop are
1514/// in desirable form.
1515 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1516 bool JmpOnZero = false) {
1517 if (!BI || !BI->isConditional())
1518 return nullptr;
1519
1520 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1521 if (!Cond)
1522 return nullptr;
1523
1524 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1525 if (!CmpZero || !CmpZero->isZero())
1526 return nullptr;
1527
1528 BasicBlock *TrueSucc = BI->getSuccessor(0);
1529 BasicBlock *FalseSucc = BI->getSuccessor(1);
1530 if (JmpOnZero)
1531 std::swap(TrueSucc, FalseSucc);
1532
1533 ICmpInst::Predicate Pred = Cond->getPredicate();
1534 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1535 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1536 return Cond->getOperand(0);
1537
1538 return nullptr;
1539}
1540
1541// Check if the recurrence variable `VarX` is in the right form to create
1542// the idiom. Returns the value coerced to a PHINode if so.
1543 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1544 BasicBlock *LoopEntry) {
1545 auto *PhiX = dyn_cast<PHINode>(VarX);
1546 if (PhiX && PhiX->getParent() == LoopEntry &&
1547 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1548 return PhiX;
1549 return nullptr;
1550}
1551
1552/// Return true iff the idiom is detected in the loop.
1553///
1554/// Additionally:
1555/// 1) \p CntInst is set to the instruction counting the population bit.
1556/// 2) \p CntPhi is set to the corresponding phi node.
1557/// 3) \p Var is set to the value whose population bits are being counted.
1558///
1559/// The core idiom we are trying to detect is:
1560/// \code
1561 /// if (x0 == 0)
1562 /// goto loop-exit // the precondition of the loop
1563/// cnt0 = init-val;
1564/// do {
1565/// x1 = phi (x0, x2);
1566/// cnt1 = phi(cnt0, cnt2);
1567///
1568/// cnt2 = cnt1 + 1;
1569/// ...
1570/// x2 = x1 & (x1 - 1);
1571/// ...
1572/// } while(x != 0);
1573///
1574/// loop-exit:
1575/// \endcode
1576static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1577 Instruction *&CntInst, PHINode *&CntPhi,
1578 Value *&Var) {
1579 // Check that the loop-back branch matches the pattern
1580 // "if (a != 0) goto loop-entry".
1581 BasicBlock *LoopEntry;
1582 Instruction *DefX2, *CountInst;
1583 Value *VarX1, *VarX0;
1584 PHINode *PhiX, *CountPhi;
1585
1586 DefX2 = CountInst = nullptr;
1587 VarX1 = VarX0 = nullptr;
1588 PhiX = CountPhi = nullptr;
1589 LoopEntry = *(CurLoop->block_begin());
1590
1591 // step 1: Check if the loop-back branch is in desirable form.
1592 {
1593 if (Value *T = matchCondition(
1594 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1595 DefX2 = dyn_cast<Instruction>(T);
1596 else
1597 return false;
1598 }
1599
1600 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1601 {
1602 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1603 return false;
1604
1605 BinaryOperator *SubOneOp;
1606
1607 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1608 VarX1 = DefX2->getOperand(1);
1609 else {
1610 VarX1 = DefX2->getOperand(0);
1611 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1612 }
1613 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1614 return false;
1615
1616 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1617 if (!Dec ||
1618 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1619 (SubOneOp->getOpcode() == Instruction::Add &&
1620 Dec->isMinusOne()))) {
1621 return false;
1622 }
1623 }
1624
1625 // step 3: Check the recurrence of variable X
1626 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1627 if (!PhiX)
1628 return false;
1629
1630 // step 4: Find the instruction that counts the population: cnt2 = cnt1 + 1
1631 {
1632 CountInst = nullptr;
1633 for (Instruction &Inst : llvm::make_range(
1634 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1635 if (Inst.getOpcode() != Instruction::Add)
1636 continue;
1637
1638 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1639 if (!Inc || !Inc->isOne())
1640 continue;
1641
1642 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1643 if (!Phi)
1644 continue;
1645
1646 // Check if the result of the instruction is live out of the loop.
1647 bool LiveOutLoop = false;
1648 for (User *U : Inst.users()) {
1649 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1650 LiveOutLoop = true;
1651 break;
1652 }
1653 }
1654
1655 if (LiveOutLoop) {
1656 CountInst = &Inst;
1657 CountPhi = Phi;
1658 break;
1659 }
1660 }
1661
1662 if (!CountInst)
1663 return false;
1664 }
1665
1666 // step 5: check if the precondition is in this form:
1667 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1668 {
1669 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1670 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1671 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1672 return false;
1673
1674 CntInst = CountInst;
1675 CntPhi = CountPhi;
1676 Var = T;
1677 }
1678
1679 return true;
1680}
1681
1682/// Return true if the idiom is detected in the loop.
1683///
1684/// Additionally:
1685/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1686/// or nullptr if there is no such.
1687/// 2) \p CntPhi is set to the corresponding phi node
1688/// or nullptr if there is no such.
1689/// 3) \p Var is set to the value whose CTLZ could be used.
1690/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1691///
1692/// The core idiom we are trying to detect is:
1693/// \code
1694/// if (x0 == 0)
1695/// goto loop-exit // the precondition of the loop
1696/// cnt0 = init-val;
1697/// do {
1698/// x = phi (x0, x.next); //PhiX
1699/// cnt = phi(cnt0, cnt.next);
1700///
1701/// cnt.next = cnt + 1;
1702/// ...
1703/// x.next = x >> 1; // DefX
1704/// ...
1705/// } while(x.next != 0);
1706///
1707/// loop-exit:
1708/// \endcode
1709static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1710 Intrinsic::ID &IntrinID, Value *&InitX,
1711 Instruction *&CntInst, PHINode *&CntPhi,
1712 Instruction *&DefX) {
1713 BasicBlock *LoopEntry;
1714 Value *VarX = nullptr;
1715
1716 DefX = nullptr;
1717 CntInst = nullptr;
1718 CntPhi = nullptr;
1719 LoopEntry = *(CurLoop->block_begin());
1720
1721 // step 1: Check if the loop-back branch is in desirable form.
1722 if (Value *T = matchCondition(
1723 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1724 DefX = dyn_cast<Instruction>(T);
1725 else
1726 return false;
1727
1728 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1729 if (!DefX || !DefX->isShift())
1730 return false;
1731 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1732 Intrinsic::ctlz;
1733 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1734 if (!Shft || !Shft->isOne())
1735 return false;
1736 VarX = DefX->getOperand(0);
1737
1738 // step 3: Check the recurrence of variable X
1739 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1740 if (!PhiX)
1741 return false;
1742
1743 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1744
1745 // Make sure the initial value can't be negative; otherwise the ashr in the
1746 // loop might never reach zero, which would make the loop infinite.
1747 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1748 return false;
1749
1750 // step 4: Find the instruction that does the counting: cnt.next = cnt + 1
1751 // or cnt.next = cnt + -1.
1752 // TODO: We can skip this step. If the loop trip count is known (CTLZ),
1753 // then all uses of "cnt.next" could be optimized to the trip count
1754 // plus "cnt0". Currently it is not optimized.
1755 // This step could be used to detect POPCNT instruction:
1756 // cnt.next = cnt + (x.next & 1)
1757 for (Instruction &Inst : llvm::make_range(
1758 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1759 if (Inst.getOpcode() != Instruction::Add)
1760 continue;
1761
1762 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1763 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1764 continue;
1765
1766 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1767 if (!Phi)
1768 continue;
1769
1770 CntInst = &Inst;
1771 CntPhi = Phi;
1772 break;
1773 }
1774 if (!CntInst)
1775 return false;
1776
1777 return true;
1778}
1779
1780/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1781 /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ is inserted as a
1782 /// new trip count, returns true; otherwise, returns false.
1783bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1784 // Give up if the loop has multiple blocks or multiple backedges.
1785 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1786 return false;
1787
1788 Intrinsic::ID IntrinID;
1789 Value *InitX;
1790 Instruction *DefX = nullptr;
1791 PHINode *CntPhi = nullptr;
1792 Instruction *CntInst = nullptr;
1793 // Helps decide if the transformation is profitable. For the ShiftUntilZero
1794 // idiom, this is always 6.
1795 size_t IdiomCanonicalSize = 6;
1796
1797 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1798 CntInst, CntPhi, DefX))
1799 return false;
1800
1801 bool IsCntPhiUsedOutsideLoop = false;
1802 for (User *U : CntPhi->users())
1803 if (!CurLoop->contains(cast<Instruction>(U))) {
1804 IsCntPhiUsedOutsideLoop = true;
1805 break;
1806 }
1807 bool IsCntInstUsedOutsideLoop = false;
1808 for (User *U : CntInst->users())
1809 if (!CurLoop->contains(cast<Instruction>(U))) {
1810 IsCntInstUsedOutsideLoop = true;
1811 break;
1812 }
1813 // If both CntInst and CntPhi are used outside the loop, the profitability
1814 // is questionable.
1815 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1816 return false;
1817
1818 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1819 // when X is 0. If we cannot guarantee X != 0, we need to check this
1820 // when we expand the intrinsic.
1821 bool ZeroCheck = false;
1822 // It is safe to assume the preheader exists, as it was checked in
1823 // the parent function runOnLoop.
1824 BasicBlock *PH = CurLoop->getLoopPreheader();
1825
1826 // If we are using the count instruction outside the loop, make sure we
1827 // have a zero check as a precondition. Without the check the loop would run
1828 // one iteration before any check of the input value. This means 0 and 1 would
1829 // behave identically in the original loop, so a zero check is needed to tell them apart.
1830 if (!IsCntPhiUsedOutsideLoop) {
1831 auto *PreCondBB = PH->getSinglePredecessor();
1832 if (!PreCondBB)
1833 return false;
1834 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1835 if (!PreCondBI)
1836 return false;
1837 if (matchCondition(PreCondBI, PH) != InitX)
1838 return false;
1839 ZeroCheck = true;
1840 }
1841
1842 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1843 // profitable if we delete the loop.
1844
1845 // The canonical loop has only 6 instructions:
1846 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1847 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1848 // %shr = ashr %n.addr.0, 1
1849 // %tobool = icmp eq %shr, 0
1850 // %inc = add nsw %i.0, 1
1851 // br i1 %tobool
1852
1853 const Value *Args[] = {InitX,
1854 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1855
1856 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1857 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1858 uint32_t HeaderSize =
1859 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1860
1861 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1862 InstructionCost Cost =
1863 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1864 if (HeaderSize != IdiomCanonicalSize &&
1865 Cost > TargetTransformInfo::TCC_Basic)
1866 return false;
1867
1868 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1869 DefX->getDebugLoc(), ZeroCheck,
1870 IsCntPhiUsedOutsideLoop);
1871 return true;
1872}
1873
1874/// Recognizes a population count idiom in a non-countable loop.
1875///
1876/// If detected, transforms the relevant code to issue the popcount intrinsic
1877/// function call, and returns true; otherwise, returns false.
1878bool LoopIdiomRecognize::recognizePopcount() {
1879 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1880 return false;
1881
1882 // Counting the population is usually done with a few arithmetic instructions.
1883 // Such instructions can be easily "absorbed" by vacant slots in a
1884 // non-compact loop. Therefore, recognizing the popcount idiom only makes
1885 // sense in a compact loop.
1886
1887 // Give up if the loop has multiple blocks or multiple backedges.
1888 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1889 return false;
1890
1891 BasicBlock *LoopBody = *(CurLoop->block_begin());
1892 if (LoopBody->size() >= 20) {
1893 // The loop is too big, bail out.
1894 return false;
1895 }
1896
1897 // It should have a preheader containing nothing but an unconditional branch.
1898 BasicBlock *PH = CurLoop->getLoopPreheader();
1899 if (!PH || &PH->front() != PH->getTerminator())
1900 return false;
1901 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1902 if (!EntryBI || EntryBI->isConditional())
1903 return false;
1904
1905 // It should have a precondition block where the generated popcount intrinsic
1906 // function can be inserted.
1907 auto *PreCondBB = PH->getSinglePredecessor();
1908 if (!PreCondBB)
1909 return false;
1910 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1911 if (!PreCondBI || PreCondBI->isUnconditional())
1912 return false;
1913
1914 Instruction *CntInst;
1915 PHINode *CntPhi;
1916 Value *Val;
1917 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1918 return false;
1919
1920 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1921 return true;
1922}
1923
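/// Insert a call to the @llvm.ctpop intrinsic; the emitted IR looks roughly
/// like (an illustrative sketch):
/// \code
///   %cnt = call i32 @llvm.ctpop.i32(i32 %val)
/// \endcode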
1924 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1925 const DebugLoc &DL) {
1926 Value *Ops[] = {Val};
1927 Type *Tys[] = {Val->getType()};
1928
1929 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1930 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1931 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1932 CI->setDebugLoc(DL);
1933
1934 return CI;
1935}
1936
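/// Insert a call to the @llvm.ctlz / @llvm.cttz intrinsic; the emitted IR
/// looks roughly like (an illustrative sketch; the i1 flag is ZeroCheck):
/// \code
///   %cnt = call i32 @llvm.ctlz.i32(i32 %val, i1 true)
/// \endcode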
1937 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1938 const DebugLoc &DL, bool ZeroCheck,
1939 Intrinsic::ID IID) {
1940 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1941 Type *Tys[] = {Val->getType()};
1942
1943 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1944 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1945 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1946 CI->setDebugLoc(DL);
1947
1948 return CI;
1949}
1950
1951/// Transform the following loop (Using CTLZ, CTTZ is similar):
1952/// loop:
1953/// CntPhi = PHI [Cnt0, CntInst]
1954/// PhiX = PHI [InitX, DefX]
1955/// CntInst = CntPhi + 1
1956/// DefX = PhiX >> 1
1957/// LOOP_BODY
1958/// Br: loop if (DefX != 0)
1959/// Use(CntPhi) or Use(CntInst)
1960///
1961/// Into:
1962/// If CntPhi used outside the loop:
1963/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1964/// Count = CountPrev + 1
1965/// else
1966/// Count = BitWidth(InitX) - CTLZ(InitX)
1967/// loop:
1968/// CntPhi = PHI [Cnt0, CntInst]
1969/// PhiX = PHI [InitX, DefX]
1970/// PhiCount = PHI [Count, Dec]
1971/// CntInst = CntPhi + 1
1972/// DefX = PhiX >> 1
1973/// Dec = PhiCount - 1
1974/// LOOP_BODY
1975/// Br: loop if (Dec != 0)
1976/// Use(CountPrev + Cnt0) // Use(CntPhi)
1977/// or
1978/// Use(Count + Cnt0) // Use(CntInst)
1979///
1980/// If LOOP_BODY is empty the loop will be deleted.
1981/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1982void LoopIdiomRecognize::transformLoopToCountable(
1983 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1984 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1985 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1986 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1987
1988 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1989 IRBuilder<> Builder(PreheaderBr);
1990 Builder.SetCurrentDebugLocation(DL);
1991
1992 // If there are no uses of CntPhi, create:
1993 // Count = BitWidth - CTLZ(InitX);
1994 // NewCount = Count;
1995 // If there are uses of CntPhi, create:
1996 // NewCount = BitWidth - CTLZ(InitX >> 1);
1997 // Count = NewCount + 1;
1998 Value *InitXNext;
1999 if (IsCntPhiUsedOutsideLoop) {
2000 if (DefX->getOpcode() == Instruction::AShr)
2001 InitXNext = Builder.CreateAShr(InitX, 1);
2002 else if (DefX->getOpcode() == Instruction::LShr)
2003 InitXNext = Builder.CreateLShr(InitX, 1);
2004 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2005 InitXNext = Builder.CreateShl(InitX, 1);
2006 else
2007 llvm_unreachable("Unexpected opcode!");
2008 } else
2009 InitXNext = InitX;
2010 Value *Count =
2011 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2012 Type *CountTy = Count->getType();
2013 Count = Builder.CreateSub(
2014 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2015 Value *NewCount = Count;
2016 if (IsCntPhiUsedOutsideLoop)
2017 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2018
2019 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2020
2021 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2022 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2023 // If the counter was being incremented in the loop, add NewCount to the
2024 // counter's initial value, but only if the initial value is not zero.
2025 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2026 if (!InitConst || !InitConst->isZero())
2027 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2028 } else {
2029 // If the count was being decremented in the loop, subtract NewCount from
2030 // the counter's initial value.
2031 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2032 }
2033
2034 // Step 2: Insert new IV and loop condition:
2035 // loop:
2036 // ...
2037 // PhiCount = PHI [Count, Dec]
2038 // ...
2039 // Dec = PhiCount - 1
2040 // ...
2041 // Br: loop if (Dec != 0)
2042 BasicBlock *Body = *(CurLoop->block_begin());
2043 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2044 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2045
2046 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2047
2048 Builder.SetInsertPoint(LbCond);
2049 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2050 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2051
2052 TcPhi->addIncoming(Count, Preheader);
2053 TcPhi->addIncoming(TcDec, Body);
2054
2055 CmpInst::Predicate Pred =
2056 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2057 LbCond->setPredicate(Pred);
2058 LbCond->setOperand(0, TcDec);
2059 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2060
2061 // Step 3: All the references to the original counter outside
2062 // the loop are replaced with the NewCount
2063 if (IsCntPhiUsedOutsideLoop)
2064 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2065 else
2066 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2067
2068 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2069 // loop. The loop would otherwise not be deleted even if it becomes empty.
2070 SE->forgetLoop(CurLoop);
2071}
2072
2073void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2074 Instruction *CntInst,
2075 PHINode *CntPhi, Value *Var) {
2076 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2077 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2078 const DebugLoc &DL = CntInst->getDebugLoc();
2079
2080 // Assuming before transformation, the loop looks like this:
2081 // if (x) // the precondition
2082 // do { cnt++; x &= x - 1; } while(x);
2083
2084 // Step 1: Insert the ctpop instruction at the end of the precondition block
2085 IRBuilder<> Builder(PreCondBr);
2086 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2087 {
2088 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2089 NewCount = PopCntZext =
2090 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2091
2092 if (NewCount != PopCnt)
2093 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2094
2095 // TripCnt is exactly the number of iterations the loop has
2096 TripCnt = NewCount;
2097
2098 // If the population counter's initial value is not zero, insert Add Inst.
2099 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2100 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2101 if (!InitConst || !InitConst->isZero()) {
2102 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2103 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2104 }
2105 }
2106
2107 // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
2108 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2109 // function would be partially dead code, and downstream passes would drag
2110 // it back from the precondition block to the preheader.
2111 {
2112 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2113
2114 Value *Opnd0 = PopCntZext;
2115 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2116 if (PreCond->getOperand(0) != Var)
2117 std::swap(Opnd0, Opnd1);
2118
2119 ICmpInst *NewPreCond = cast<ICmpInst>(
2120 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2121 PreCondBr->setCondition(NewPreCond);
2122
2122
2123 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2124 }
2125
2126 // Step 3: Note that the population count is exactly the trip count of the
2127 // loop in question, which enables us to convert the loop from a noncountable
2128 // loop into a countable one. The benefit is twofold:
2129 //
2130 // - If the loop only counts population, the entire loop becomes dead after
2131 // the transformation. It is a lot easier to prove a countable loop dead
2132 // than to prove a noncountable one. (In some C dialects, an infinite loop
2133 // isn't dead even if it computes nothing useful. In general, DCE needs
2134 // to prove a noncountable loop finite before safely deleting it.)
2135 //
2136 // - If the loop also performs something else, it remains alive.
2137 // Since it is transformed to countable form, it can be aggressively
2138 // optimized by some optimizations which are in general not applicable
2139 // to a noncountable loop.
2140 //
2141 // After this step, this loop (conceptually) would look like the following:
2142 // newcnt = __builtin_ctpop(x);
2143 // t = newcnt;
2144 // if (x)
2145 // do { cnt++; x &= x-1; t--; } while (t > 0);
2146 BasicBlock *Body = *(CurLoop->block_begin());
2147 {
2148 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2149 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2150 Type *Ty = TripCnt->getType();
2151
2152 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2153
2154 Builder.SetInsertPoint(LbCond);
2155 Instruction *TcDec = cast<Instruction>(
2156 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2157 "tcdec", false, true));
2158
2159 TcPhi->addIncoming(TripCnt, PreHead);
2160 TcPhi->addIncoming(TcDec, Body);
2161
2162 CmpInst::Predicate Pred =
2163 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2164 LbCond->setPredicate(Pred);
2165 LbCond->setOperand(0, TcDec);
2166 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2167 }
2168
2169 // Step 4: All the references to the original population counter outside
2170 // the loop are replaced with the NewCount -- the value returned from
2171 // __builtin_ctpop().
2172 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2173
2174 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2175 // loop. The loop would otherwise not be deleted even if it becomes empty.
2176 SE->forgetLoop(CurLoop);
2177}
2178
2179/// Match loop-invariant value.
2180template <typename SubPattern_t> struct match_LoopInvariant {
2181 SubPattern_t SubPattern;
2182 const Loop *L;
2183
2184 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2185 : SubPattern(SP), L(L) {}
2186
2187 template <typename ITy> bool match(ITy *V) {
2188 return L->isLoopInvariant(V) && SubPattern.match(V);
2189 }
2190};
2191
2192/// Matches if the value is loop-invariant.
2193template <typename Ty>
2194inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2195 return match_LoopInvariant<Ty>(M, L);
2196}
2197
2198/// Return true if the idiom is detected in the loop.
2199///
2200/// The core idiom we are trying to detect is:
2201/// \code
2202/// entry:
2203/// <...>
2204/// %bitmask = shl i32 1, %bitpos
2205/// br label %loop
2206///
2207/// loop:
2208/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2209/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2210/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2211/// %x.next = shl i32 %x.curr, 1
2212/// <...>
2213/// br i1 %x.curr.isbitunset, label %loop, label %end
2214///
2215/// end:
2216/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2217/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2218/// <...>
2219/// \endcode
2220static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2221 Value *&BitMask, Value *&BitPos,
2222 Value *&CurrX, Instruction *&NextX) {
2224 " Performing shift-until-bittest idiom detection.\n");
2225
2226 // Give up if the loop has multiple blocks or multiple backedges.
2227 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2228 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2229 return false;
2230 }
2231
2232 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2233 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2234 assert(LoopPreheaderBB && "There is always a loop preheader.");
2235
2236 using namespace PatternMatch;
2237
2238 // Step 1: Check if the loop backedge is in desirable form.
2239
2240 ICmpInst::Predicate Pred;
2241 Value *CmpLHS, *CmpRHS;
2242 BasicBlock *TrueBB, *FalseBB;
2243 if (!match(LoopHeaderBB->getTerminator(),
2244 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2245 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2246 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2247 return false;
2248 }
2249
2250 // Step 2: Check if the backedge's condition is in desirable form.
2251
2252 auto MatchVariableBitMask = [&]() {
2253 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2254 match(CmpLHS,
2255 m_c_And(m_Value(CurrX),
2256 m_CombineAnd(
2257 m_Value(BitMask),
2258 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2259 CurLoop))));
2260 };
2261 auto MatchConstantBitMask = [&]() {
2262 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2263 match(CmpLHS, m_And(m_Value(CurrX),
2264 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2265 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2266 };
2267 auto MatchDecomposableConstantBitMask = [&]() {
2268 APInt Mask;
2269 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2270 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2271 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2272 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2273 };
2274
2275 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2276 !MatchDecomposableConstantBitMask()) {
2277 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2278 return false;
2279 }
2280
2281 // Step 3: Check if the recurrence is in desirable form.
2282 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2283 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2284 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2285 return false;
2286 }
2287
2288 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2289 NextX =
2290 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2291
2292 assert(CurLoop->isLoopInvariant(BaseX) &&
2293 "Expected BaseX to be avaliable in the preheader!");
2294
2295 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2296 // FIXME: support right-shift?
2297 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2298 return false;
2299 }
2300
2301 // Step 4: Check if the backedge's destinations are in desirable form.
2302
2303 assert(ICmpInst::isEquality(Pred) &&
2304 "Should only get equality predicates here.");
2305
2306 // cmp-br is commutative, so canonicalize to a single variant.
2307 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2308 Pred = ICmpInst::getInversePredicate(Pred);
2309 std::swap(TrueBB, FalseBB);
2310 }
2311
2312 // We expect to exit the loop when the comparison yields false,
2313 // so when it yields true we should branch back to the loop header.
2314 if (TrueBB != LoopHeaderBB) {
2315 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2316 return false;
2317 }
2318
2319 // Okay, idiom checks out.
2320 return true;
2321}
2322
2323/// Look for the following loop:
2324/// \code
2325/// entry:
2326/// <...>
2327/// %bitmask = shl i32 1, %bitpos
2328/// br label %loop
2329///
2330/// loop:
2331/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2332/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2333/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2334/// %x.next = shl i32 %x.curr, 1
2335/// <...>
2336/// br i1 %x.curr.isbitunset, label %loop, label %end
2337///
2338/// end:
2339/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2340/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2341/// <...>
2342/// \endcode
2343///
2344/// And transform it into:
2345/// \code
2346/// entry:
2347/// %bitmask = shl i32 1, %bitpos
2348/// %lowbitmask = add i32 %bitmask, -1
2349/// %mask = or i32 %lowbitmask, %bitmask
2350/// %x.masked = and i32 %x, %mask
2351/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2352/// i1 true)
2353/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2354/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2355/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2356/// %tripcount = add i32 %backedgetakencount, 1
2357/// %x.curr = shl i32 %x, %backedgetakencount
2358/// %x.next = shl i32 %x, %tripcount
2359/// br label %loop
2360///
2361/// loop:
2362/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2363/// %loop.iv.next = add nuw i32 %loop.iv, 1
2364/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2365/// <...>
2366/// br i1 %loop.ivcheck, label %end, label %loop
2367///
2368/// end:
2369/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2370/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2371/// <...>
2372/// \endcode
2373bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2374 bool MadeChange = false;
2375
2376 Value *X, *BitMask, *BitPos, *XCurr;
2377 Instruction *XNext;
2378 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2379 XNext)) {
2381 " shift-until-bittest idiom detection failed.\n");
2382 return MadeChange;
2383 }
2384 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2385
2386 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2387 // but is it profitable to transform?
2388
2389 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2390 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2391 assert(LoopPreheaderBB && "There is always a loop preheader.");
2392
2393 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2394 assert(SuccessorBB && "There is only a single successor.");
2395
2396 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2397 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2398
2399 Intrinsic::ID IntrID = Intrinsic::ctlz;
2400 Type *Ty = X->getType();
2401 unsigned Bitwidth = Ty->getScalarSizeInBits();
2402
2403 TargetTransformInfo::TargetCostKind CostKind =
2404 TargetTransformInfo::TCK_SizeAndLatency;
2405
2406 // The rewrite is considered to be unprofitable if and only if the
2407 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2408 // making the loop countable, even if nothing else changes.
2409 IntrinsicCostAttributes Attrs(
2410 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2411 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2412 if (Cost > TargetTransformInfo::TCC_Basic) {
2414 " Intrinsic is too costly, not beneficial\n");
2415 return MadeChange;
2416 }
2417 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2418 TargetTransformInfo::TCC_Basic) {
2419 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2420 return MadeChange;
2421 }
2422
2423 // Ok, transform appears worthwhile.
2424 MadeChange = true;
2425
2426 // Step 1: Compute the loop trip count.
2427
2428 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2429 BitPos->getName() + ".lowbitmask");
2430 Value *Mask =
2431 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2432 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2433 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2434 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2435 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2436 Value *XMaskedNumActiveBits = Builder.CreateSub(
2437 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2438 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2439 /*HasNSW=*/Bitwidth != 2);
2440 Value *XMaskedLeadingOnePos =
2441 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2442 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2443 /*HasNSW=*/Bitwidth > 2);
2444
2445 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2446 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2447 /*HasNUW=*/true, /*HasNSW=*/true);
2448 // We know the loop's backedge-taken count, but what's the loop's trip count?
2449 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2450 Value *LoopTripCount =
2451 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2452 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2453 /*HasNSW=*/Bitwidth != 2);
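// Worked example (illustrative): for i32 %x = 1 and %bitpos = 5, mask is
// 0b111111, so x.masked = 1, numactivebits = 1 and leadingonepos = 0; thus
// backedgetakencount = 5 and tripcount = 6, matching the six iterations
// (x.curr = 1, 2, 4, 8, 16, 32) of the original loop.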
2454
2455 // Step 2: Compute the recurrence's final value without a loop.
2456
2457 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2458 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2459 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2460 NewX->takeName(XCurr);
2461 if (auto *I = dyn_cast<Instruction>(NewX))
2462 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2463
2464 Value *NewXNext;
2465 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2466 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2467 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2468 // that isn't the case, we'll need to emit an alternative, safe IR.
2469 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2470 PatternMatch::match(
2471 BitPos, PatternMatch::m_SpecificInt_ICMP(
2472 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2473 Ty->getScalarSizeInBits() - 1))))
2474 NewXNext = Builder.CreateShl(X, LoopTripCount);
2475 else {
2476 // Otherwise, just additionally shift by one. It's the smallest solution,
2477 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2478 // bitwidth(x) - 1) and select 0 instead.
2479 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2480 }
2481
2482 NewXNext->takeName(XNext);
2483 if (auto *I = dyn_cast<Instruction>(NewXNext))
2484 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2485
2486 // Step 3: Adjust the successor basic block to receive the computed
2487 // recurrence's final value instead of the recurrence itself.
2488
2489 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2490 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2491
2492 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2493
2494 // The new canonical induction variable.
2495 Builder.SetInsertPoint(&LoopHeaderBB->front());
2496 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2497
2498 // The induction itself.
2499 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2500 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2501 auto *IVNext =
2502 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2503 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2504
2505 // The loop trip count check.
2506 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2507 CurLoop->getName() + ".ivcheck");
2508 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2509 LoopHeaderBB->getTerminator()->eraseFromParent();
2510
2511 // Populate the IV PHI.
2512 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2513 IV->addIncoming(IVNext, LoopHeaderBB);
2514
2515 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2516 // loop. The loop would otherwise not be deleted even if it becomes empty.
2517
2518 SE->forgetLoop(CurLoop);
2519
2520 // Other passes will take care of actually deleting the loop if possible.
2521
2522 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2523
2524 ++NumShiftUntilBitTest;
2525 return MadeChange;
2526}
2527
2528/// Return true if the idiom is detected in the loop.
2529///
2530/// The core idiom we are trying to detect is:
2531/// \code
2532/// entry:
2533/// <...>
2534/// %start = <...>
2535/// %extraoffset = <...>
2536/// <...>
2537/// br label %for.cond
2538///
2539/// loop:
2540/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2541/// %nbits = add nsw i8 %iv, %extraoffset
2542/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2543/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2544/// %iv.next = add i8 %iv, 1
2545/// <...>
2546/// br i1 %val.shifted.iszero, label %end, label %loop
2547///
2548/// end:
2549/// %iv.res = phi i8 [ %iv, %loop ] <...>
2550/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2551/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2552/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2553/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2554/// <...>
2555/// \endcode
2556 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2557 Instruction *&ValShiftedIsZero,
2558 Intrinsic::ID &IntrinID, Instruction *&IV,
2559 Value *&Start, Value *&Val,
2560 const SCEV *&ExtraOffsetExpr,
2561 bool &InvertedCond) {
2563 " Performing shift-until-zero idiom detection.\n");
2564
2565 // Give up if the loop has multiple blocks or multiple backedges.
2566 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2567 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2568 return false;
2569 }
2570
2571 Instruction *ValShifted, *NBits, *IVNext;
2572 Value *ExtraOffset;
2573
2574 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2575 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2576 assert(LoopPreheaderBB && "There is always a loop preheader.");
2577
2578 using namespace PatternMatch;
2579
2580 // Step 1: Check if the loop backedge and its condition are in desirable form.
2581
2582 ICmpInst::Predicate Pred;
2583 BasicBlock *TrueBB, *FalseBB;
2584 if (!match(LoopHeaderBB->getTerminator(),
2585 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2586 m_BasicBlock(FalseBB))) ||
2587 !match(ValShiftedIsZero,
2588 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2589 !ICmpInst::isEquality(Pred)) {
2590 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2591 return false;
2592 }
2593
2594 // Step 2: Check if the comparison's operand is in desirable form.
2595 // FIXME: Val could be a one-input PHI node, which we should look past.
2596 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2597 m_Instruction(NBits)))) {
2598 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
2599 return false;
2600 }
2601 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2602 : Intrinsic::ctlz;
2603
2604 // Step 3: Check if the shift amount is in desirable form.
2605
2606 if (match(NBits, m_c_Add(m_Instruction(IV),
2607 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2608 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2609 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2610 else if (match(NBits,
2611 m_Sub(m_Instruction(IV),
2612 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2613 NBits->hasNoSignedWrap())
2614 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2615 else {
2616 IV = NBits;
2617 ExtraOffsetExpr = SE->getZero(NBits->getType());
2618 }
2619
2620 // Step 4: Check if the recurrence is in desirable form.
2621 auto *IVPN = dyn_cast<PHINode>(IV);
2622 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2623 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2624 return false;
2625 }
2626
2627 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2628 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2629
2630 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2631 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2632 return false;
2633 }
2634
2635 // Step 5: Check if the backedge's destinations are in desirable form.
2636
2637 assert(ICmpInst::isEquality(Pred) &&
2638 "Should only get equality predicates here.");
2639
2640 // cmp-br is commutative, so canonicalize to a single variant.
2641 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2642 if (InvertedCond) {
2643 Pred = ICmpInst::getInversePredicate(Pred);
2644 std::swap(TrueBB, FalseBB);
2645 }
2646
2647 // We expect to exit the loop when the comparison yields true,
2648 // so when it yields false we should branch back to the loop header.
2649 if (FalseBB != LoopHeaderBB) {
2650 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2651 return false;
2652 }
2653
2654 // The new, countable, loop will certainly only run a known number of
2655 // iterations; it won't be infinite. But the old loop might be infinite
2656 // under certain conditions. For logical shifts, the value will become zero
2657 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2658 // right-shift, iff the sign bit was set, the value will never become zero,
2659 // and the loop may never finish.
2660 if (ValShifted->getOpcode() == Instruction::AShr &&
2661 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2662 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2663 return false;
2664 }
2665
2666 // Okay, idiom checks out.
2667 return true;
2668}
2669
2670/// Look for the following loop:
2671/// \code
2672/// entry:
2673/// <...>
2674/// %start = <...>
2675/// %extraoffset = <...>
2676/// <...>
2677/// br label %for.cond
2678///
2679/// loop:
2680/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2681/// %nbits = add nsw i8 %iv, %extraoffset
2682/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2683/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2684/// %iv.next = add i8 %iv, 1
2685/// <...>
2686/// br i1 %val.shifted.iszero, label %end, label %loop
2687///
2688/// end:
2689/// %iv.res = phi i8 [ %iv, %loop ] <...>
2690/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2691/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2692/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2693/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2694/// <...>
2695/// \endcode
2696///
2697/// And transform it into:
2698/// \code
2699/// entry:
2700/// <...>
2701/// %start = <...>
2702/// %extraoffset = <...>
2703/// <...>
2704/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2705/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2706/// %extraoffset.neg = sub i8 0, %extraoffset
2707/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2708/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2709/// %loop.tripcount = sub i8 %iv.final, %start
2710/// br label %loop
2711///
2712/// loop:
2713/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2714/// %loop.iv.next = add i8 %loop.iv, 1
2715/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2716/// %iv = add i8 %loop.iv, %start
2717/// <...>
2718/// br i1 %loop.ivcheck, label %end, label %loop
2719///
2720/// end:
2721/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2722/// <...>
2723/// \endcode
2724bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2725 bool MadeChange = false;
2726
2727 Instruction *ValShiftedIsZero;
2728 Intrinsic::ID IntrID;
2729 Instruction *IV;
2730 Value *Start, *Val;
2731 const SCEV *ExtraOffsetExpr;
2732 bool InvertedCond;
2733 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2734 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2736 " shift-until-zero idiom detection failed.\n");
2737 return MadeChange;
2738 }
2739 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2740
2741 // Ok, it is the idiom we were looking for; we *could* transform this loop,
2742 // but is it profitable to transform?
2743
2744 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2745 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2746 assert(LoopPreheaderBB && "There is always a loop preheader.");
2747
2748 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2749 assert(SuccessorBB && "There is only a single successor.");
2750
2751 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2752 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2753
2754 Type *Ty = Val->getType();
2755 unsigned Bitwidth = Ty->getScalarSizeInBits();
2756
2757 TargetTransformInfo::TargetCostKind CostKind =
2758 TargetTransformInfo::TCK_SizeAndLatency;
2759
2760 // The rewrite is considered to be unprofitable if and only if the
2761 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2762 // making the loop countable, even if nothing else changes.
2763 IntrinsicCostAttributes Attrs(
2764 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2765 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2766 if (Cost > TargetTransformInfo::TCC_Basic) {
2768 " Intrinsic is too costly, not beneficial\n");
2769 return MadeChange;
2770 }
2771
2772 // Ok, transform appears worthwhile.
2773 MadeChange = true;
2774
2775 bool OffsetIsZero = false;
2776 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2777 OffsetIsZero = ExtraOffsetExprC->isZero();
2778
2779 // Step 1: Compute the loop's final IV value / trip count.
2780
2781 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2782 IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2783 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2784 Value *ValNumActiveBits = Builder.CreateSub(
2785 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2786 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2787 /*HasNSW=*/Bitwidth != 2);
2788
2789 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2790 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2791 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2792
2793 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2794 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2795 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2796 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2797 {ValNumActiveBitsOffset, Start},
2798 /*FMFSource=*/nullptr, "iv.final");
2799
2800 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2801 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2802 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2803 // FIXME: or when the offset was `add nuw`
2804
2805 // We know the loop's backedge-taken count, but what's the loop's trip count?
2806 Value *LoopTripCount =
2807 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2808 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2809 /*HasNSW=*/Bitwidth != 2);
2810
2811 // Step 2: Adjust the successor basic block to receive the original
2812 // induction variable's final value instead of the orig. IV itself.
2813
2814 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2815
2816 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2817
2818 // The new canonical induction variable.
2819 Builder.SetInsertPoint(&LoopHeaderBB->front());
2820 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2821
2822 // The induction itself.
2823 Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2824 auto *CIVNext =
2825 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2826 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2827
2828 // The loop trip count check.
2829 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2830 CurLoop->getName() + ".ivcheck");
2831 auto *NewIVCheck = CIVCheck;
2832 if (InvertedCond) {
2833 NewIVCheck = Builder.CreateNot(CIVCheck);
2834 NewIVCheck->takeName(ValShiftedIsZero);
2835 }
2836
2837 // The original IV, but rebased to be an offset to the CIV.
2838 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2839 /*HasNSW=*/true); // FIXME: what about NUW?
2840 IVDePHId->takeName(IV);
2841
2842 // The loop terminator.
2843 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2844 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2845 LoopHeaderBB->getTerminator()->eraseFromParent();
2846
2847 // Populate the IV PHI.
2848 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2849 CIV->addIncoming(CIVNext, LoopHeaderBB);
2850
2851 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2852 // loop. The loop would otherwise not be deleted even if it becomes empty.
2853
2854 SE->forgetLoop(CurLoop);
2855
2856 // Step 5: Try to cleanup the loop's body somewhat.
2857 IV->replaceAllUsesWith(IVDePHId);
2858 IV->eraseFromParent();
2859
2860 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2861 ValShiftedIsZero->eraseFromParent();
2862
2863 // Other passes will take care of actually deleting the loop if possible.
2864
2865 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2866
2867 ++NumShiftUntilZero;
2868 return MadeChange;
2869}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
assume Assume Builder
static const Function * getParent(const Value *V)
SmallVector< MachineOperand, 4 > Cond
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static const SCEV * getTripCount(const SCEV *BECount, Type *IntPtr, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute trip count from the backedge taken count.
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
#define DEBUG_TYPE
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
This header defines various interfaces for pass management in LLVM.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition: blake3_impl.h:77
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:75
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1516
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:658
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator end()
Definition: BasicBlock.h:325
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:323
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:217
const Instruction & front() const
Definition: BasicBlock.h:335
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:293
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
size_t size() const
Definition: BasicBlock.h:333
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:127
BinaryOps getOpcode() const
Definition: InstrTypes.h:391
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:811
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:748
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:741
@ ICMP_EQ
equal
Definition: InstrTypes.h:739
@ ICMP_NE
not equal
Definition: InstrTypes.h:740
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, OGT -> OLE, OLT -> UGE, etc.
Definition: InstrTypes.h:832
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:808
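As a hedged sketch of how the predicate accessors combine, assuming Cmp is an ICmpInst* whose users expect an equality test:

  if (Cmp->getPredicate() == ICmpInst::ICMP_NE) {
    // Flip NE to EQ; a caller doing this must also swap the branch
    // successors that consume the comparison.
    Cmp->setPredicate(Cmp->getInversePredicate());
  }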
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1242
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2220
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fixed width vector obtained from logBase2 of C.
Definition: Constants.cpp:2664
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:203
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:197
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
Definition: Constants.h:145
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:847
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:403
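A brief sketch of creating and inspecting integer constants with the interfaces above; Int8Ty is an assumed i8 Type*:

  Constant *Byte = ConstantInt::get(Int8Ty, 0xAA); // splats for vector types
  if (auto *CI = dyn_cast<ConstantInt>(Byte)) {
    uint64_t V = CI->getZExtValue(); // 0xAA
    bool AllOnes = CI->isMinusOne(); // false for 0xAA
  }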
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:165
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:130
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:227
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:447
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2307
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition: IRBuilder.h:2564
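A sketch of emitting the kind of llvm.memset call this pass forms, assuming Builder is positioned in the loop preheader and Dest, ByteVal and Len have already been computed:

  Module *M = Builder.GetInsertBlock()->getModule();
  Function *MemsetFn = Intrinsic::getDeclaration(
      M, Intrinsic::memset, {Dest->getType(), Len->getType()});
  CallInst *NewCall = Builder.CreateCall(
      MemsetFn, {Dest, ByteVal, Len, Builder.getInt1(false)}); // not volatile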
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:358
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
Definition: Instruction.cpp:70
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
const BasicBlock * getParent() const
Definition: Instruction.h:90
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:74
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1499
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:168
bool isShift() const
Definition: Instruction.h:175
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:82
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:355
This class provides an interface for updating the loop pass manager based on mutations to the loop nest.
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:214
bool isUnordered() const
Definition: Instructions.h:258
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:220
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:139
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:185
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:267
BlockT * getHeader() const
Definition: LoopInfo.h:105
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:107
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:142
block_iterator block_begin() const
Definition: LoopInfo.h:193
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Definition: LoopInfo.h:992
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
StringRef getName() const
Definition: LoopInfo.h:891
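A hedged sketch of the structural pre-checks a loop-idiom style pass performs through the Loop interface above; the block-count cap is purely illustrative, not the pass's actual rule:

  static bool hasRecognizableShape(const Loop *L) {
    // Need a preheader to host the emitted call, and a canonical
    // single-backedge loop.
    if (!L->getLoopPreheader() || L->getNumBackEdges() != 1)
      return false;
    return L->getNumBlocks() <= 8; // illustrative cap
  }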
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that feed it, giving the original input.
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it, giving the original input.
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:700
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have (use 0 if you really have no idea).
Value * getIncomingValueForBlock(const BasicBlock *BB) const
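For illustration, a minimal recurrence PHI built with the interface above; Header, Preheader, Latch, Start and Next are assumed to exist:

  PHINode *IV = PHINode::Create(Start->getType(), /*NumReservedValues=*/2,
                                "iv", &Header->front());
  IV->addIncoming(Start, Preheader);
  IV->addIncoming(Next, Latch);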
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1750
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect ScalarEvolution's ability to compute a trip count.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
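A sketch that mirrors the shape of the pass's trip-count math without being a verbatim excerpt; SE, L, IntPtrTy and StoreSizeSCEV are assumed to be in scope:

  const SCEV *BECount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BECount))
    return false; // loop is not countable
  // Bytes written = (BECount + 1) * StoreSize, computed in pointer width.
  const SCEV *TripCount = SE.getAddExpr(
      SE.getTruncateOrZeroExtend(BECount, IntPtrTy), SE.getOne(IntPtrTy));
  const SCEV *NumBytes = SE.getMulExpr(TripCount, StoreSizeSCEV);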
A vector that has set insertion semantics.
Definition: SetVector.h:51
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:219
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:152
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its queries in order to avoid complicated analysis.
Definition: MustExecute.h:110
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body and header for instructions that may throw an exception.
Definition: MustExecute.cpp:51
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw or otherwise exit abnormally.
Definition: MustExecute.cpp:47
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instantiations.
Definition: SmallPtrSet.h:344
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
iterator end() const
Definition: SmallPtrSet.h:408
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
iterator begin() const
Definition: SmallPtrSet.h:403
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the 'N' template parameter.
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
Align getAlign() const
Definition: Instructions.h:345
Value * getValueOperand()
Definition: Instructions.h:390
Value * getPointerOperand()
Definition: Instructions.h:393
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Basic
The cost of a typical 'add' instruction.
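A sketch of the gating query a popcount-idiom recognizer would make through this interface; the 32-bit width is illustrative:

  if (TTI.getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false; // don't form ctpop the target will lower poorly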
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:350
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1731
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V" instead of "this" when the use is outside the block.
Definition: Value.cpp:586
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:996
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
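A sketch of the hand-off these Value methods enable once a replacement has been materialized; NewCall and OldResult are assumed:

  NewCall->takeName(OldResult); // preserve the original name
  OldResult->replaceAllUsesWith(NewCall);
  // The now-dead loop computation can then be cleaned up, e.g. via
  // RecursivelyDeleteTriviallyDeadInstructions.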
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition: BTF.h:60
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:119
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1506
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:979
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:716
specificval_ty m_Specific(const Value *V)
Match only if we have the specific, given value.
Definition: PatternMatch.h:772
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:224
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:168
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:537
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:991
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:598
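As a hedged composite example, the matchers above can recognize the branch-on-bit-test shape the 'shift until bit test' recognizer looks for; Header is an assumed loop header BasicBlock*:

  using namespace llvm::PatternMatch;
  ICmpInst::Predicate Pred;
  Value *X, *Mask;
  BasicBlock *TrueBB, *FalseBB;
  bool Matched =
      match(Header->getTerminator(),
            m_Br(m_ICmp(Pred, m_c_And(m_Value(X), m_Value(Mask)), m_Zero()),
                 TrueBB, FalseBB)) &&
      ICmpInst::isEquality(Pred);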
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:537
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value, returning the original object being addressed.
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on the target, and that in the current Module it is a Function with the right type.
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1118
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
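Sketch of how these helpers combine when emitting a library call such as memset; FTy is an assumed, already-built FunctionType*, and M/TLI are the current Module and TargetLibraryInfo:

  if (!isLibFuncEmittable(M, &TLI, LibFunc_memset))
    return false; // target has no usable memset
  FunctionCallee Fn =
      getOrInsertLibFunc(M, TLI, LibFunc_memset, FTy, AttributeList());
  inferNonMandatoryLibFuncAttrs(M, "memset", TLI);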
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:89
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it is represented with.
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
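For illustration, the two value-tracking queries above in the roles they play here; SI, DL and Cmp are assumed to be a candidate StoreInst*, the DataLayout, and an exit-test ICmpInst*:

  // Every iteration stores the same byte => a memset is possible.
  if (Value *ByteVal = isBytewiseValue(SI->getValueOperand(), DL)) {
    // use ByteVal as the memset fill value
  }
  CmpInst::Predicate Pred;
  Value *X;
  APInt MaskBits;
  if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
                           Pred, X, MaskBits)) {
    // Pred is now EQ or NE on (X & MaskBits)
  }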
constexpr std::nullopt_t None
Definition: None.h:28
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition: Metadata.h:651
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:671
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:674
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:668
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes that apply to both.
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:677
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes of length Len (a length of -1 denotes an unknown size).
Definition: Metadata.h:718
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to the loop passes "below" it in the pipeline.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)