//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
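//
// For example (a sketch; the exact rewrite depends on which library calls the
// target provides), a loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = 0;
//
// becomes a single call in the loop preheader:
//
//   memset(a, 0, n * sizeof(a[0]));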
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bit test' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};
} // end anonymous namespace

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
// Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it; just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(CurLoop))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe
/// to turn into a memset_pattern16, return a ConstantArray of 16 bytes that
/// should be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
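///
/// For example (illustrative): a store of the i32 constant 0x01020304 is not
/// byte-splattable, so it cannot become a plain memset, but this routine
/// returns a ConstantArray of four copies of that i32 -- a 16-byte pattern
/// suitable for memset_pattern16.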
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If C is larger than 16 bytes, we could try slicing it in half to
  // see if the top and bottom are the same (e.g. for vectors and large
  // integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic stores are not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then
    // we know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the
    // current loop, which indicates a strided load. If we have something
    // else, it's a random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if these stores can be promoted to memsets.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance of finding a memset
    // opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over
      // from the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRecs like {base,+,1}
  // on the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass does not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset lengths and
    // strides that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
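///
/// For example (illustrative): with a constant backedge-taken count of 99 and
/// a 4-byte store size, the access size below is refined to exactly
/// (99 + 1) * 4 == 400 bytes starting at the pointer.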
937static bool
939 const SCEV *BECount, const SCEV *StoreSizeSCEV,
940 AliasAnalysis &AA,
941 SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
942 // Get the location that may be stored across the loop. Since the access is
943 // strided positively through memory, we say that the modified location starts
944 // at the pointer and has infinite size.
946
947 // If the loop iterates a fixed number of times, we can refine the access size
948 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
949 const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
950 const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
951 if (BECst && ConstSize)
952 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
953 ConstSize->getValue()->getZExtValue());
954
955 // TODO: For this to be really effective, we have to dive into the pointer
956 // operand in the store. Store to &A[i] of 100 will always return may alias
957 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
958 // which will then no-alias a store to &A[100].
959 MemoryLocation StoreLoc(Ptr, AccessSize);
960
961 for (BasicBlock *B : L->blocks())
962 for (Instruction &I : *B)
963 if (!IgnoredInsts.contains(&I) &&
964 isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
965 return true;
966 return false;
967}

// If we have a negative stride, Start refers to the end of the memory
// location we're trying to memset. Therefore, we need to recompute the base
// pointer, which is just Start - BECount*Size.
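//
// For example (illustrative): for "for (i = n-1; i >= 0; --i) p[i] = 0" with
// 4-byte elements, Start is &p[n-1] and BECount is n-1, so the recomputed
// base pointer is &p[n-1] - (n-1)*4 bytes, i.e. &p[0].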
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index
  return SE->getMinusSCEV(Start, Index);
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
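///
/// For example (illustrative): with BECount == n-1 and a store size of 4, the
/// result is the trip count times the store size, (n-1 + 1) * 4, i.e. 4*n
/// bytes.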
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do
/// so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header. This allows us to insert code for them in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
    return Changed;

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else {
    assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
    // Everything is emitted in default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to
    // be a constant array of 16 bytes. Plop the value into a mergeable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});

    // Set the TBAA info if present.
    if (AATags.TBAA)
      NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA);

    if (AATags.Scope)
      NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope);

    if (AATags.NoAlias)
      NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias);
  }

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same
/// stride, this may be transformable into a memcpy. This kicks in for stuff
/// like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}

namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace

bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header. This allows us to insert code for them in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant sizes; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise"
  // with the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe; we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store is atomic, then it must necessarily be unordered
  //  by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size,
    // then we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed. Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
}

/// Check if the given conditional branch is based on a comparison between
/// a variable and zero, and if the variable is non-zero (or zero when
/// JmpOnZero is true) the control yields to the loop entry. If the branch
/// matches the behavior, the variable involved in the comparison is returned.
/// This function will be called to see if the precondition and postcondition
/// of the loop are in desirable form.
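///
/// For example (illustrative), with JmpOnZero == false the branch
///   %cond = icmp ne i32 %x, 0
///   br i1 %cond, label %loop.entry, label %exit
/// matches and %x is returned; for an icmp eq the loop entry must instead be
/// the false successor.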
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}
1523
1524// Check if the recurrence variable `VarX` is in the right form to create
1525// the idiom. Returns the value coerced to a PHINode if so.
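// For illustration (an IR sketch; the names are illustrative): given the
// loop-header PHI
//   %x1 = phi i32 [ %x0, %preheader ], [ %x2, %loop ]
// getRecurrenceVar(%x1, %x2 as DefX, %loop) returns the PHINode %x1.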
1526static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1527 BasicBlock *LoopEntry) {
1528 auto *PhiX = dyn_cast<PHINode>(VarX);
1529 if (PhiX && PhiX->getParent() == LoopEntry &&
1530 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1531 return PhiX;
1532 return nullptr;
1533}
1534
1535/// Return true iff the idiom is detected in the loop.
1536///
1537/// Additionally:
1538/// 1) \p CntInst is set to the instruction counting the population bit.
1539/// 2) \p CntPhi is set to the corresponding phi node.
1540/// 3) \p Var is set to the value whose population bits are being counted.
1541///
1542/// The core idiom we are trying to detect is:
1543/// \code
1544/// if (x0 == 0)
1545/// goto loop-exit // the precondition of the loop
1546/// cnt0 = init-val;
1547/// do {
1548/// x1 = phi (x0, x2);
1549/// cnt1 = phi(cnt0, cnt2);
1550///
1551/// cnt2 = cnt1 + 1;
1552/// ...
1553/// x2 = x1 & (x1 - 1);
1554/// ...
1555/// } while (x2 != 0);
1556///
1557/// loop-exit:
1558/// \endcode
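/// For illustration, a C-level loop of this shape (a sketch; the names are
/// illustrative) would be:
/// \code
///   int cnt = init_val;
///   if (x) {
///     do {
///       cnt++;
///       x &= x - 1; // clear the lowest set bit
///     } while (x);
///   }
/// \endcode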
1559static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1560 Instruction *&CntInst, PHINode *&CntPhi,
1561 Value *&Var) {
1562 // step 1: Check to see if the loop-back branch matches this pattern:
1563 // "if (a != 0) goto loop-entry".
1564 BasicBlock *LoopEntry;
1565 Instruction *DefX2, *CountInst;
1566 Value *VarX1, *VarX0;
1567 PHINode *PhiX, *CountPhi;
1568
1569 DefX2 = CountInst = nullptr;
1570 VarX1 = VarX0 = nullptr;
1571 PhiX = CountPhi = nullptr;
1572 LoopEntry = *(CurLoop->block_begin());
1573
1574 // step 1: Check if the loop-back branch is in desirable form.
1575 {
1576 if (Value *T = matchCondition(
1577 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1578 DefX2 = dyn_cast<Instruction>(T);
1579 else
1580 return false;
1581 }
1582
1583 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1584 {
1585 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1586 return false;
1587
1588 BinaryOperator *SubOneOp;
1589
1590 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1591 VarX1 = DefX2->getOperand(1);
1592 else {
1593 VarX1 = DefX2->getOperand(0);
1594 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1595 }
1596 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1597 return false;
1598
1599 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1600 if (!Dec ||
1601 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1602 (SubOneOp->getOpcode() == Instruction::Add &&
1603 Dec->isMinusOne()))) {
1604 return false;
1605 }
1606 }
1607
1608 // step 3: Check the recurrence of variable X
1609 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1610 if (!PhiX)
1611 return false;
1612
1613 // step 4: Find the instruction that counts the population: cnt2 = cnt1 + 1
1614 {
1615 CountInst = nullptr;
1616 for (Instruction &Inst : llvm::make_range(
1617 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1618 if (Inst.getOpcode() != Instruction::Add)
1619 continue;
1620
1621 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1622 if (!Inc || !Inc->isOne())
1623 continue;
1624
1625 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1626 if (!Phi)
1627 continue;
1628
1629 // Check if the result of the instruction is live out of the loop.
1630 bool LiveOutLoop = false;
1631 for (User *U : Inst.users()) {
1632 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1633 LiveOutLoop = true;
1634 break;
1635 }
1636 }
1637
1638 if (LiveOutLoop) {
1639 CountInst = &Inst;
1640 CountPhi = Phi;
1641 break;
1642 }
1643 }
1644
1645 if (!CountInst)
1646 return false;
1647 }
1648
1649 // step 5: check if the precondition is in this form:
1650 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1651 {
1652 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1653 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1654 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1655 return false;
1656
1657 CntInst = CountInst;
1658 CntPhi = CountPhi;
1659 Var = T;
1660 }
1661
1662 return true;
1663}
1664
1665/// Return true if the idiom is detected in the loop.
1666///
1667/// Additionally:
1668/// 1) \p CntInst is set to the instruction counting leading zeros (CTLZ),
1669/// or nullptr if there is no such instruction.
1670/// 2) \p CntPhi is set to the corresponding phi node,
1671/// or nullptr if there is no such node.
1672/// 3) \p Var is set to the value whose CTLZ could be used.
1673/// 4) \p DefX is set to the instruction calculating the loop exit condition.
1674///
1675/// The core idiom we are trying to detect is:
1676/// \code
1677/// if (x0 == 0)
1678/// goto loop-exit // the precondition of the loop
1679/// cnt0 = init-val;
1680/// do {
1681/// x = phi (x0, x.next); //PhiX
1682/// cnt = phi(cnt0, cnt.next);
1683///
1684/// cnt.next = cnt + 1;
1685/// ...
1686/// x.next = x >> 1; // DefX
1687/// ...
1688/// } while(x.next != 0);
1689///
1690/// loop-exit:
1691/// \endcode
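/// For illustration, a C-level loop of this shape (a sketch; the names are
/// illustrative) for the right-shift case would be:
/// \code
///   int cnt = init_val;
///   if (x != 0) {
///     do {
///       cnt++;
///       x >>= 1;
///     } while (x != 0); // runs bitwidth(x0) - clz(x0) times when x0 != 0
///   }
/// \endcode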
1692static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1693 Intrinsic::ID &IntrinID, Value *&InitX,
1694 Instruction *&CntInst, PHINode *&CntPhi,
1695 Instruction *&DefX) {
1696 BasicBlock *LoopEntry;
1697 Value *VarX = nullptr;
1698
1699 DefX = nullptr;
1700 CntInst = nullptr;
1701 CntPhi = nullptr;
1702 LoopEntry = *(CurLoop->block_begin());
1703
1704 // step 1: Check if the loop-back branch is in desirable form.
1705 if (Value *T = matchCondition(
1706 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1707 DefX = dyn_cast<Instruction>(T);
1708 else
1709 return false;
1710
1711 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1712 if (!DefX || !DefX->isShift())
1713 return false;
1714 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1715 Intrinsic::ctlz;
1716 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1717 if (!Shft || !Shft->isOne())
1718 return false;
1719 VarX = DefX->getOperand(0);
1720
1721 // step 3: Check the recurrence of variable X
1722 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1723 if (!PhiX)
1724 return false;
1725
1726 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1727
1728 // Make sure the initial value can't be negative otherwise the ashr in the
1729 // loop might never reach zero which would make the loop infinite.
1730 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1731 return false;
1732
1733 // step 4: Find the instruction that does the counting: cnt.next = cnt + 1
1734 // or cnt.next = cnt + -1.
1735 // TODO: We could skip this step. If the loop trip count is known (CTLZ),
1736 // then all uses of "cnt.next" could be optimized to the trip count
1737 // plus "cnt0". Currently this is not done.
1738 // This step could be used to detect POPCNT instruction:
1739 // cnt.next = cnt + (x.next & 1)
1740 for (Instruction &Inst : llvm::make_range(
1741 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1742 if (Inst.getOpcode() != Instruction::Add)
1743 continue;
1744
1745 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1746 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1747 continue;
1748
1749 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1750 if (!Phi)
1751 continue;
1752
1753 CntInst = &Inst;
1754 CntPhi = Phi;
1755 break;
1756 }
1757 if (!CntInst)
1758 return false;
1759
1760 return true;
1761}
1762
1763/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1764/// to countable (with a CTLZ / CTTZ trip count). Returns true if a CTLZ / CTTZ
1765/// intrinsic was inserted as the new trip count; otherwise, returns false.
1766bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1767 // Give up if the loop has multiple blocks or multiple backedges.
1768 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1769 return false;
1770
1771 Intrinsic::ID IntrinID;
1772 Value *InitX;
1773 Instruction *DefX = nullptr;
1774 PHINode *CntPhi = nullptr;
1775 Instruction *CntInst = nullptr;
1776 // Help decide if the transformation is profitable. For the ShiftUntilZero
1777 // idiom, this is always 6.
1778 size_t IdiomCanonicalSize = 6;
1779
1780 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1781 CntInst, CntPhi, DefX))
1782 return false;
1783
1784 bool IsCntPhiUsedOutsideLoop = false;
1785 for (User *U : CntPhi->users())
1786 if (!CurLoop->contains(cast<Instruction>(U))) {
1787 IsCntPhiUsedOutsideLoop = true;
1788 break;
1789 }
1790 bool IsCntInstUsedOutsideLoop = false;
1791 for (User *U : CntInst->users())
1792 if (!CurLoop->contains(cast<Instruction>(U))) {
1793 IsCntInstUsedOutsideLoop = true;
1794 break;
1795 }
1796 // If both CntInst and CntPhi are used outside the loop the profitability
1797 // is questionable.
1798 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1799 return false;
1800
1801 // On some CPUs the result of the CTLZ(X) intrinsic is undefined
1802 // when X is 0. If we cannot guarantee X != 0, we need to check for this
1803 // case when expanding.
1804 bool ZeroCheck = false;
1805 // It is safe to assume the preheader exists, as that was checked in the
1806 // parent function runOnLoop.
1807 BasicBlock *PH = CurLoop->getLoopPreheader();
1808
1809 // If we are using the count instruction outside the loop, make sure we
1810 // have a zero check as a precondition. Without the check the loop would run
1811 // one iteration before any check of the input value; 0 and 1 would then
1812 // behave identically, so a zero check is required for correctness.
1813 if (!IsCntPhiUsedOutsideLoop) {
1814 auto *PreCondBB = PH->getSinglePredecessor();
1815 if (!PreCondBB)
1816 return false;
1817 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1818 if (!PreCondBI)
1819 return false;
1820 if (matchCondition(PreCondBI, PH) != InitX)
1821 return false;
1822 ZeroCheck = true;
1823 }
1824
1825 // Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
1826 // profitable if we delete the loop.
1827
1828 // The idiom-canonical loop has only these 6 instructions:
1829 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1830 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1831 // %shr = ashr %n.addr.0, 1
1832 // %tobool = icmp eq %shr, 0
1833 // %inc = add nsw %i.0, 1
1834 // br i1 %tobool
1835
1836 const Value *Args[] = {InitX,
1837 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1838
1839 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1840 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1841 uint32_t HeaderSize =
1842 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1843
1844 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1845 InstructionCost Cost = TTI->getIntrinsicInstrCost(
1846 Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1847 if (HeaderSize != IdiomCanonicalSize &&
1848 Cost > TargetTransformInfo::TCC_Basic)
1849 return false;
1850
1851 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1852 DefX->getDebugLoc(), ZeroCheck,
1853 IsCntPhiUsedOutsideLoop);
1854 return true;
1855}
1856
1857/// Recognizes a population count idiom in a non-countable loop.
1858///
1859/// If detected, transforms the relevant code to issue the popcount intrinsic
1860/// function call, and returns true; otherwise, returns false.
1861bool LoopIdiomRecognize::recognizePopcount() {
1862 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1863 return false;
1864
1865 // Population counting is usually conducted by a few arithmetic
1866 // instructions. Such instructions can be easily "absorbed" by vacant slots
1867 // in a non-compact loop. Therefore, recognizing the popcount idiom only
1868 // makes sense in a compact loop.
1869
1870 // Give up if the loop has multiple blocks or multiple backedges.
1871 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1872 return false;
1873
1874 BasicBlock *LoopBody = *(CurLoop->block_begin());
1875 if (LoopBody->size() >= 20) {
1876 // The loop is too big, bail out.
1877 return false;
1878 }
1879
1880 // It should have a preheader containing nothing but an unconditional branch.
1881 BasicBlock *PH = CurLoop->getLoopPreheader();
1882 if (!PH || &PH->front() != PH->getTerminator())
1883 return false;
1884 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1885 if (!EntryBI || EntryBI->isConditional())
1886 return false;
1887
1888 // It should have a precondition block where the generated popcount intrinsic
1889 // function can be inserted.
1890 auto *PreCondBB = PH->getSinglePredecessor();
1891 if (!PreCondBB)
1892 return false;
1893 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1894 if (!PreCondBI || PreCondBI->isUnconditional())
1895 return false;
1896
1897 Instruction *CntInst;
1898 PHINode *CntPhi;
1899 Value *Val;
1900 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1901 return false;
1902
1903 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1904 return true;
1905}
1906
1907static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1908 const DebugLoc &DL) {
1909 Value *Ops[] = {Val};
1910 Type *Tys[] = {Val->getType()};
1911
1912 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1913 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1914 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1915 CI->setDebugLoc(DL);
1916
1917 return CI;
1918}
1919
1920static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1921 const DebugLoc &DL, bool ZeroCheck,
1922 Intrinsic::ID IID) {
1923 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1924 Type *Tys[] = {Val->getType()};
1925
1926 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1927 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1928 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1929 CI->setDebugLoc(DL);
1930
1931 return CI;
1932}
1933
1934/// Transform the following loop (Using CTLZ, CTTZ is similar):
1935/// loop:
1936/// CntPhi = PHI [Cnt0, CntInst]
1937/// PhiX = PHI [InitX, DefX]
1938/// CntInst = CntPhi + 1
1939/// DefX = PhiX >> 1
1940/// LOOP_BODY
1941/// Br: loop if (DefX != 0)
1942/// Use(CntPhi) or Use(CntInst)
1943///
1944/// Into:
1945/// If CntPhi used outside the loop:
1946/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1947/// Count = CountPrev + 1
1948/// else
1949/// Count = BitWidth(InitX) - CTLZ(InitX)
1950/// loop:
1951/// CntPhi = PHI [Cnt0, CntInst]
1952/// PhiX = PHI [InitX, DefX]
1953/// PhiCount = PHI [Count, Dec]
1954/// CntInst = CntPhi + 1
1955/// DefX = PhiX >> 1
1956/// Dec = PhiCount - 1
1957/// LOOP_BODY
1958/// Br: loop if (Dec != 0)
1959/// Use(CountPrev + Cnt0) // Use(CntPhi)
1960/// or
1961/// Use(Count + Cnt0) // Use(CntInst)
1962///
1963/// If LOOP_BODY is empty the loop will be deleted.
1964/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
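/// As a worked sketch of the Count computation above (assuming i32 and a
/// right-shift recurrence): with InitX = 8 the original loop body runs 4
/// times (x: 8 -> 4 -> 2 -> 1 -> 0), and indeed
/// Count = BitWidth(InitX) - CTLZ(InitX) = 32 - 28 = 4.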
1965void LoopIdiomRecognize::transformLoopToCountable(
1966 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1967 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1968 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1969 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1970
1971 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1972 IRBuilder<> Builder(PreheaderBr);
1973 Builder.SetCurrentDebugLocation(DL);
1974
1975 // If there are no uses of CntPhi, create:
1976 // Count = BitWidth - CTLZ(InitX);
1977 // NewCount = Count;
1978 // If there are uses of CntPhi create:
1979 // NewCount = BitWidth - CTLZ(InitX >> 1);
1980 // Count = NewCount + 1;
1981 Value *InitXNext;
1982 if (IsCntPhiUsedOutsideLoop) {
1983 if (DefX->getOpcode() == Instruction::AShr)
1984 InitXNext = Builder.CreateAShr(InitX, 1);
1985 else if (DefX->getOpcode() == Instruction::LShr)
1986 InitXNext = Builder.CreateLShr(InitX, 1);
1987 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1988 InitXNext = Builder.CreateShl(InitX, 1);
1989 else
1990 llvm_unreachable("Unexpected opcode!");
1991 } else
1992 InitXNext = InitX;
1993 Value *Count =
1994 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1995 Type *CountTy = Count->getType();
1996 Count = Builder.CreateSub(
1997 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1998 Value *NewCount = Count;
1999 if (IsCntPhiUsedOutsideLoop)
2000 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2001
2002 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2003
2004 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2005 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2006 // If the counter was being incremented in the loop, add NewCount to the
2007 // counter's initial value, but only if the initial value is not zero.
2008 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2009 if (!InitConst || !InitConst->isZero())
2010 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2011 } else {
2012 // If the count was being decremented in the loop, subtract NewCount from
2013 // the counter's initial value.
2014 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2015 }
2016
2017 // Step 2: Insert new IV and loop condition:
2018 // loop:
2019 // ...
2020 // PhiCount = PHI [Count, Dec]
2021 // ...
2022 // Dec = PhiCount - 1
2023 // ...
2024 // Br: loop if (Dec != 0)
2025 BasicBlock *Body = *(CurLoop->block_begin());
2026 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2027 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2028
2029 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
2030 TcPhi->insertBefore(Body->begin());
2031
2032 Builder.SetInsertPoint(LbCond);
2033 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2034 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2035
2036 TcPhi->addIncoming(Count, Preheader);
2037 TcPhi->addIncoming(TcDec, Body);
2038
2039 CmpInst::Predicate Pred =
2040 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2041 LbCond->setPredicate(Pred);
2042 LbCond->setOperand(0, TcDec);
2043 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2044
2045 // Step 3: All the references to the original counter outside
2046 // the loop are replaced with the NewCount
2047 if (IsCntPhiUsedOutsideLoop)
2048 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2049 else
2050 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2051
2052 // step 4: Forget the "non-computable" trip-count SCEV associated with the
2053 // loop. The loop would otherwise not be deleted even if it becomes empty.
2054 SE->forgetLoop(CurLoop);
2055}
2056
2057void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2058 Instruction *CntInst,
2059 PHINode *CntPhi, Value *Var) {
2060 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2061 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2062 const DebugLoc &DL = CntInst->getDebugLoc();
2063
2064 // Assuming before transformation, the loop is following:
2065 // if (x) // the precondition
2066 // do { cnt++; x &= x - 1; } while(x);
2067
2068 // Step 1: Insert the ctpop instruction at the end of the precondition block
2069 IRBuilder<> Builder(PreCondBr);
2070 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2071 {
2072 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2073 NewCount = PopCntZext =
2074 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2075
2076 if (NewCount != PopCnt)
2077 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2078
2079 // TripCnt is exactly the number of iterations the loop has
2080 TripCnt = NewCount;
2081
2082 // If the population counter's initial value is not zero, insert Add Inst.
2083 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2084 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2085 if (!InitConst || !InitConst->isZero()) {
2086 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2087 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2088 }
2089 }
2090
2091 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" with
2092 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2093 // call would be partially dead code, and downstream passes would drag
2094 // it back from the precondition block to the preheader.
2095 {
2096 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2097
2098 Value *Opnd0 = PopCntZext;
2099 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2100 if (PreCond->getOperand(0) != Var)
2101 std::swap(Opnd0, Opnd1);
2102
2103 ICmpInst *NewPreCond = cast<ICmpInst>(
2104 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2105 PreCondBr->setCondition(NewPreCond);
2106
2107 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2108 }
2109
2110 // Step 3: Note that the population count is exactly the trip count of the
2111 // loop in question, which enables us to convert the loop from a noncountable
2112 // loop into a countable one. The benefit is twofold:
2113 //
2114 // - If the loop only counts population, the entire loop becomes dead after
2115 // the transformation. It is a lot easier to prove a countable loop dead
2116 // than to prove a noncountable one. (In some C dialects, an infinite loop
2117 // isn't dead even if it computes nothing useful. In general, DCE needs
2118 // to prove a noncountable loop finite before safely deleting it.)
2119 //
2120 // - If the loop also performs something else, it remains alive.
2121 // Since it is transformed to countable form, it can be aggressively
2122 // optimized by some optimizations which are in general not applicable
2123 // to a noncountable loop.
2124 //
2125 // After this step, this loop (conceptually) would look like the following:
2126 // newcnt = __builtin_popcount(x);
2127 // t = newcnt;
2128 // if (x)
2129 // do { cnt++; x &= x-1; t--; } while (t > 0);
2130 BasicBlock *Body = *(CurLoop->block_begin());
2131 {
2132 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2133 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2134 Type *Ty = TripCnt->getType();
2135
2136 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
2137 TcPhi->insertBefore(Body->begin());
2138
2139 Builder.SetInsertPoint(LbCond);
2140 Instruction *TcDec = cast<Instruction>(
2141 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2142 "tcdec", false, true));
2143
2144 TcPhi->addIncoming(TripCnt, PreHead);
2145 TcPhi->addIncoming(TcDec, Body);
2146
2147 CmpInst::Predicate Pred =
2148 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2149 LbCond->setPredicate(Pred);
2150 LbCond->setOperand(0, TcDec);
2151 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2152 }
2153
2154 // Step 4: All the references to the original population counter outside
2155 // the loop are replaced with the NewCount -- the value returned from
2156 // __builtin_popcount().
2157 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2158
2159 // step 5: Forget the "non-computable" trip-count SCEV associated with the
2160 // loop. The loop would otherwise not be deleted even if it becomes empty.
2161 SE->forgetLoop(CurLoop);
2162}
2163
2164/// Match loop-invariant value.
2165template <typename SubPattern_t> struct match_LoopInvariant {
2166 SubPattern_t SubPattern;
2167 const Loop *L;
2168
2169 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2170 : SubPattern(SP), L(L) {}
2171
2172 template <typename ITy> bool match(ITy *V) {
2173 return L->isLoopInvariant(V) && SubPattern.match(V);
2174 }
2175};
2176
2177/// Matches if the value is loop-invariant.
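/// For illustration, the shift-until-bittest matcher below uses this to
/// capture a loop-invariant `shl 1, %bitpos` while binding the shift amount:
/// \code
///   match(CmpLHS, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop));
/// \endcode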
2178template <typename Ty>
2179inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2180 return match_LoopInvariant<Ty>(M, L);
2181}
2182
2183/// Return true if the idiom is detected in the loop.
2184///
2185/// The core idiom we are trying to detect is:
2186/// \code
2187/// entry:
2188/// <...>
2189/// %bitmask = shl i32 1, %bitpos
2190/// br label %loop
2191///
2192/// loop:
2193/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2194/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2195/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2196/// %x.next = shl i32 %x.curr, 1
2197/// <...>
2198/// br i1 %x.curr.isbitunset, label %loop, label %end
2199///
2200/// end:
2201/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2202/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2203/// <...>
2204/// \endcode
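/// For illustration, a C-level loop of this shape (a sketch; the names are
/// illustrative) would be:
/// \code
///   while ((x & (1u << bitpos)) == 0)
///     x <<= 1;
/// \endcode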
2205static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2206 Value *&BitMask, Value *&BitPos,
2207 Value *&CurrX, Instruction *&NextX) {
2209 " Performing shift-until-bittest idiom detection.\n");
2210
2211 // Give up if the loop has multiple blocks or multiple backedges.
2212 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2213 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2214 return false;
2215 }
2216
2217 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2218 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2219 assert(LoopPreheaderBB && "There is always a loop preheader.");
2220
2221 using namespace PatternMatch;
2222
2223 // Step 1: Check if the loop backedge is in desirable form.
2224
2225 ICmpInst::Predicate Pred;
2226 Value *CmpLHS, *CmpRHS;
2227 BasicBlock *TrueBB, *FalseBB;
2228 if (!match(LoopHeaderBB->getTerminator(),
2229 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2230 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2231 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2232 return false;
2233 }
2234
2235 // Step 2: Check if the backedge's condition is in desirable form.
2236
2237 auto MatchVariableBitMask = [&]() {
2238 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2239 match(CmpLHS,
2240 m_c_And(m_Value(CurrX),
2241 m_CombineAnd(
2242 m_Value(BitMask),
2243 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2244 CurLoop))));
2245 };
2246 auto MatchConstantBitMask = [&]() {
2247 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2248 match(CmpLHS, m_And(m_Value(CurrX),
2249 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2250 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2251 };
2252 auto MatchDecomposableConstantBitMask = [&]() {
2253 APInt Mask;
2254 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2255 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2256 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2257 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2258 };
2259
2260 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2261 !MatchDecomposableConstantBitMask()) {
2262 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2263 return false;
2264 }
2265
2266 // Step 3: Check if the recurrence is in desirable form.
2267 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2268 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2269 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2270 return false;
2271 }
2272
2273 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2274 NextX =
2275 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2276
2277 assert(CurLoop->isLoopInvariant(BaseX) &&
2278 "Expected BaseX to be avaliable in the preheader!");
2279
2280 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2281 // FIXME: support right-shift?
2282 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2283 return false;
2284 }
2285
2286 // Step 4: Check if the backedge's destinations are in desirable form.
2287
2289 "Should only get equality predicates here.");
2290
2291 // cmp-br is commutative, so canonicalize to a single variant.
2292 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2293 Pred = ICmpInst::getInversePredicate(Pred);
2294 std::swap(TrueBB, FalseBB);
2295 }
2296
2297 // We expect to exit loop when comparison yields false,
2298 // so when it yields true we should branch back to loop header.
2299 if (TrueBB != LoopHeaderBB) {
2300 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2301 return false;
2302 }
2303
2304 // Okay, idiom checks out.
2305 return true;
2306}
2307
2308/// Look for the following loop:
2309/// \code
2310/// entry:
2311/// <...>
2312/// %bitmask = shl i32 1, %bitpos
2313/// br label %loop
2314///
2315/// loop:
2316/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2317/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2318/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2319/// %x.next = shl i32 %x.curr, 1
2320/// <...>
2321/// br i1 %x.curr.isbitunset, label %loop, label %end
2322///
2323/// end:
2324/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2325/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2326/// <...>
2327/// \endcode
2328///
2329/// And transform it into:
2330/// \code
2331/// entry:
2332/// %bitmask = shl i32 1, %bitpos
2333/// %lowbitmask = add i32 %bitmask, -1
2334/// %mask = or i32 %lowbitmask, %bitmask
2335/// %x.masked = and i32 %x, %mask
2336/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2337/// i1 true)
2338/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2339/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2340/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2341/// %tripcount = add i32 %backedgetakencount, 1
2342/// %x.curr = shl i32 %x, %backedgetakencount
2343/// %x.next = shl i32 %x, %tripcount
2344/// br label %loop
2345///
2346/// loop:
2347/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2348/// %loop.iv.next = add nuw i32 %loop.iv, 1
2349/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2350/// <...>
2351/// br i1 %loop.ivcheck, label %end, label %loop
2352///
2353/// end:
2354/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2355/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2356/// <...>
2357/// \endcode
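/// As a worked sketch of the math above (assuming i32, %x = 1, %bitpos = 3):
/// %bitmask = 8, %mask = 15, %x.masked = 1, so its leading-one position is 0,
/// %backedgetakencount = 3 - 0 = 3, %tripcount = 4, and %x.curr = 1 << 3 = 8,
/// matching the original loop, which shifts %x 1 -> 2 -> 4 -> 8 and exits.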
2358bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2359 bool MadeChange = false;
2360
2361 Value *X, *BitMask, *BitPos, *XCurr;
2362 Instruction *XNext;
2363 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2364 XNext)) {
2366 " shift-until-bittest idiom detection failed.\n");
2367 return MadeChange;
2368 }
2369 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2370
2371 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2372 // but is it profitable to transform?
2373
2374 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2375 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2376 assert(LoopPreheaderBB && "There is always a loop preheader.");
2377
2378 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2379 assert(SuccessorBB && "There is only a single successor.");
2380
2381 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2382 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2383
2384 Intrinsic::ID IntrID = Intrinsic::ctlz;
2385 Type *Ty = X->getType();
2386 unsigned Bitwidth = Ty->getScalarSizeInBits();
2387
2388 TargetTransformInfo::TargetCostKind CostKind =
2389 TargetTransformInfo::TCK_SizeAndLatency;
2390
2391 // The rewrite is considered to be unprofitable if and only if the
2392 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2393 // making the loop countable, even if nothing else changes.
2394 IntrinsicCostAttributes Attrs(
2395 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
2399 " Intrinsic is too costly, not beneficial\n");
2400 return MadeChange;
2401 }
2402 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2403 TargetTransformInfo::TCC_Basic) {
2404 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2405 return MadeChange;
2406 }
2407
2408 // Ok, transform appears worthwhile.
2409 MadeChange = true;
2410
2411 if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
2412 // BitMask may be computed from BitPos; freeze BitPos so we can increase
2413 // its use count.
2414 Instruction *InsertPt = nullptr;
2415 if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
2416 InsertPt = BitPosI->getInsertionPointAfterDef();
2417 else
2418 InsertPt = &*DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2419 if (!InsertPt)
2420 return false;
2421 FreezeInst *BitPosFrozen =
2422 new FreezeInst(BitPos, BitPos->getName() + ".fr", InsertPt);
2423 BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
2424 return U.getUser() != BitPosFrozen;
2425 });
2426 BitPos = BitPosFrozen;
2427 }
2428
2429 // Step 1: Compute the loop trip count.
2430
2431 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2432 BitPos->getName() + ".lowbitmask");
2433 Value *Mask =
2434 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2435 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2436 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2437 IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
2438 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2439 Value *XMaskedNumActiveBits = Builder.CreateSub(
2440 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2441 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2442 /*HasNSW=*/Bitwidth != 2);
2443 Value *XMaskedLeadingOnePos =
2444 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2445 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2446 /*HasNSW=*/Bitwidth > 2);
2447
2448 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2449 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2450 /*HasNUW=*/true, /*HasNSW=*/true);
2451 // We know the loop's backedge-taken count, but what's the loop's trip count?
2452 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2453 Value *LoopTripCount =
2454 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2455 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2456 /*HasNSW=*/Bitwidth != 2);
2457
2458 // Step 2: Compute the recurrence's final value without a loop.
2459
2460 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2461 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2462 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2463 NewX->takeName(XCurr);
2464 if (auto *I = dyn_cast<Instruction>(NewX))
2465 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2466
2467 Value *NewXNext;
2468 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2469 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2470 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2471 // that isn't the case, we'll need to emit an alternative, safe IR.
2472 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2473 PatternMatch::match(
2474 BitPos, PatternMatch::m_SpecificInt_ICMP(
2475 ICmpInst::Predicate::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2476 Ty->getScalarSizeInBits() - 1))))
2477 NewXNext = Builder.CreateShl(X, LoopTripCount);
2478 else {
2479 // Otherwise, just additionally shift by one. It's the smallest solution;
2480 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2481 // bitwidth(x) - 1) and select 0 instead.
2482 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2483 }
2484
2485 NewXNext->takeName(XNext);
2486 if (auto *I = dyn_cast<Instruction>(NewXNext))
2487 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2488
2489 // Step 3: Adjust the successor basic block to receive the computed
2490 // recurrence's final value instead of the recurrence itself.
2491
2492 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2493 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2494
2495 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2496
2497 // The new canonical induction variable.
2498 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
2499 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2500
2501 // The induction itself.
2502 // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
2503 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2504 auto *IVNext =
2505 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2506 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2507
2508 // The loop trip count check.
2509 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2510 CurLoop->getName() + ".ivcheck");
2511 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2512 LoopHeaderBB->getTerminator()->eraseFromParent();
2513
2514 // Populate the IV PHI.
2515 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2516 IV->addIncoming(IVNext, LoopHeaderBB);
2517
2518 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2519 // loop. The loop would otherwise not be deleted even if it becomes empty.
2520
2521 SE->forgetLoop(CurLoop);
2522
2523 // Other passes will take care of actually deleting the loop if possible.
2524
2525 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2526
2527 ++NumShiftUntilBitTest;
2528 return MadeChange;
2529}
2530
2531/// Return true if the idiom is detected in the loop.
2532///
2533/// The core idiom we are trying to detect is:
2534/// \code
2535/// entry:
2536/// <...>
2537/// %start = <...>
2538/// %extraoffset = <...>
2539/// <...>
2540/// br label %for.cond
2541///
2542/// loop:
2543/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2544/// %nbits = add nsw i8 %iv, %extraoffset
2545/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2546/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2547/// %iv.next = add i8 %iv, 1
2548/// <...>
2549/// br i1 %val.shifted.iszero, label %end, label %loop
2550///
2551/// end:
2552/// %iv.res = phi i8 [ %iv, %loop ] <...>
2553/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2554/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2555/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2556/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2557/// <...>
2558/// \endcode
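/// For illustration, a C-level loop of this shape (a sketch; the names are
/// illustrative) would be:
/// \code
///   int iv = start;
///   while ((val >> (iv + extraoffset)) != 0)
///     ++iv;
/// \endcode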
2559static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2560 Instruction *&ValShiftedIsZero,
2561 Intrinsic::ID &IntrinID, Instruction *&IV,
2562 Value *&Start, Value *&Val,
2563 const SCEV *&ExtraOffsetExpr,
2564 bool &InvertedCond) {
2566 " Performing shift-until-zero idiom detection.\n");
2567
2568 // Give up if the loop has multiple blocks or multiple backedges.
2569 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2570 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2571 return false;
2572 }
2573
2574 Instruction *ValShifted, *NBits, *IVNext;
2575 Value *ExtraOffset;
2576
2577 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2578 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2579 assert(LoopPreheaderBB && "There is always a loop preheader.");
2580
2581 using namespace PatternMatch;
2582
2583 // Step 1: Check if the loop backedge and its condition are in desirable form.
2584
2585 ICmpInst::Predicate Pred;
2586 BasicBlock *TrueBB, *FalseBB;
2587 if (!match(LoopHeaderBB->getTerminator(),
2588 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2589 m_BasicBlock(FalseBB))) ||
2590 !match(ValShiftedIsZero,
2591 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2592 !ICmpInst::isEquality(Pred)) {
2593 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2594 return false;
2595 }
2596
2597 // Step 2: Check if the comparison's operand is in desirable form.
2598 // FIXME: Val could be a one-input PHI node, which we should look past.
2599 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2600 m_Instruction(NBits)))) {
2601 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
2602 return false;
2603 }
2604 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2605 : Intrinsic::ctlz;
2606
2607 // Step 3: Check if the shift amount is in desirable form.
2608
2609 if (match(NBits, m_c_Add(m_Instruction(IV),
2610 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2611 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2612 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2613 else if (match(NBits,
2614 m_Sub(m_Instruction(IV),
2615 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2616 NBits->hasNoSignedWrap())
2617 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2618 else {
2619 IV = NBits;
2620 ExtraOffsetExpr = SE->getZero(NBits->getType());
2621 }
2622
2623 // Step 4: Check if the recurrence is in desirable form.
2624 auto *IVPN = dyn_cast<PHINode>(IV);
2625 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2626 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2627 return false;
2628 }
2629
2630 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2631 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2632
2633 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2634 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2635 return false;
2636 }
2637
2638 // Step 5: Check if the backedge's destinations are in desirable form.
2639
2641 "Should only get equality predicates here.");
2642
2643 // cmp-br is commutative, so canonicalize to a single variant.
2644 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2645 if (InvertedCond) {
2646 Pred = ICmpInst::getInversePredicate(Pred);
2647 std::swap(TrueBB, FalseBB);
2648 }
2649
2650 // We expect to exit loop when comparison yields true,
2651 // so when it yields false we should branch back to loop header.
2652 if (FalseBB != LoopHeaderBB) {
2653 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2654 return false;
2655 }
2656
2657 // The new, countable, loop will certainly only run a known number of
2658 // iterations; it won't be infinite. But the old loop might be infinite
2659 // under certain conditions. For logical shifts, the value will become zero
2660 // after at most bitwidth(%Val) loop iterations. However, for an arithmetic
2661 // right-shift, if the sign bit was set, the value will never become zero,
2662 // and the loop may never finish.
2663 if (ValShifted->getOpcode() == Instruction::AShr &&
2664 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2665 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2666 return false;
2667 }
2668
2669 // Okay, idiom checks out.
2670 return true;
2671}
2672
2673/// Look for the following loop:
2674/// \code
2675/// entry:
2676/// <...>
2677/// %start = <...>
2678/// %extraoffset = <...>
2679/// <...>
2680/// br label %for.cond
2681///
2682/// loop:
2683/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2684/// %nbits = add nsw i8 %iv, %extraoffset
2685/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2686/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2687/// %iv.next = add i8 %iv, 1
2688/// <...>
2689/// br i1 %val.shifted.iszero, label %end, label %loop
2690///
2691/// end:
2692/// %iv.res = phi i8 [ %iv, %loop ] <...>
2693/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2694/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2695/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2696/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2697/// <...>
2698/// \endcode
2699///
2700/// And transform it into:
2701/// \code
2702/// entry:
2703/// <...>
2704/// %start = <...>
2705/// %extraoffset = <...>
2706/// <...>
2707/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2708/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2709/// %extraoffset.neg = sub i8 0, %extraoffset
2710/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2711/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2712/// %loop.tripcount = sub i8 %iv.final, %start
2713/// br label %loop
2714///
2715/// loop:
2716/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2717/// %loop.iv.next = add i8 %loop.iv, 1
2718/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2719/// %iv = add i8 %loop.iv, %start
2720/// <...>
2721/// br i1 %loop.ivcheck, label %end, label %loop
2722///
2723/// end:
2724/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2725/// <...>
2726/// \endcode
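/// As a worked sketch of the math above (assuming i8, %val = 0b00010000,
/// %extraoffset = 0, %start = 0): %val.numactivebits = 8 - ctlz(%val) = 5,
/// so %iv.final = smax(5, 0) = 5 and %loop.tripcount = 5, matching the
/// original loop, which exits once %val >> 5 becomes zero.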
2727bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2728 bool MadeChange = false;
2729
2730 Instruction *ValShiftedIsZero;
2731 Intrinsic::ID IntrID;
2732 Instruction *IV;
2733 Value *Start, *Val;
2734 const SCEV *ExtraOffsetExpr;
2735 bool InvertedCond;
2736 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2737 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2739 " shift-until-zero idiom detection failed.\n");
2740 return MadeChange;
2741 }
2742 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2743
2744 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2745 // but is it profitable to transform?
2746
2747 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2748 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2749 assert(LoopPreheaderBB && "There is always a loop preheader.");
2750
2751 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2752 assert(SuccessorBB && "There is only a single successor.");
2753
2754 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2755 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2756
2757 Type *Ty = Val->getType();
2758 unsigned Bitwidth = Ty->getScalarSizeInBits();
2759
2760 TargetTransformInfo::TargetCostKind CostKind =
2761 TargetTransformInfo::TCK_SizeAndLatency;
2762
2763 // The rewrite is considered to be unprofitable if and only if the
2764 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2765 // making the loop countable, even if nothing else changes.
2766 IntrinsicCostAttributes Attrs(
2767 IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
2771 " Intrinsic is too costly, not beneficial\n");
2772 return MadeChange;
2773 }
2774
2775 // Ok, transform appears worthwhile.
2776 MadeChange = true;
2777
2778 bool OffsetIsZero = false;
2779 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2780 OffsetIsZero = ExtraOffsetExprC->isZero();
2781
2782 // Step 1: Compute the loop's final IV value / trip count.
2783
2784 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2785 IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
2786 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2787 Value *ValNumActiveBits = Builder.CreateSub(
2788 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2789 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2790 /*HasNSW=*/Bitwidth != 2);
2791
2792 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2793 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2794 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2795
2796 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2797 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2798 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2799 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2800 {ValNumActiveBitsOffset, Start},
2801 /*FMFSource=*/nullptr, "iv.final");
2802
2803 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2804 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2805 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2806 // FIXME: or when the offset was `add nuw`
2807
2808 // We know the loop's backedge-taken count, but what's the loop's trip count?
2809 Value *LoopTripCount =
2810 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2811 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2812 /*HasNSW=*/Bitwidth != 2);
2813
2814 // Step 2: Adjust the successor basic block to receive the original
2815 // induction variable's final value instead of the original IV itself.
2816
2817 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2818
2819 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2820
2821 // The new canonical induction variable.
2822 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
2823 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2824
2825 // The induction itself.
2826 Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
2827 auto *CIVNext =
2828 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2829 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2830
2831 // The loop trip count check.
2832 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2833 CurLoop->getName() + ".ivcheck");
2834 auto *NewIVCheck = CIVCheck;
2835 if (InvertedCond) {
2836 NewIVCheck = Builder.CreateNot(CIVCheck);
2837 NewIVCheck->takeName(ValShiftedIsZero);
2838 }
2839
2840 // The original IV, but rebased to be an offset to the CIV.
2841 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2842 /*HasNSW=*/true); // FIXME: what about NUW?
2843 IVDePHId->takeName(IV);
2844
2845 // The loop terminator.
2846 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2847 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2848 LoopHeaderBB->getTerminator()->eraseFromParent();
2849
2850 // Populate the IV PHI.
2851 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2852 CIV->addIncoming(CIVNext, LoopHeaderBB);
2853
2854 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2855 // loop. The loop would otherwise not be deleted even if it becomes empty.
2856
2857 SE->forgetLoop(CurLoop);
2858
2859 // Step 5: Try to cleanup the loop's body somewhat.
2860 IV->replaceAllUsesWith(IVDePHId);
2861 IV->eraseFromParent();
2862
2863 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2864 ValShiftedIsZero->eraseFromParent();
2865
2866 // Other passes will take care of actually deleting the loop if possible.
2867
2868 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2869
2870 ++NumShiftUntilZero;
2871 return MadeChange;
2872}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
assume Assume Builder
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
IRTranslator LLVM IR MI
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry, bool JmpOnZero=false)
Check if the given conditional branch is based on the comparison between a variable and zero,...
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
static cl::opt< bool, true > DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memset."), cl::location(DisableLIRP::Memset), cl::init(false), cl::ReallyHidden)
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
static CallInst * createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck, Intrinsic::ID IID)
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, Value *&BitMask, Value *&BitPos, Value *&CurrX, Instruction *&NextX)
Return true if the idiom is detected in the loop.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static cl::opt< bool, true > DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", cl::desc("Proceed with loop idiom recognize pass, but do " "not convert loop(s) to memcpy."), cl::location(DisableLIRP::Memcpy), cl::init(false), cl::ReallyHidden)
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, Intrinsic::ID &IntrinID, Value *&InitX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, ScalarEvolution *SE)
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
match_LoopInvariant< Ty > m_LoopInvariant(const Ty &M, const Loop *L)
Matches if the value is loop-invariant.
#define DEBUG_TYPE
static cl::opt< bool, true > DisableLIRPAll("disable-" DEBUG_TYPE "-all", cl::desc("Options to disable Loop Idiom Recognize Pass."), cl::location(DisableLIRP::All), cl::init(false), cl::ReallyHidden)
static void deleteDeadInstruction(Instruction *I)
#define I(x, y, z)
Definition: MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
Contains a collection of routines for determining if a given instruction is guaranteed to execute if ...
This header defines various interfaces for pass management in LLVM.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
verify safepoint Safepoint IR Verifier
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static const uint32_t IV[8]
Definition: blake3_impl.h:78
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:76
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1433
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1507
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:648
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
iterator end()
Definition: BasicBlock.h:337
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:335
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:103
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:223
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:216
const Instruction & front() const
Definition: BasicBlock.h:347
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:296
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
const_iterator getFirstNonPHIOrDbgOrAlloca() const
Returns an iterator to the first instruction in this block that is not a PHINode, a debug intrinsic,...
Definition: BasicBlock.cpp:267
size_t size() const
Definition: BasicBlock.h:345
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
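
A minimal sketch (function name hypothetical) showing the debug-skipping iteration helper above:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Count the "real" instructions in a block; the range hides
  // llvm.dbg.* intrinsics, so debug info does not perturb the count.
  unsigned countRealInstructions(const BasicBlock &BB) {
    unsigned N = 0;
    for (const Instruction &I : BB.instructionsWithoutDebug()) {
      (void)I;
      ++N;
    }
    return N;
  }
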
BinaryOps getOpcode() const
Definition: InstrTypes.h:391
Conditional or Unconditional Branch instruction.
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1412
This class represents a function call, abstracting a target machine's calling convention.
void setPredicate(Predicate P)
Set the predicate for this instruction to the specified value.
Definition: InstrTypes.h:804
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:711
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:741
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:734
@ ICMP_EQ
equal
Definition: InstrTypes.h:732
@ ICMP_NE
not equal
Definition: InstrTypes.h:733
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:825
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:801
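
For illustration, a one-line use of getInversePredicate/setPredicate (function name hypothetical):

  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  // Flip a comparison in place, e.g. when rotating a loop exit:
  // EQ -> NE, UGT -> ULE, and so on, per the table above.
  void invertCompare(CmpInst &Cmp) {
    Cmp.setPredicate(Cmp.getInversePredicate());
  }
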
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1235
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2213
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2608
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:203
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:197
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:145
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:847
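
A short sketch (hypothetical helper) exercising the ConstantInt factory and classification predicates above:

  #include "llvm/IR/Constants.h"
  using namespace llvm;

  // Build the constant 1 of an integer type and classify it with the
  // convenience predicates isOne/isZero/isMinusOne.
  bool buildsOne(Type *IntTy) {
    Constant *C = ConstantInt::get(IntTy, 1);
    auto *CI = dyn_cast<ConstantInt>(C);
    return CI && CI->isOne() && !CI->isZero() && !CI->isMinusOne();
  }
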
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:403
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
A debug info location.
Definition: DebugLoc.h:33
NodeT * getRoot() const
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
This class represents a freeze function that returns a random concrete value if an operand is either a ...
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:128
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:227
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:447
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2374
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2628
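
For illustration, a hypothetical helper that emits a call through IRBuilder::CreateCall as documented above; the FunctionCallee is assumed to have been looked up by the caller:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Emit a call at the builder's current insertion point.
  CallInst *emitCall(IRBuilder<> &B, FunctionCallee Fn, Value *Arg) {
    return B.CreateCall(Fn, {Arg}, "call");
  }
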
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:89
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:392
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:71
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
const BasicBlock * getParent() const
Definition: Instruction.h:90
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:75
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1521
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1596
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:195
bool isShift() const
Definition: Instruction.h:202
Instruction * getInsertionPointAfterDef()
Get the first insertion point at which the result of this instruction is defined.
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:83
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:389
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:214
bool isUnordered() const
Definition: Instructions.h:258
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:220
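
A minimal sketch (hypothetical helper) of the kind of load screening these accessors support:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Idiom formation only considers plain loads; isUnordered() rejects
  // volatile and atomic-ordered accesses.
  Value *screenLoad(LoadInst &LI) {
    if (!LI.isUnordered())
      return nullptr;
    return LI.getPointerOperand(); // pointer that would feed a memcpy
  }
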
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
BlockT * getHeader() const
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
block_iterator block_begin() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
StringRef getName() const
Definition: LoopInfo.h:391
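
For illustration, a hypothetical shape check built from the Loop accessors above:

  #include "llvm/Analysis/LoopInfo.h"
  using namespace llvm;

  // A loop is only worth scanning if it has a preheader to emit code
  // into, a single backedge, and a non-empty block list to walk.
  bool hasUsableShape(const Loop &L) {
    return L.getLoopPreheader() != nullptr &&
           L.getNumBackEdges() == 1 &&
           !L.getBlocks().empty();
  }
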
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:923
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:700
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1743
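
A small sketch (hypothetical helper) combining PHINode::Create and addIncoming as documented above:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Build a recurrence PHI: reserve two incoming edges, insert at the
  // top of the header, and wire in the preheader's start value.
  PHINode *makeCountPhi(Type *Ty, BasicBlock *Header,
                        BasicBlock *Preheader, Value *Start) {
    PHINode *PN = PHINode::Create(Ty, /*NumReservedValues=*/2, "count",
                                  &Header->front());
    PN->addIncoming(Start, Preheader);
    return PN; // the backedge value is added once it exists
  }
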
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getTripCountFromExitCount(const SCEV *ExitCount)
A version of getTripCountFromExitCount below which always picks an evaluation type which can not resu...
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
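
For illustration, a hypothetical helper showing the usual SCEV gating step: a loop is only transformable when its backedge-taken count is computable.

  #include "llvm/Analysis/ScalarEvolution.h"
  using namespace llvm;

  // An unanalyzable loop yields SCEVCouldNotCompute; otherwise the
  // exit count converts to a trip count.
  const SCEV *tripCountOrNull(ScalarEvolution &SE, const Loop *L) {
    const SCEV *BECount = SE.getBackedgeTakenCount(L);
    if (isa<SCEVCouldNotCompute>(BECount))
      return nullptr;
    return SE.getTripCountFromExitCount(BECount);
  }
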
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop: checks the loop body & header for the possibility of may-throw exc...
Definition: MustExecute.cpp:51
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw ...
Definition: MustExecute.cpp:47
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:345
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:380
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:384
iterator end() const
Definition: SmallPtrSet.h:409
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
iterator begin() const
Definition: SmallPtrSet.h:404
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:390
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
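
A one-line illustration (hypothetical helper) of SmallPtrSet's insert contract:

  #include "llvm/ADT/SmallPtrSet.h"

  // insert() returns {iterator, inserted}; the bool makes
  // "visit each pointer once" loops cheap.
  bool visitOnce(llvm::SmallPtrSet<void *, 8> &Seen, void *P) {
    return Seen.insert(P).second;
  }
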
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
Align getAlign() const
Definition: Instructions.h:345
Value * getValueOperand()
Definition: Instructions.h:390
Value * getPointerOperand()
Definition: Instructions.h:393
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Basic
The cost of a typical 'add' instruction.
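
For illustration, a hypothetical query against the TTI hook above; PSK_FastHardware is one of TargetTransformInfo's PopcntSupportKind values:

  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // The ctpop idiom's question: is a popcount of this bit width fast
  // in hardware on this target?
  bool popcntIsFast(const TargetTransformInfo &TTI, unsigned BitWidth) {
    return TTI.getPopcntSupport(BitWidth) ==
           TargetTransformInfo::PSK_FastHardware;
  }
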
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:535
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:543
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:587
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1069
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:384
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1422
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:982
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:552
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:724
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:780
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:525
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:224
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:168
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:545
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:994
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:606
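
A minimal sketch (hypothetical helper) of the PatternMatch combinators above:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Recognize "X << <power-of-2 constant>", capturing the shifted
  // value in X on success.
  bool matchShlByPow2(Value *V, Value *&X) {
    return match(V, m_Shl(m_Value(X), m_Power2()));
  }
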
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:529
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on the target, and also that in the current Module it is a...
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1117
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:83
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Return true if this function can prove that V does not have undef bits and is never poison.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
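
For illustration, a hypothetical gate built on isBytewiseValue:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  using namespace llvm;

  // The memset idiom needs the stored value to splat to a single byte
  // (e.g. 0, or 0x01010101 -> 0x01); otherwise nullptr comes back and
  // the store is rejected.
  bool storeIsMemsettable(Value *Stored, const DataLayout &DL) {
    return isBytewiseValue(Stored, DL) != nullptr;
  }
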
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:651
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:671
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:674
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:668
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:677
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition: Metadata.h:718
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)